Commit d38ad8a5 authored by Ondřej Mička's avatar Ondřej Mička

Matrix transpose

parent c87aafd8
test: matrix_transpose_test
./$<
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare
matrix_transpose_test: matrix_transpose_test.cpp matrix_transpose.h matrix_tests.h test_main.cpp
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@
clean:
rm -f matrix_transpose_test
.PHONY: clean test
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
/* A matrix stored in a simulated cache */
class CachedMatrix {
unsigned B; // Block size
unsigned mem_blocks; // Memory size in blocks
unsigned cache_blocks; // Cache size in blocks
unsigned cache_used; // How many blocks of cache we already used
// We store the matrix as a one-dimensional array
vector<unsigned> items;
unsigned pos(unsigned i, unsigned j) { return i*N + j; }
/*
* For each memory block, we keep the following structure.
* If the block is currently cached, we set cached == true
* and lru_prev/next point to neighboring blocks in the cyclic LRU list.
* Otherwise, cached == false and the block is not in the LRU.
*/
class MemoryBlock {
public:
unsigned lru_prev, lru_next;
bool cached;
MemoryBlock()
{
lru_prev = lru_next = 0;
cached = false;
}
};
vector<MemoryBlock> blocks;
// One block at the end of "blocks" serves as a head of the LRU list.
unsigned lru_head;
public:
// Number of rows and columns of the matrix
unsigned N;
int debug_level; // Verbosity
CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
{
EXPECT(N > 0, "CachedMatrix must be non-empty.");
EXPECT(B > 0, "Blocks must be non-empty.");
EXPECT(!(M % B), "Cache size must be divisible by block size.");
EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
unsigned NN = N*N;
items.resize(NN, 0);
this->N = N;
this->B = B;
this->debug_level = debug_level;
mem_blocks = (NN+B-1) / B;
cache_blocks = M / B;
cache_used = 0;
// Initialize the LRU list
blocks.resize(mem_blocks + 1);
lru_head = mem_blocks;
blocks[lru_head].lru_prev = lru_head;
blocks[lru_head].lru_next = lru_head;
if (debug_level > 0)
cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
}
// Read value at position (i,j), used only in testing code
unsigned read(unsigned i, unsigned j)
{
EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
return items[addr];
}
// Write value at position (i,j), used only in testing code
void write(unsigned i, unsigned j, unsigned data)
{
EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
items[addr] = data;
}
// Swap items (i1,j1) and (i2,j2)
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
if (debug_level > 1)
cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
access(addr1);
access(addr2);
std::swap(items[addr1], items[addr2]);
}
unsigned stat_cache_misses;
unsigned stat_accesses;
// Reset statistic counters.
void reset_stats()
{
stat_cache_misses = 0;
stat_accesses = 0;
}
static string coord_string(unsigned i, unsigned j)
{
return "(" + to_string(i) + "," + to_string(j) + ")";
}
#include "matrix_transpose.h"
private:
// Bring the given address to the cache.
void access(unsigned addr)
{
int i = addr / B; // Which block to bring
if (blocks[i].cached) {
lru_remove(i);
} else {
if (cache_used < cache_blocks) {
// We still have room in the cache.
cache_used++;
if (debug_level > 1)
cout << "\t\tLoading block " << i << endl;
} else {
// We need to evict the least-recently used block to make space.
unsigned replace = blocks[lru_head].lru_prev;
lru_remove(replace);
EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
blocks[replace].cached = false;
if (debug_level > 1)
cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
}
blocks[i].cached = true;
stat_cache_misses++;
}
lru_add_after(i, lru_head);
stat_accesses++;
}
// Remove block from the LRU list.
void lru_remove(unsigned i)
{
unsigned prev = blocks[i].lru_prev;
unsigned next = blocks[i].lru_next;
blocks[prev].lru_next = next;
blocks[next].lru_prev = prev;
}
// Add block at the given position in the LRU list.
void lru_add_after(unsigned i, unsigned after)
{
unsigned next = blocks[after].lru_next;
blocks[next].lru_prev = i;
blocks[after].lru_next = i;
blocks[i].lru_next = next;
blocks[i].lru_prev = after;
}
};
/* A cached matrix extended by methods for testing */
class TestMatrix : public CachedMatrix {
public:
TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
// Fill matrix with a testing pattern.
void fill_matrix()
{
if (debug_level > 1)
cout << "\tInitializing\n";
for (unsigned i = 0; i < N; i++)
for (unsigned j = 0; j < N; j++)
write(i, j, i*N + j);
}
// Check that the pattern corresponds to the properly transposed matrix.
void check_result()
{
if (debug_level > 1)
cout << "\tChecking\n";
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
unsigned want = j*N + i;
unsigned found = read(i, j);
unsigned found_i = found / N;
unsigned found_j = found % N;
EXPECT(found == want,
"Mismatch at position " + coord_string(i, j) +
": expected element from " + coord_string(j, i) +
", found element from " + coord_string(found_i, found_j) +
".");
}
}
}
// Transpose the matrix naively.
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
};
/*
* This file is #include'd inside the definition of a matrix class
* like this:
*
* class ClassName {
* // Number of rows and columns of the matrix
* unsigned N;
*
* // Swap elements (i1,j1) and (i2,j2)
* void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2);
*
* // Your code
* #include "matrix_transpose.h"
* }
*/
void transpose()
{
// TODO: Implement this efficiently
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
#include <functional>
#include <vector>
#include <iostream>
#include <iomanip>
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message);
#include "matrix_tests.h"
void generic_test(unsigned N, unsigned M, unsigned B, double max_ratio, int debug_level)
{
TestMatrix m(N, M, B, debug_level);
m.fill_matrix();
m.reset_stats();
m.transpose();
cout << "\t" << m.stat_cache_misses << " misses in " << m.stat_accesses << " accesses\n";
if (m.stat_accesses) {
double mpa = (double) m.stat_cache_misses / m.stat_accesses;
double lb = 1. / B;
double ratio = mpa / lb;
cout << "\t" <<
std::fixed << std::setprecision(6) <<
mpa << " misses/access (lower bound is " << lb <<
" => ratio " << ratio <<
", need " << max_ratio << ")\n";
EXPECT(ratio <= max_ratio, "Algorithm did too many I/O operations.");
}
m.check_result();
}
/*** A list of all tests ***/
vector<pair<string, function<void()>>> tests = {
// name N M B max_ratio debug_level
{ "small2k", [] { generic_test( 8, 32, 8, 8, 2 ); } },
{ "small", [] { generic_test( 13, 64, 8, 4, 2 ); } },
{ "n100b16", [] { generic_test( 100, 1024, 16, 3, 1 ); } },
{ "n1000b16", [] { generic_test(1000, 1024, 16, 3, 1 ); } },
{ "n1000b64", [] { generic_test(1000, 8192, 64, 3, 1 ); } },
{ "n1000b256", [] { generic_test(1000, 65536, 256, 5, 1 ); } },
{ "n1000b4096", [] { generic_test(1000, 65536, 4096, 50, 1 ); } },
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
from matrix_transpose import Matrix
import numpy
# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
# is not in the LRU.
B_CACHED = 0
B_LRU_NEXT = 1
B_LRU_PREV = 2
class CachedMatrix(Matrix):
"""A matrix stored in simulated cache"""
def __init__(self, N, M, B, debug_level=0):
assert N>0, "CachedMatrix must be non-empty."
assert B>0, "Blocks must be non-empty."
assert M%B == 0, "Cache size must be divisible by block size."
assert M >= 2*B, "Cache must have at least 2 blocks."
Matrix.__init__(self, N)
NN = N*N
self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
self.B = B
self.M = M
self.debug_level = debug_level
self.mem_blocks = (NN+B-1) // B
self.cache_blocks = M//B
self.cache_used = 0
# Initialize the LRU list. There is a virtual block right after the last real block,
# which serves as a head of the cyclic LRU list.
self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
self.lru_head = self.mem_blocks
self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
self.reset_stats()
if debug_level > 0:
print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
def _pos(self, i, j):
"""Convert position in matrix to linear address in simulated memory."""
return i*self.N + j
def read(self, i, j):
"""Read value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
return self.items[addr]
def write(self, i, j, value):
"""Write value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
self.items[addr] = value
def swap(self, i1, j1, i2, j2):
"""Swap items (i1,j1) and (i2,j2)."""
assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
"Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
if self.debug_level > 1:
print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
addr1 = self._pos(i1, j1)
addr2 = self._pos(i2, j2)
self._access(addr1)
self._access(addr2)
items = self.items
items[addr1], items[addr2] = items[addr2], items[addr1]
def reset_stats(self):
"""Reset statistic counters."""
self.stat_cache_misses = 0
self.stat_accesses = 0
def _access(self, addr):
"""Bring the given address to the cache."""
blocks = self.blocks
i = addr // self.B # Which block to bring
if blocks[i, B_CACHED] > 0:
self._lru_remove(i)
else:
if self.cache_used < self.cache_blocks:
# We still have room in the cache.
self.cache_used += 1
if self.debug_level > 1:
print("\t\tLoading block {}".format(i))
else:
# We need to evict the least-recently used block to make space.
replace = blocks[self.lru_head, B_LRU_PREV]
self._lru_remove(replace)
assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
blocks[replace, B_CACHED] = 0
if self.debug_level > 1:
print("\t\tLoading block {}, replacing {}".format(i, replace))
blocks[i, B_CACHED] = 1
self.stat_cache_misses += 1
self._lru_add_after(i, self.lru_head)
self.stat_accesses += 1
def _lru_remove(self, i):
"""Remove block from the LRU list."""
blocks = self.blocks
prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
blocks[prev, B_LRU_NEXT] = next
blocks[next, B_LRU_PREV] = prev
def _lru_add_after(self, i, after):
"""Add block at the given position in the LRU list."""
blocks = self.blocks
next = blocks[after, B_LRU_NEXT]
blocks[after, B_LRU_NEXT] = i
blocks[next, B_LRU_PREV] = i
blocks[i, B_LRU_NEXT] = next
blocks[i, B_LRU_PREV] = after
class TestMatrix(CachedMatrix):
"""A cached matrix extended by methods for testing."""
# Constructor is inherited
def fill_matrix(self):
"""Fill matrix with a testing pattern."""
if self.debug_level > 1:
print("\tInitializing")
N = self.N
for i in range(N):
for j in range(N):
self.write(i, j, i*N + j)
def check_result(self):
"""Check that the pattern corresponds to the properly transposed matrix."""
if self.debug_level > 1:
print("\tChecking")
N = self.N
for i in range(N):
for j in range(N):
want = j*N + i
have = self.read(i, j)
have_i = have // N
have_j = have % N
assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
def naive_transpose(self):
"""Transpose the matrix naively."""
for i in range(self.N):
for j in range(i):
self.swap(i, j, j, i)
class Matrix:
"""Interface of a matrix.
This class provides only the matrix size N and a method for swapping
two items. The actual storage of the matrix in memory is provided by
subclasses in testing code.
"""
def __init__(self, N):
self.N = N
def swap(self, i1, j1, i2, j2):
"""Swap elements (i1,j1) and (i2,j2)."""
# Overridden in subclasses
raise NotImplementedError
def transpose(self):
"""Transpose the matrix."""
# TODO: Implement more efficiently
for i in range(self.N):
for j in range(i):
self.swap(i, j, j, i)
#!/usr/bin/env python3
import math
import sys
from matrix_tests import TestMatrix
def generic_test(N, M, B, max_ratio, debug_level):
m = TestMatrix(N, M, B, debug_level)
m.fill_matrix()
m.reset_stats()
m.transpose()
print("\t{} misses in {} accesses".format(m.stat_cache_misses, m.stat_accesses))
if m.stat_accesses:
mpa = m.stat_cache_misses / m.stat_accesses
lb = 1 / B
ratio = mpa / lb
print("\t{:.6f} misses/access (lower bound is {:.6f} => ratio {:.6f}, need {:.6f})".format(mpa, lb, ratio, max_ratio))
assert ratio <= max_ratio, "Algorithm did too many I/O operations."
m.check_result()
# A list of all tests
tests = [
# name N M B max_ratio debug_level
("small2k", lambda: generic_test( 8, 32, 8, 8, 2 )),
("small", lambda: generic_test( 13, 64, 8, 4, 2 )),
("n100b16", lambda: generic_test( 100, 1024, 16, 3, 1 )),
("n1000b16", lambda: generic_test(1000, 1024, 16, 3, 1 )),
("n1000b64", lambda: generic_test(1000, 8192, 64, 3, 1 )),
("n1000b256", lambda: generic_test(1000, 65536, 256, 5, 1 )),
("n1000b4096", lambda: generic_test(1000, 65536, 4096, 50, 1 )),
]
if __name__ == "__main__":
for required_test in sys.argv[1:] or [name for name, _ in tests]:
for name, test in tests:
if name == required_test:
print("Running test {}".format(name), file=sys.stderr)
test()
break
else:
raise ValueError("Unknown test {}".format(name))
Implement cache-oblivious transposition of square matrices.
The Python version requires the NumPy module for storing the matrix
in memory efficiently.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment