Skip to content
Snippets Groups Projects
Commit faf92e46 authored by David Mareček's avatar David Mareček
Browse files

matrix transpose

parent 363f118d
No related branches found
No related tags found
No related merge requests found
test: matrix_transpose_test
./$<
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare
matrix_transpose_test: matrix_transpose_test.cpp matrix_transpose.h matrix_tests.h test_main.cpp
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@
clean:
rm -f matrix_transpose_test
.PHONY: clean test
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
/* A matrix stored in a simulated cache */
class CachedMatrix {
unsigned B; // Block size
unsigned mem_blocks; // Memory size in blocks
unsigned cache_blocks; // Cache size in blocks
unsigned cache_used; // How many blocks of cache we already used
// We store the matrix as a one-dimensional array
vector<unsigned> items;
unsigned pos(unsigned i, unsigned j) { return i*N + j; }
/*
* For each memory block, we keep the following structure.
* If the block is currently cached, we set cached == true
* and lru_prev/next point to neighboring blocks in the cyclic LRU list.
* Otherwise, cached == false and the block is not in the LRU.
*/
class MemoryBlock {
public:
unsigned lru_prev, lru_next;
bool cached;
MemoryBlock()
{
lru_prev = lru_next = 0;
cached = false;
}
};
vector<MemoryBlock> blocks;
// One block at the end of "blocks" serves as a head of the LRU list.
unsigned lru_head;
public:
// Number of rows and columns of the matrix
unsigned N;
int debug_level; // Verbosity
CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
{
EXPECT(N > 0, "CachedMatrix must be non-empty.");
EXPECT(B > 0, "Blocks must be non-empty.");
EXPECT(!(M % B), "Cache size must be divisible by block size.");
EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
unsigned NN = N*N;
items.resize(NN, 0);
this->N = N;
this->B = B;
this->debug_level = debug_level;
mem_blocks = (NN+B-1) / B;
cache_blocks = M / B;
cache_used = 0;
// Initialize the LRU list
blocks.resize(mem_blocks + 1);
lru_head = mem_blocks;
blocks[lru_head].lru_prev = lru_head;
blocks[lru_head].lru_next = lru_head;
if (debug_level > 0)
cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
}
// Read value at position (i,j), used only in testing code
unsigned read(unsigned i, unsigned j)
{
EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
return items[addr];
}
// Write value at position (i,j), used only in testing code
void write(unsigned i, unsigned j, unsigned data)
{
EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
items[addr] = data;
}
// Swap items (i1,j1) and (i2,j2)
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
if (debug_level > 1)
cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
access(addr1);
access(addr2);
std::swap(items[addr1], items[addr2]);
}
unsigned stat_cache_misses;
unsigned stat_accesses;
// Reset statistic counters.
void reset_stats()
{
stat_cache_misses = 0;
stat_accesses = 0;
}
static string coord_string(unsigned i, unsigned j)
{
return "(" + to_string(i) + "," + to_string(j) + ")";
}
#include "matrix_transpose.h"
private:
// Bring the given address to the cache.
void access(unsigned addr)
{
int i = addr / B; // Which block to bring
if (blocks[i].cached) {
lru_remove(i);
} else {
if (cache_used < cache_blocks) {
// We still have room in the cache.
cache_used++;
if (debug_level > 1)
cout << "\t\tLoading block " << i << endl;
} else {
// We need to evict the least-recently used block to make space.
unsigned replace = blocks[lru_head].lru_prev;
lru_remove(replace);
EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
blocks[replace].cached = false;
if (debug_level > 1)
cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
}
blocks[i].cached = true;
stat_cache_misses++;
}
lru_add_after(i, lru_head);
stat_accesses++;
}
// Remove block from the LRU list.
void lru_remove(unsigned i)
{
unsigned prev = blocks[i].lru_prev;
unsigned next = blocks[i].lru_next;
blocks[prev].lru_next = next;
blocks[next].lru_prev = prev;
}
// Add block at the given position in the LRU list.
void lru_add_after(unsigned i, unsigned after)
{
unsigned next = blocks[after].lru_next;
blocks[next].lru_prev = i;
blocks[after].lru_next = i;
blocks[i].lru_next = next;
blocks[i].lru_prev = after;
}
};
/* A cached matrix extended by methods for testing */
class TestMatrix : public CachedMatrix {
public:
TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
// Fill matrix with a testing pattern.
void fill_matrix()
{
if (debug_level > 1)
cout << "\tInitializing\n";
for (unsigned i = 0; i < N; i++)
for (unsigned j = 0; j < N; j++)
write(i, j, i*N + j);
}
// Check that the pattern corresponds to the properly transposed matrix.
void check_result()
{
if (debug_level > 1)
cout << "\tChecking\n";
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
unsigned want = j*N + i;
unsigned found = read(i, j);
unsigned found_i = found / N;
unsigned found_j = found % N;
EXPECT(found == want,
"Mismatch at position " + coord_string(i, j) +
": expected element from " + coord_string(j, i) +
", found element from " + coord_string(found_i, found_j) +
".");
}
}
}
// Transpose the matrix naively.
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
};
/*
* This file is #include'd inside the definition of a matrix class
* like this:
*
* class ClassName {
* // Number of rows and columns of the matrix
* unsigned N;
*
* // Swap elements (i1,j1) and (i2,j2)
* void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2);
*
* // Your code
* #include "matrix_transpose.h"
* }
*/
void transpose()
{
// TODO: Implement this efficiently
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
#include <functional>
#include <vector>
#include <iostream>
#include <iomanip>
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message);
#include "matrix_tests.h"
void generic_test(unsigned N, unsigned M, unsigned B, double max_ratio, int debug_level)
{
TestMatrix m(N, M, B, debug_level);
m.fill_matrix();
m.reset_stats();
m.transpose();
cout << "\t" << m.stat_cache_misses << " misses in " << m.stat_accesses << " accesses\n";
if (m.stat_accesses) {
double swaps = N * (N-1) / 2;
double mpa = (double) m.stat_cache_misses / swaps;
double lb = 2. / B;
double ratio = mpa / lb;
cout << "\t" <<
std::fixed << std::setprecision(6) <<
mpa << " misses/swap (lower bound is " << lb <<
" => ratio " << ratio <<
", limit " << max_ratio << ")\n";
EXPECT(ratio <= max_ratio, "Algorithm did too many I/O operations.");
}
m.check_result();
}
/*** A list of all tests ***/
vector<pair<string, function<void()>>> tests = {
// name N M B max_ratio debug_level
{ "small2k", [] { generic_test( 8, 32, 8, 8, 2 ); } },
{ "small", [] { generic_test( 13, 64, 8, 5, 2 ); } },
{ "n100b16", [] { generic_test( 100, 1024, 16, 3, 1 ); } },
{ "n1000b16", [] { generic_test(1000, 1024, 16, 3, 1 ); } },
{ "n1000b64", [] { generic_test(1000, 8192, 64, 4, 1 ); } },
{ "n1000b256", [] { generic_test(1000, 65536, 256, 7, 1 ); } },
{ "n1000b4096", [] { generic_test(1000, 65536, 4096, 60, 1 ); } },
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
from matrix_transpose import Matrix
import numpy
# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
# is not in the LRU.
B_CACHED = 0
B_LRU_NEXT = 1
B_LRU_PREV = 2
class CachedMatrix(Matrix):
"""A matrix stored in simulated cache"""
def __init__(self, N, M, B, debug_level=0):
assert N>0, "CachedMatrix must be non-empty."
assert B>0, "Blocks must be non-empty."
assert M%B == 0, "Cache size must be divisible by block size."
assert M >= 2*B, "Cache must have at least 2 blocks."
Matrix.__init__(self, N)
NN = N*N
self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
self.B = B
self.M = M
self.debug_level = debug_level
self.mem_blocks = (NN+B-1) // B
self.cache_blocks = M//B
self.cache_used = 0
# Initialize the LRU list. There is a virtual block right after the last real block,
# which serves as a head of the cyclic LRU list.
self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
self.lru_head = self.mem_blocks
self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
self.reset_stats()
if debug_level > 0:
print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
def _pos(self, i, j):
"""Convert position in matrix to linear address in simulated memory."""
return i*self.N + j
def read(self, i, j):
"""Read value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
return self.items[addr]
def write(self, i, j, value):
"""Write value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
self.items[addr] = value
def swap(self, i1, j1, i2, j2):
"""Swap items (i1,j1) and (i2,j2)."""
assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
"Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
if self.debug_level > 1:
print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
addr1 = self._pos(i1, j1)
addr2 = self._pos(i2, j2)
self._access(addr1)
self._access(addr2)
items = self.items
items[addr1], items[addr2] = items[addr2], items[addr1]
def reset_stats(self):
"""Reset statistic counters."""
self.stat_cache_misses = 0
self.stat_accesses = 0
def _access(self, addr):
"""Bring the given address to the cache."""
blocks = self.blocks
i = addr // self.B # Which block to bring
if blocks[i, B_CACHED] > 0:
self._lru_remove(i)
else:
if self.cache_used < self.cache_blocks:
# We still have room in the cache.
self.cache_used += 1
if self.debug_level > 1:
print("\t\tLoading block {}".format(i))
else:
# We need to evict the least-recently used block to make space.
replace = blocks[self.lru_head, B_LRU_PREV]
self._lru_remove(replace)
assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
blocks[replace, B_CACHED] = 0
if self.debug_level > 1:
print("\t\tLoading block {}, replacing {}".format(i, replace))
blocks[i, B_CACHED] = 1
self.stat_cache_misses += 1
self._lru_add_after(i, self.lru_head)
self.stat_accesses += 1
def _lru_remove(self, i):
"""Remove block from the LRU list."""
blocks = self.blocks
prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
blocks[prev, B_LRU_NEXT] = next
blocks[next, B_LRU_PREV] = prev
def _lru_add_after(self, i, after):
"""Add block at the given position in the LRU list."""
blocks = self.blocks
next = blocks[after, B_LRU_NEXT]
blocks[after, B_LRU_NEXT] = i
blocks[next, B_LRU_PREV] = i
blocks[i, B_LRU_NEXT] = next
blocks[i, B_LRU_PREV] = after
class TestMatrix(CachedMatrix):
"""A cached matrix extended by methods for testing."""
# Constructor is inherited
def fill_matrix(self):
"""Fill matrix with a testing pattern."""
if self.debug_level > 1:
print("\tInitializing")
N = self.N
for i in range(N):
for j in range(N):
self.write(i, j, i*N + j)
def check_result(self):
"""Check that the pattern corresponds to the properly transposed matrix."""
if self.debug_level > 1:
print("\tChecking")
N = self.N
for i in range(N):
for j in range(N):
want = j*N + i
have = self.read(i, j)
have_i = have // N
have_j = have % N
assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
def naive_transpose(self):
"""Transpose the matrix naively."""
for i in range(self.N):
for j in range(i):
self.swap(i, j, j, i)
class Matrix:
"""Interface of a matrix.
This class provides only the matrix size N and a method for swapping
two items. The actual storage of the matrix in memory is provided by
subclasses in testing code.
"""
def __init__(self, N):
self.N = N
def swap(self, i1, j1, i2, j2):
"""Swap elements (i1,j1) and (i2,j2)."""
# Overridden in subclasses
raise NotImplementedError
def transpose(self):
"""Transpose the matrix."""
# TODO: Implement more efficiently
for i in range(self.N):
for j in range(i):
self.swap(i, j, j, i)
#!/usr/bin/env python3
import math
import sys
from matrix_tests import TestMatrix
def generic_test(N, M, B, max_ratio, debug_level):
m = TestMatrix(N, M, B, debug_level)
m.fill_matrix()
m.reset_stats()
m.transpose()
print("\t{} misses in {} accesses".format(m.stat_cache_misses, m.stat_accesses))
if m.stat_accesses:
swaps = N * (N-1) / 2
mpa = m.stat_cache_misses / swaps
lb = 2. / B
ratio = mpa / lb
print("\t{:.6f} misses/swap (lower bound is {:.6f} => ratio {:.6f}, limit {:.6f})".format(mpa, lb, ratio, max_ratio))
assert ratio <= max_ratio, "Algorithm did too many I/O operations."
m.check_result()
# A list of all tests
tests = [
# name N M B max_ratio debug_level
("small2k", lambda: generic_test( 8, 32, 8, 8, 2 )),
("small", lambda: generic_test( 13, 64, 8, 5, 2 )),
("n100b16", lambda: generic_test( 100, 1024, 16, 3, 1 )),
("n1000b16", lambda: generic_test(1000, 1024, 16, 3, 1 )),
("n1000b64", lambda: generic_test(1000, 8192, 64, 4, 1 )),
("n1000b256", lambda: generic_test(1000, 65536, 256, 7, 1 )),
("n1000b4096", lambda: generic_test(1000, 65536, 4096, 60, 1 )),
]
if __name__ == "__main__":
for required_test in sys.argv[1:] or [name for name, _ in tests]:
for name, test in tests:
if name == required_test:
print("Running test {}".format(name), file=sys.stderr)
test()
break
else:
raise ValueError("Unknown test {}".format(name))
Implement cache-oblivious transposition of square matrices.
The Python version requires the NumPy module for storing the matrix
in memory efficiently.
Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment