diff --git a/06-matrix_transpose/cpp/Makefile b/06-matrix_transpose/cpp/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d21f42477e1a907efa2fab6c1611f67a8641f017 --- /dev/null +++ b/06-matrix_transpose/cpp/Makefile @@ -0,0 +1,12 @@ +test: matrix_transpose_test + ./$< + +CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare + +matrix_transpose_test: matrix_transpose_test.cpp matrix_transpose.h matrix_tests.h test_main.cpp + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ + +clean: + rm -f matrix_transpose_test + +.PHONY: clean test diff --git a/06-matrix_transpose/cpp/matrix_tests.h b/06-matrix_transpose/cpp/matrix_tests.h new file mode 100644 index 0000000000000000000000000000000000000000..6110f8efc37782f04f949fb30f9e18392cc325e2 --- /dev/null +++ b/06-matrix_transpose/cpp/matrix_tests.h @@ -0,0 +1,213 @@ +#include <string> +#include <vector> +#include <iostream> +#include <algorithm> + +/* A matrix stored in a simulated cache */ + +class CachedMatrix { + unsigned B; // Block size + unsigned mem_blocks; // Memory size in blocks + unsigned cache_blocks; // Cache size in blocks + unsigned cache_used; // How many blocks of cache we already used + + // We store the matrix as a one-dimensional array + vector<unsigned> items; + unsigned pos(unsigned i, unsigned j) { return i*N + j; } + + /* + * For each memory block, we keep the following structure. + * If the block is currently cached, we set cached == true + * and lru_prev/next point to neighboring blocks in the cyclic LRU list. + * Otherwise, cached == false and the block is not in the LRU. + */ + + class MemoryBlock { + public: + unsigned lru_prev, lru_next; + bool cached; + MemoryBlock() + { + lru_prev = lru_next = 0; + cached = false; + } + }; + + vector<MemoryBlock> blocks; + + // One block at the end of "blocks" serves as a head of the LRU list. + unsigned lru_head; + + public: + // Number of rows and columns of the matrix + unsigned N; + + int debug_level; // Verbosity + + CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0) + { + EXPECT(N > 0, "CachedMatrix must be non-empty."); + EXPECT(B > 0, "Blocks must be non-empty."); + EXPECT(!(M % B), "Cache size must be divisible by block size."); + EXPECT(M >= 2*B, "Cache must have at least 2 blocks."); + + unsigned NN = N*N; + items.resize(NN, 0); + + this->N = N; + this->B = B; + this->debug_level = debug_level; + mem_blocks = (NN+B-1) / B; + cache_blocks = M / B; + cache_used = 0; + + // Initialize the LRU list + blocks.resize(mem_blocks + 1); + lru_head = mem_blocks; + blocks[lru_head].lru_prev = lru_head; + blocks[lru_head].lru_next = lru_head; + + if (debug_level > 0) + cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n"; + } + + // Read value at position (i,j), used only in testing code + unsigned read(unsigned i, unsigned j) + { + EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + "."); + unsigned addr = pos(i, j); + access(addr); + return items[addr]; + } + + // Write value at position (i,j), used only in testing code + void write(unsigned i, unsigned j, unsigned data) + { + EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + "."); + unsigned addr = pos(i, j); + access(addr); + items[addr] = data; + } + + // Swap items (i1,j1) and (i2,j2) + void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2) + { + EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + "."); + if (debug_level > 1) + cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl; + unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2); + access(addr1); + access(addr2); + std::swap(items[addr1], items[addr2]); + } + + unsigned stat_cache_misses; + unsigned stat_accesses; + + // Reset statistic counters. + void reset_stats() + { + stat_cache_misses = 0; + stat_accesses = 0; + } + + static string coord_string(unsigned i, unsigned j) + { + return "(" + to_string(i) + "," + to_string(j) + ")"; + } + +#include "matrix_transpose.h" + + private: + // Bring the given address to the cache. + void access(unsigned addr) + { + int i = addr / B; // Which block to bring + if (blocks[i].cached) { + lru_remove(i); + } else { + if (cache_used < cache_blocks) { + // We still have room in the cache. + cache_used++; + if (debug_level > 1) + cout << "\t\tLoading block " << i << endl; + } else { + // We need to evict the least-recently used block to make space. + unsigned replace = blocks[lru_head].lru_prev; + lru_remove(replace); + EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list."); + blocks[replace].cached = false; + if (debug_level > 1) + cout << "\t\tLoading block " << i << ", replacing " << replace << endl; + } + blocks[i].cached = true; + stat_cache_misses++; + } + lru_add_after(i, lru_head); + stat_accesses++; + } + + // Remove block from the LRU list. + void lru_remove(unsigned i) + { + unsigned prev = blocks[i].lru_prev; + unsigned next = blocks[i].lru_next; + blocks[prev].lru_next = next; + blocks[next].lru_prev = prev; + } + + // Add block at the given position in the LRU list. + void lru_add_after(unsigned i, unsigned after) + { + unsigned next = blocks[after].lru_next; + blocks[next].lru_prev = i; + blocks[after].lru_next = i; + blocks[i].lru_next = next; + blocks[i].lru_prev = after; + } +}; + +/* A cached matrix extended by methods for testing */ + +class TestMatrix : public CachedMatrix { + public: + TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { } + + // Fill matrix with a testing pattern. + void fill_matrix() + { + if (debug_level > 1) + cout << "\tInitializing\n"; + for (unsigned i = 0; i < N; i++) + for (unsigned j = 0; j < N; j++) + write(i, j, i*N + j); + } + + // Check that the pattern corresponds to the properly transposed matrix. + void check_result() + { + if (debug_level > 1) + cout << "\tChecking\n"; + for (unsigned i = 0; i < N; i++) { + for (unsigned j = 0; j < N; j++) { + unsigned want = j*N + i; + unsigned found = read(i, j); + unsigned found_i = found / N; + unsigned found_j = found % N; + EXPECT(found == want, + "Mismatch at position " + coord_string(i, j) + + ": expected element from " + coord_string(j, i) + + ", found element from " + coord_string(found_i, found_j) + + "."); + } + } + } + + // Transpose the matrix naively. + void naive_transpose() + { + for (unsigned i=0; i<N; i++) + for (unsigned j=0; j<i; j++) + swap(i, j, j, i); + } +}; diff --git a/06-matrix_transpose/cpp/matrix_transpose.h b/06-matrix_transpose/cpp/matrix_transpose.h new file mode 100644 index 0000000000000000000000000000000000000000..ddc58b5a02da9a79eb62b3ef02288a022253fe8d --- /dev/null +++ b/06-matrix_transpose/cpp/matrix_transpose.h @@ -0,0 +1,24 @@ +/* + * This file is #include'd inside the definition of a matrix class + * like this: + * + * class ClassName { + * // Number of rows and columns of the matrix + * unsigned N; + * + * // Swap elements (i1,j1) and (i2,j2) + * void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2); + * + * // Your code + * #include "matrix_transpose.h" + * } + */ + +void transpose() +{ + // TODO: Implement this efficiently + + for (unsigned i=0; i<N; i++) + for (unsigned j=0; j<i; j++) + swap(i, j, j, i); +} diff --git a/06-matrix_transpose/cpp/matrix_transpose_test.cpp b/06-matrix_transpose/cpp/matrix_transpose_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8140103a0855f8b9125cf45ec2636b3f59604b65 --- /dev/null +++ b/06-matrix_transpose/cpp/matrix_transpose_test.cpp @@ -0,0 +1,50 @@ +#include <functional> +#include <vector> +#include <iostream> +#include <iomanip> + +using namespace std; + +// If the condition is not true, report an error and halt. +#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0) + +void expect_failed(const string& message); + +#include "matrix_tests.h" + +void generic_test(unsigned N, unsigned M, unsigned B, double max_ratio, int debug_level) +{ + TestMatrix m(N, M, B, debug_level); + m.fill_matrix(); + m.reset_stats(); + m.transpose(); + + cout << "\t" << m.stat_cache_misses << " misses in " << m.stat_accesses << " accesses\n"; + if (m.stat_accesses) { + double swaps = N * (N-1) / 2; + double mpa = (double) m.stat_cache_misses / swaps; + double lb = 2. / B; + double ratio = mpa / lb; + cout << "\t" << + std::fixed << std::setprecision(6) << + mpa << " misses/swap (lower bound is " << lb << + " => ratio " << ratio << + ", limit " << max_ratio << ")\n"; + EXPECT(ratio <= max_ratio, "Algorithm did too many I/O operations."); + } + + m.check_result(); +} + +/*** A list of all tests ***/ + +vector<pair<string, function<void()>>> tests = { +// name N M B max_ratio debug_level + { "small2k", [] { generic_test( 8, 32, 8, 8, 2 ); } }, + { "small", [] { generic_test( 13, 64, 8, 5, 2 ); } }, + { "n100b16", [] { generic_test( 100, 1024, 16, 3, 1 ); } }, + { "n1000b16", [] { generic_test(1000, 1024, 16, 3, 1 ); } }, + { "n1000b64", [] { generic_test(1000, 8192, 64, 4, 1 ); } }, + { "n1000b256", [] { generic_test(1000, 65536, 256, 7, 1 ); } }, + { "n1000b4096", [] { generic_test(1000, 65536, 4096, 60, 1 ); } }, +}; diff --git a/06-matrix_transpose/cpp/test_main.cpp b/06-matrix_transpose/cpp/test_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f4aff0785f636b7fd0ea1a15aa69dafe06f290f --- /dev/null +++ b/06-matrix_transpose/cpp/test_main.cpp @@ -0,0 +1,43 @@ +#include <cstdlib> +#include <functional> +#include <iostream> +#include <string> +#include <utility> +#include <vector> + +using namespace std; + +extern vector<pair<string, function<void()>>> tests; + +void expect_failed(const string& message) { + cerr << "Test error: " << message << endl; + exit(1); +} + +int main(int argc, char* argv[]) { + vector<string> required_tests; + + if (argc > 1) { + required_tests.assign(argv + 1, argv + argc); + } else { + for (const auto& test : tests) + required_tests.push_back(test.first); + } + + for (const auto& required_test : required_tests) { + bool found = false; + for (const auto& test : tests) + if (required_test == test.first) { + cerr << "Running test " << required_test << endl; + test.second(); + found = true; + break; + } + if (!found) { + cerr << "Unknown test " << required_test << endl; + return 1; + } + } + + return 0; +} diff --git a/06-matrix_transpose/python/matrix_tests.py b/06-matrix_transpose/python/matrix_tests.py new file mode 100644 index 0000000000000000000000000000000000000000..f002701a4e68f65cbde033b2058a79d71aad61ba --- /dev/null +++ b/06-matrix_transpose/python/matrix_tests.py @@ -0,0 +1,165 @@ +from matrix_transpose import Matrix +import numpy + +# Description of memory blocks is stored in an array blocks[block_index][B_xxx]. +# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to +# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block +# is not in the LRU. +B_CACHED = 0 +B_LRU_NEXT = 1 +B_LRU_PREV = 2 + +class CachedMatrix(Matrix): + """A matrix stored in simulated cache""" + + def __init__(self, N, M, B, debug_level=0): + assert N>0, "CachedMatrix must be non-empty." + assert B>0, "Blocks must be non-empty." + assert M%B == 0, "Cache size must be divisible by block size." + assert M >= 2*B, "Cache must have at least 2 blocks." + + Matrix.__init__(self, N) + + NN = N*N + self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C") + + self.B = B + self.M = M + self.debug_level = debug_level + self.mem_blocks = (NN+B-1) // B + self.cache_blocks = M//B + self.cache_used = 0 + + # Initialize the LRU list. There is a virtual block right after the last real block, + # which serves as a head of the cyclic LRU list. + self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C") + self.lru_head = self.mem_blocks + self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head + self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head + + self.reset_stats() + + if debug_level > 0: + print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks)) + + def _pos(self, i, j): + """Convert position in matrix to linear address in simulated memory.""" + + return i*self.N + j + + def read(self, i, j): + """Read value at position (i,j), used only in testing code.""" + + assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j) + addr = self._pos(i, j) + self._access(addr) + return self.items[addr] + + def write(self, i, j, value): + """Write value at position (i,j), used only in testing code.""" + + assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j) + addr = self._pos(i, j) + self._access(addr) + self.items[addr] = value + + def swap(self, i1, j1, i2, j2): + """Swap items (i1,j1) and (i2,j2).""" + + assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \ + i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \ + "Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2) + if self.debug_level > 1: + print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2)) + addr1 = self._pos(i1, j1) + addr2 = self._pos(i2, j2) + self._access(addr1) + self._access(addr2) + items = self.items + items[addr1], items[addr2] = items[addr2], items[addr1] + + def reset_stats(self): + """Reset statistic counters.""" + + self.stat_cache_misses = 0 + self.stat_accesses = 0 + + def _access(self, addr): + """Bring the given address to the cache.""" + + blocks = self.blocks + i = addr // self.B # Which block to bring + if blocks[i, B_CACHED] > 0: + self._lru_remove(i) + else: + if self.cache_used < self.cache_blocks: + # We still have room in the cache. + self.cache_used += 1 + if self.debug_level > 1: + print("\t\tLoading block {}".format(i)) + else: + # We need to evict the least-recently used block to make space. + replace = blocks[self.lru_head, B_LRU_PREV] + self._lru_remove(replace) + assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list" + blocks[replace, B_CACHED] = 0 + if self.debug_level > 1: + print("\t\tLoading block {}, replacing {}".format(i, replace)) + blocks[i, B_CACHED] = 1 + self.stat_cache_misses += 1 + self._lru_add_after(i, self.lru_head) + self.stat_accesses += 1 + + def _lru_remove(self, i): + """Remove block from the LRU list.""" + + blocks = self.blocks + prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT] + blocks[prev, B_LRU_NEXT] = next + blocks[next, B_LRU_PREV] = prev + + def _lru_add_after(self, i, after): + """Add block at the given position in the LRU list.""" + + blocks = self.blocks + next = blocks[after, B_LRU_NEXT] + blocks[after, B_LRU_NEXT] = i + blocks[next, B_LRU_PREV] = i + blocks[i, B_LRU_NEXT] = next + blocks[i, B_LRU_PREV] = after + +class TestMatrix(CachedMatrix): + """A cached matrix extended by methods for testing.""" + + # Constructor is inherited + + def fill_matrix(self): + """Fill matrix with a testing pattern.""" + + if self.debug_level > 1: + print("\tInitializing") + N = self.N + for i in range(N): + for j in range(N): + self.write(i, j, i*N + j) + + def check_result(self): + """Check that the pattern corresponds to the properly transposed matrix.""" + + if self.debug_level > 1: + print("\tChecking") + N = self.N + for i in range(N): + for j in range(N): + want = j*N + i + have = self.read(i, j) + have_i = have // N + have_j = have % N + assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j) + + def naive_transpose(self): + """Transpose the matrix naively.""" + + for i in range(self.N): + for j in range(i): + self.swap(i, j, j, i) diff --git a/06-matrix_transpose/python/matrix_transpose.py b/06-matrix_transpose/python/matrix_transpose.py new file mode 100644 index 0000000000000000000000000000000000000000..77570f47c7ca42fc2c6d7367af5860a27ade5c1e --- /dev/null +++ b/06-matrix_transpose/python/matrix_transpose.py @@ -0,0 +1,24 @@ +class Matrix: + """Interface of a matrix. + + This class provides only the matrix size N and a method for swapping + two items. The actual storage of the matrix in memory is provided by + subclasses in testing code. + """ + + def __init__(self, N): + self.N = N + + def swap(self, i1, j1, i2, j2): + """Swap elements (i1,j1) and (i2,j2).""" + + # Overridden in subclasses + raise NotImplementedError + + def transpose(self): + """Transpose the matrix.""" + + # TODO: Implement more efficiently + for i in range(self.N): + for j in range(i): + self.swap(i, j, j, i) diff --git a/06-matrix_transpose/python/matrix_transpose_test.py b/06-matrix_transpose/python/matrix_transpose_test.py new file mode 100644 index 0000000000000000000000000000000000000000..fc0157e5f7ef266011948a12f235d02789718a97 --- /dev/null +++ b/06-matrix_transpose/python/matrix_transpose_test.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python3 +import math +import sys + +from matrix_tests import TestMatrix + +def generic_test(N, M, B, max_ratio, debug_level): + m = TestMatrix(N, M, B, debug_level) + m.fill_matrix() + m.reset_stats() + m.transpose() + + print("\t{} misses in {} accesses".format(m.stat_cache_misses, m.stat_accesses)) + if m.stat_accesses: + swaps = N * (N-1) / 2 + mpa = m.stat_cache_misses / swaps + lb = 2. / B + ratio = mpa / lb + print("\t{:.6f} misses/swap (lower bound is {:.6f} => ratio {:.6f}, limit {:.6f})".format(mpa, lb, ratio, max_ratio)) + assert ratio <= max_ratio, "Algorithm did too many I/O operations." + + m.check_result() + +# A list of all tests +tests = [ + # name N M B max_ratio debug_level + ("small2k", lambda: generic_test( 8, 32, 8, 8, 2 )), + ("small", lambda: generic_test( 13, 64, 8, 5, 2 )), + ("n100b16", lambda: generic_test( 100, 1024, 16, 3, 1 )), + ("n1000b16", lambda: generic_test(1000, 1024, 16, 3, 1 )), + ("n1000b64", lambda: generic_test(1000, 8192, 64, 4, 1 )), + ("n1000b256", lambda: generic_test(1000, 65536, 256, 7, 1 )), + ("n1000b4096", lambda: generic_test(1000, 65536, 4096, 60, 1 )), +] + +if __name__ == "__main__": + for required_test in sys.argv[1:] or [name for name, _ in tests]: + for name, test in tests: + if name == required_test: + print("Running test {}".format(name), file=sys.stderr) + test() + break + else: + raise ValueError("Unknown test {}".format(name)) diff --git a/06-matrix_transpose/task.md b/06-matrix_transpose/task.md new file mode 100644 index 0000000000000000000000000000000000000000..55f1d577ec29ae59e31fadd02037ab3bada500a1 --- /dev/null +++ b/06-matrix_transpose/task.md @@ -0,0 +1,6 @@ +Implement cache-oblivious transposition of square matrices. + +The Python version requires the NumPy module for storing the matrix +in memory efficiently. + +Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).