diff --git a/06-matrix_transpose/cpp/Makefile b/06-matrix_transpose/cpp/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..d21f42477e1a907efa2fab6c1611f67a8641f017
--- /dev/null
+++ b/06-matrix_transpose/cpp/Makefile
@@ -0,0 +1,12 @@
+test: matrix_transpose_test
+ ./$<
+
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare
+
+matrix_transpose_test: matrix_transpose_test.cpp matrix_transpose.h matrix_tests.h test_main.cpp
+ $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@
+
+clean:
+ rm -f matrix_transpose_test
+
+.PHONY: clean test
diff --git a/06-matrix_transpose/cpp/matrix_tests.h b/06-matrix_transpose/cpp/matrix_tests.h
new file mode 100644
index 0000000000000000000000000000000000000000..6110f8efc37782f04f949fb30f9e18392cc325e2
--- /dev/null
+++ b/06-matrix_transpose/cpp/matrix_tests.h
@@ -0,0 +1,213 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+/* A matrix stored in a simulated cache */
+
+class CachedMatrix {
+ unsigned B; // Block size
+ unsigned mem_blocks; // Memory size in blocks
+ unsigned cache_blocks; // Cache size in blocks
+ unsigned cache_used; // How many blocks of cache we already used
+
+ // We store the matrix as a one-dimensional array
+ vector<unsigned> items;
+ unsigned pos(unsigned i, unsigned j) { return i*N + j; }
+
+ /*
+ * For each memory block, we keep the following structure.
+ * If the block is currently cached, we set cached == true
+ * and lru_prev/next point to neighboring blocks in the cyclic LRU list.
+ * Otherwise, cached == false and the block is not in the LRU.
+ */
+
+ class MemoryBlock {
+ public:
+ unsigned lru_prev, lru_next;
+ bool cached;
+ MemoryBlock()
+ {
+ lru_prev = lru_next = 0;
+ cached = false;
+ }
+ };
+
+ vector<MemoryBlock> blocks;
+
+ // One block at the end of "blocks" serves as a head of the LRU list.
+ unsigned lru_head;
+
+ public:
+ // Number of rows and columns of the matrix
+ unsigned N;
+
+ int debug_level; // Verbosity
+
+ CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
+ {
+ EXPECT(N > 0, "CachedMatrix must be non-empty.");
+ EXPECT(B > 0, "Blocks must be non-empty.");
+ EXPECT(!(M % B), "Cache size must be divisible by block size.");
+ EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
+
+ unsigned NN = N*N;
+ items.resize(NN, 0);
+
+ this->N = N;
+ this->B = B;
+ this->debug_level = debug_level;
+ mem_blocks = (NN+B-1) / B;
+ cache_blocks = M / B;
+ cache_used = 0;
+
+ // Initialize the LRU list
+ blocks.resize(mem_blocks + 1);
+ lru_head = mem_blocks;
+ blocks[lru_head].lru_prev = lru_head;
+ blocks[lru_head].lru_next = lru_head;
+
+ if (debug_level > 0)
+ cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
+ }
+
+ // Read value at position (i,j), used only in testing code
+ unsigned read(unsigned i, unsigned j)
+ {
+ EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
+ unsigned addr = pos(i, j);
+ access(addr);
+ return items[addr];
+ }
+
+ // Write value at position (i,j), used only in testing code
+ void write(unsigned i, unsigned j, unsigned data)
+ {
+ EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
+ unsigned addr = pos(i, j);
+ access(addr);
+ items[addr] = data;
+ }
+
+ // Swap items (i1,j1) and (i2,j2)
+ void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
+ {
+ EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
+ if (debug_level > 1)
+ cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
+ unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
+ access(addr1);
+ access(addr2);
+ std::swap(items[addr1], items[addr2]);
+ }
+
+ unsigned stat_cache_misses;
+ unsigned stat_accesses;
+
+ // Reset statistic counters.
+ void reset_stats()
+ {
+ stat_cache_misses = 0;
+ stat_accesses = 0;
+ }
+
+ static string coord_string(unsigned i, unsigned j)
+ {
+ return "(" + to_string(i) + "," + to_string(j) + ")";
+ }
+
+#include "matrix_transpose.h"
+
+ private:
+ // Bring the given address to the cache.
+ void access(unsigned addr)
+ {
+ int i = addr / B; // Which block to bring
+ if (blocks[i].cached) {
+ lru_remove(i);
+ } else {
+ if (cache_used < cache_blocks) {
+ // We still have room in the cache.
+ cache_used++;
+ if (debug_level > 1)
+ cout << "\t\tLoading block " << i << endl;
+ } else {
+ // We need to evict the least-recently used block to make space.
+ unsigned replace = blocks[lru_head].lru_prev;
+ lru_remove(replace);
+ EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
+ blocks[replace].cached = false;
+ if (debug_level > 1)
+ cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
+ }
+ blocks[i].cached = true;
+ stat_cache_misses++;
+ }
+ lru_add_after(i, lru_head);
+ stat_accesses++;
+ }
+
+ // Remove block from the LRU list.
+ void lru_remove(unsigned i)
+ {
+ unsigned prev = blocks[i].lru_prev;
+ unsigned next = blocks[i].lru_next;
+ blocks[prev].lru_next = next;
+ blocks[next].lru_prev = prev;
+ }
+
+ // Add block at the given position in the LRU list.
+ void lru_add_after(unsigned i, unsigned after)
+ {
+ unsigned next = blocks[after].lru_next;
+ blocks[next].lru_prev = i;
+ blocks[after].lru_next = i;
+ blocks[i].lru_next = next;
+ blocks[i].lru_prev = after;
+ }
+};
+
+/* A cached matrix extended by methods for testing */
+
+class TestMatrix : public CachedMatrix {
+ public:
+ TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
+
+ // Fill matrix with a testing pattern.
+ void fill_matrix()
+ {
+ if (debug_level > 1)
+ cout << "\tInitializing\n";
+ for (unsigned i = 0; i < N; i++)
+ for (unsigned j = 0; j < N; j++)
+ write(i, j, i*N + j);
+ }
+
+ // Check that the pattern corresponds to the properly transposed matrix.
+ void check_result()
+ {
+ if (debug_level > 1)
+ cout << "\tChecking\n";
+ for (unsigned i = 0; i < N; i++) {
+ for (unsigned j = 0; j < N; j++) {
+ unsigned want = j*N + i;
+ unsigned found = read(i, j);
+ unsigned found_i = found / N;
+ unsigned found_j = found % N;
+ EXPECT(found == want,
+ "Mismatch at position " + coord_string(i, j) +
+ ": expected element from " + coord_string(j, i) +
+ ", found element from " + coord_string(found_i, found_j) +
+ ".");
+ }
+ }
+ }
+
+ // Transpose the matrix naively.
+ void naive_transpose()
+ {
+ for (unsigned i=0; i<N; i++)
+ for (unsigned j=0; j<i; j++)
+ swap(i, j, j, i);
+ }
+};
diff --git a/06-matrix_transpose/cpp/matrix_transpose.h b/06-matrix_transpose/cpp/matrix_transpose.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddc58b5a02da9a79eb62b3ef02288a022253fe8d
--- /dev/null
+++ b/06-matrix_transpose/cpp/matrix_transpose.h
@@ -0,0 +1,24 @@
+/*
+ * This file is #include'd inside the definition of a matrix class
+ * like this:
+ *
+ * class ClassName {
+ * // Number of rows and columns of the matrix
+ * unsigned N;
+ *
+ * // Swap elements (i1,j1) and (i2,j2)
+ * void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2);
+ *
+ * // Your code
+ * #include "matrix_transpose.h"
+ * }
+ */
+
+void transpose()
+{
+ // TODO: Implement this efficiently
+
+ for (unsigned i=0; i<N; i++)
+ for (unsigned j=0; j<i; j++)
+ swap(i, j, j, i);
+}
diff --git a/06-matrix_transpose/cpp/matrix_transpose_test.cpp b/06-matrix_transpose/cpp/matrix_transpose_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cbbfff1134c95ec1d5a49dc0425f5ff2358930fd
--- /dev/null
+++ b/06-matrix_transpose/cpp/matrix_transpose_test.cpp
@@ -0,0 +1,49 @@
+#include <functional>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+#include "matrix_tests.h"
+
+void generic_test(unsigned N, unsigned M, unsigned B, double max_ratio, int debug_level)
+{
+ TestMatrix m(N, M, B, debug_level);
+ m.fill_matrix();
+ m.reset_stats();
+ m.transpose();
+
+ cout << "\t" << m.stat_cache_misses << " misses in " << m.stat_accesses << " accesses\n";
+ if (m.stat_accesses) {
+ double mpa = (double) m.stat_cache_misses / m.stat_accesses;
+ double lb = 1. / B;
+ double ratio = mpa / lb;
+ cout << "\t" <<
+ std::fixed << std::setprecision(6) <<
+ mpa << " misses/access (lower bound is " << lb <<
+ " => ratio " << ratio <<
+ ", need " << max_ratio << ")\n";
+ EXPECT(ratio <= max_ratio, "Algorithm did too many I/O operations.");
+ }
+
+ m.check_result();
+}
+
+/*** A list of all tests ***/
+
+vector<pair<string, function<void()>>> tests = {
+// name N M B max_ratio debug_level
+ { "small2k", [] { generic_test( 8, 32, 8, 8, 2 ); } },
+ { "small", [] { generic_test( 13, 64, 8, 4, 2 ); } },
+ { "n100b16", [] { generic_test( 100, 1024, 16, 3, 1 ); } },
+ { "n1000b16", [] { generic_test(1000, 1024, 16, 3, 1 ); } },
+ { "n1000b64", [] { generic_test(1000, 8192, 64, 3, 1 ); } },
+ { "n1000b256", [] { generic_test(1000, 65536, 256, 5, 1 ); } },
+ { "n1000b4096", [] { generic_test(1000, 65536, 4096, 50, 1 ); } },
+};
diff --git a/06-matrix_transpose/cpp/test_main.cpp b/06-matrix_transpose/cpp/test_main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f4aff0785f636b7fd0ea1a15aa69dafe06f290f
--- /dev/null
+++ b/06-matrix_transpose/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+ cerr << "Test error: " << message << endl;
+ exit(1);
+}
+
+int main(int argc, char* argv[]) {
+ vector<string> required_tests;
+
+ if (argc > 1) {
+ required_tests.assign(argv + 1, argv + argc);
+ } else {
+ for (const auto& test : tests)
+ required_tests.push_back(test.first);
+ }
+
+ for (const auto& required_test : required_tests) {
+ bool found = false;
+ for (const auto& test : tests)
+ if (required_test == test.first) {
+ cerr << "Running test " << required_test << endl;
+ test.second();
+ found = true;
+ break;
+ }
+ if (!found) {
+ cerr << "Unknown test " << required_test << endl;
+ return 1;
+ }
+ }
+
+ return 0;
+}
diff --git a/06-matrix_transpose/python/matrix_tests.py b/06-matrix_transpose/python/matrix_tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..f002701a4e68f65cbde033b2058a79d71aad61ba
--- /dev/null
+++ b/06-matrix_transpose/python/matrix_tests.py
@@ -0,0 +1,165 @@
+from matrix_transpose import Matrix
+import numpy
+
+# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
+# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
+# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
+# is not in the LRU.
+B_CACHED = 0
+B_LRU_NEXT = 1
+B_LRU_PREV = 2
+
+class CachedMatrix(Matrix):
+ """A matrix stored in simulated cache"""
+
+ def __init__(self, N, M, B, debug_level=0):
+ assert N>0, "CachedMatrix must be non-empty."
+ assert B>0, "Blocks must be non-empty."
+ assert M%B == 0, "Cache size must be divisible by block size."
+ assert M >= 2*B, "Cache must have at least 2 blocks."
+
+ Matrix.__init__(self, N)
+
+ NN = N*N
+ self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
+
+ self.B = B
+ self.M = M
+ self.debug_level = debug_level
+ self.mem_blocks = (NN+B-1) // B
+ self.cache_blocks = M//B
+ self.cache_used = 0
+
+ # Initialize the LRU list. There is a virtual block right after the last real block,
+ # which serves as a head of the cyclic LRU list.
+ self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
+ self.lru_head = self.mem_blocks
+ self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
+ self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
+
+ self.reset_stats()
+
+ if debug_level > 0:
+ print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
+
+ def _pos(self, i, j):
+ """Convert position in matrix to linear address in simulated memory."""
+
+ return i*self.N + j
+
+ def read(self, i, j):
+ """Read value at position (i,j), used only in testing code."""
+
+ assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
+ addr = self._pos(i, j)
+ self._access(addr)
+ return self.items[addr]
+
+ def write(self, i, j, value):
+ """Write value at position (i,j), used only in testing code."""
+
+ assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
+ addr = self._pos(i, j)
+ self._access(addr)
+ self.items[addr] = value
+
+ def swap(self, i1, j1, i2, j2):
+ """Swap items (i1,j1) and (i2,j2)."""
+
+ assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
+ i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
+ "Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
+ if self.debug_level > 1:
+ print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
+ addr1 = self._pos(i1, j1)
+ addr2 = self._pos(i2, j2)
+ self._access(addr1)
+ self._access(addr2)
+ items = self.items
+ items[addr1], items[addr2] = items[addr2], items[addr1]
+
+ def reset_stats(self):
+ """Reset statistic counters."""
+
+ self.stat_cache_misses = 0
+ self.stat_accesses = 0
+
+ def _access(self, addr):
+ """Bring the given address to the cache."""
+
+ blocks = self.blocks
+ i = addr // self.B # Which block to bring
+ if blocks[i, B_CACHED] > 0:
+ self._lru_remove(i)
+ else:
+ if self.cache_used < self.cache_blocks:
+ # We still have room in the cache.
+ self.cache_used += 1
+ if self.debug_level > 1:
+ print("\t\tLoading block {}".format(i))
+ else:
+ # We need to evict the least-recently used block to make space.
+ replace = blocks[self.lru_head, B_LRU_PREV]
+ self._lru_remove(replace)
+ assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
+ blocks[replace, B_CACHED] = 0
+ if self.debug_level > 1:
+ print("\t\tLoading block {}, replacing {}".format(i, replace))
+ blocks[i, B_CACHED] = 1
+ self.stat_cache_misses += 1
+ self._lru_add_after(i, self.lru_head)
+ self.stat_accesses += 1
+
+ def _lru_remove(self, i):
+ """Remove block from the LRU list."""
+
+ blocks = self.blocks
+ prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
+ blocks[prev, B_LRU_NEXT] = next
+ blocks[next, B_LRU_PREV] = prev
+
+ def _lru_add_after(self, i, after):
+ """Add block at the given position in the LRU list."""
+
+ blocks = self.blocks
+ next = blocks[after, B_LRU_NEXT]
+ blocks[after, B_LRU_NEXT] = i
+ blocks[next, B_LRU_PREV] = i
+ blocks[i, B_LRU_NEXT] = next
+ blocks[i, B_LRU_PREV] = after
+
+class TestMatrix(CachedMatrix):
+ """A cached matrix extended by methods for testing."""
+
+ # Constructor is inherited
+
+ def fill_matrix(self):
+ """Fill matrix with a testing pattern."""
+
+ if self.debug_level > 1:
+ print("\tInitializing")
+ N = self.N
+ for i in range(N):
+ for j in range(N):
+ self.write(i, j, i*N + j)
+
+ def check_result(self):
+ """Check that the pattern corresponds to the properly transposed matrix."""
+
+ if self.debug_level > 1:
+ print("\tChecking")
+ N = self.N
+ for i in range(N):
+ for j in range(N):
+ want = j*N + i
+ have = self.read(i, j)
+ have_i = have // N
+ have_j = have % N
+ assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
+
+ def naive_transpose(self):
+ """Transpose the matrix naively."""
+
+ for i in range(self.N):
+ for j in range(i):
+ self.swap(i, j, j, i)
diff --git a/06-matrix_transpose/python/matrix_transpose.py b/06-matrix_transpose/python/matrix_transpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..77570f47c7ca42fc2c6d7367af5860a27ade5c1e
--- /dev/null
+++ b/06-matrix_transpose/python/matrix_transpose.py
@@ -0,0 +1,24 @@
+class Matrix:
+ """Interface of a matrix.
+
+ This class provides only the matrix size N and a method for swapping
+ two items. The actual storage of the matrix in memory is provided by
+ subclasses in testing code.
+ """
+
+ def __init__(self, N):
+ self.N = N
+
+ def swap(self, i1, j1, i2, j2):
+ """Swap elements (i1,j1) and (i2,j2)."""
+
+ # Overridden in subclasses
+ raise NotImplementedError
+
+ def transpose(self):
+ """Transpose the matrix."""
+
+ # TODO: Implement more efficiently
+ for i in range(self.N):
+ for j in range(i):
+ self.swap(i, j, j, i)
diff --git a/06-matrix_transpose/python/matrix_transpose_test.py b/06-matrix_transpose/python/matrix_transpose_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba5c0eccd00e8ece0e1868a6c758739ea2de0c9c
--- /dev/null
+++ b/06-matrix_transpose/python/matrix_transpose_test.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+import math
+import sys
+
+from matrix_tests import TestMatrix
+
+def generic_test(N, M, B, max_ratio, debug_level):
+ m = TestMatrix(N, M, B, debug_level)
+ m.fill_matrix()
+ m.reset_stats()
+ m.transpose()
+
+ print("\t{} misses in {} accesses".format(m.stat_cache_misses, m.stat_accesses))
+ if m.stat_accesses:
+ mpa = m.stat_cache_misses / m.stat_accesses
+ lb = 1 / B
+ ratio = mpa / lb
+ print("\t{:.6f} misses/access (lower bound is {:.6f} => ratio {:.6f}, need {:.6f})".format(mpa, lb, ratio, max_ratio))
+ assert ratio <= max_ratio, "Algorithm did too many I/O operations."
+
+ m.check_result()
+
+# A list of all tests
+tests = [
+ # name N M B max_ratio debug_level
+ ("small2k", lambda: generic_test( 8, 32, 8, 8, 2 )),
+ ("small", lambda: generic_test( 13, 64, 8, 4, 2 )),
+ ("n100b16", lambda: generic_test( 100, 1024, 16, 3, 1 )),
+ ("n1000b16", lambda: generic_test(1000, 1024, 16, 3, 1 )),
+ ("n1000b64", lambda: generic_test(1000, 8192, 64, 3, 1 )),
+ ("n1000b256", lambda: generic_test(1000, 65536, 256, 5, 1 )),
+ ("n1000b4096", lambda: generic_test(1000, 65536, 4096, 50, 1 )),
+]
+
+if __name__ == "__main__":
+ for required_test in sys.argv[1:] or [name for name, _ in tests]:
+ for name, test in tests:
+ if name == required_test:
+ print("Running test {}".format(name), file=sys.stderr)
+ test()
+ break
+ else:
+ raise ValueError("Unknown test {}".format(name))
diff --git a/06-matrix_transpose/task.md b/06-matrix_transpose/task.md
new file mode 100644
index 0000000000000000000000000000000000000000..55f1d577ec29ae59e31fadd02037ab3bada500a1
--- /dev/null
+++ b/06-matrix_transpose/task.md
@@ -0,0 +1,6 @@
+Implement cache-oblivious transposition of square matrices.
+
+The Python version requires the NumPy module for storing the matrix
+in memory efficiently.
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).