From 7a2c9086dbf013ad88285b5ec6ddb0a61db6d826 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ondra=20Mi=C4=8Dka=20=40=20miles-teg?=
 <mitch.ondra@gmail.com>
Date: Mon, 13 Apr 2020 14:31:15 +0200
Subject: [PATCH] Matrix transposition experiments

---
 08-matrix_experiment/cpp/Makefile             |  26 +++
 .../cpp/matrix_experiment_real.cpp            |  90 ++++++++
 .../cpp/matrix_experiment_sim.cpp             |  80 +++++++
 08-matrix_experiment/cpp/matrix_tests.h       | 213 ++++++++++++++++++
 08-matrix_experiment/python/Makefile          |  17 ++
 .../python/matrix_experiment_sim.py           |  42 ++++
 08-matrix_experiment/python/matrix_tests.py   | 165 ++++++++++++++
 08-matrix_experiment/task.md                  |  87 +++++++
 8 files changed, 720 insertions(+)
 create mode 100644 08-matrix_experiment/cpp/Makefile
 create mode 100644 08-matrix_experiment/cpp/matrix_experiment_real.cpp
 create mode 100644 08-matrix_experiment/cpp/matrix_experiment_sim.cpp
 create mode 100644 08-matrix_experiment/cpp/matrix_tests.h
 create mode 100644 08-matrix_experiment/python/Makefile
 create mode 100755 08-matrix_experiment/python/matrix_experiment_sim.py
 create mode 100644 08-matrix_experiment/python/matrix_tests.py
 create mode 100644 08-matrix_experiment/task.md

diff --git a/08-matrix_experiment/cpp/Makefile b/08-matrix_experiment/cpp/Makefile
new file mode 100644
index 0000000..ab0539e
--- /dev/null
+++ b/08-matrix_experiment/cpp/Makefile
@@ -0,0 +1,26 @@
+.PHONY: test
+test: matrix_experiment_sim matrix_experiment_real
+	@rm -rf out && mkdir out
+	@for exp in m1024-b16 m8192-b64 m65536-b256 m65536-b4096 ; do \
+		for impl in smart naive ; do \
+			echo "t-sim-$$exp-$$impl" ; \
+			./matrix_experiment_sim $$exp $$impl >out/t-sim-$$exp-$$impl ; \
+		done ; \
+	done
+	@for impl in smart naive ; do \
+		echo "t-real-$$impl" ; \
+		./matrix_experiment_real $$impl >out/t-real-$$impl ; \
+	done
+
+CXXFLAGS=-std=c++11 -O3 -Wall -Wextra -g -Wno-sign-compare
+
+matrix_experiment_sim: matrix_transpose.h matrix_tests.h matrix_experiment_sim.cpp
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_sim.cpp -o $@
+
+matrix_experiment_real: matrix_transpose.h matrix_tests.h matrix_experiment_real.cpp
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_real.cpp -o $@
+
+.PHONY: clean
+clean:
+	rm -f matrix_experiment_sim matrix_experiment_real
+	rm -rf out
diff --git a/08-matrix_experiment/cpp/matrix_experiment_real.cpp b/08-matrix_experiment/cpp/matrix_experiment_real.cpp
new file mode 100644
index 0000000..3d2a28a
--- /dev/null
+++ b/08-matrix_experiment/cpp/matrix_experiment_real.cpp
@@ -0,0 +1,90 @@
+#include <functional>
+#include <string>
+#include <vector>
+#include <cstdio>
+#include <cmath>
+#include <iostream>
+
+#include <time.h>   // FIXME
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+class Matrix {
+    vector<unsigned> items;
+    unsigned &item(unsigned i, unsigned j) { return items[i*N + j]; }
+  public:
+    unsigned N;
+    Matrix(unsigned N) { this->N = N; items.resize(N*N, 0); }
+
+    void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
+    {
+        // EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
+        std::swap(item(i1, j1), item(i2, j2));
+    }
+
+    void naive_transpose()
+    {
+        for (unsigned i=0; i<N; i++)
+            for (unsigned j=0; j<i; j++)
+                swap(i, j, j, i);
+    }
+
+#include "matrix_transpose.h"
+};
+
+void real_test(bool naive)
+{
+    for (int e=40; e<=112; e++) {
+        unsigned N = (unsigned) pow(2, e/8.);
+        Matrix m(N);
+
+        clock_t start_time, stop_time;
+        unsigned tries = 1;
+        do {
+            start_time = clock();
+            for (unsigned t=0; t < tries; t++) {
+                if (naive)
+                    m.naive_transpose();
+                else
+                    m.transpose();
+            }
+            stop_time = clock();
+            tries *= 2;
+        } while (stop_time - start_time < CLOCKS_PER_SEC/10);
+        // It is guaranteed that the total number of tries is odd :)
+
+        double ns_per_item = (double)(stop_time - start_time) / CLOCKS_PER_SEC / (N*(N-1)) / tries * 1e9;
+        printf("%d\t%.6f\n", N, ns_per_item);
+    }
+}
+
+int main(int argc, char **argv)
+{
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s (smart|naive)\n", argv[0]);
+        return 1;
+    }
+
+    std::string mode = argv[1];
+
+    bool naive;
+    if (mode == "smart")
+      naive = false;
+    else if (mode == "naive")
+      naive = true;
+    else {
+        fprintf(stderr, "The argument must be either 'smart' or 'naive'\n");
+        return 1;
+    }
+
+    real_test(naive);
+    return 0;
+}
diff --git a/08-matrix_experiment/cpp/matrix_experiment_sim.cpp b/08-matrix_experiment/cpp/matrix_experiment_sim.cpp
new file mode 100644
index 0000000..5eca7ab
--- /dev/null
+++ b/08-matrix_experiment/cpp/matrix_experiment_sim.cpp
@@ -0,0 +1,80 @@
+#include <functional>
+#include <string>
+#include <vector>
+#include <cstdio>
+#include <cmath>
+#include <string>
+#include <iostream>
+
+#include <time.h>   // FIXME
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+#include "matrix_tests.h"
+
+void simulated_test(unsigned M, unsigned B, bool naive)
+{
+    for (int e=20; e<=52; e++) {
+        unsigned N = (unsigned) pow(2, e/4.);
+        TestMatrix m(N, M, B, 0);
+        m.fill_matrix();
+        m.reset_stats();
+        if (naive)
+            m.naive_transpose();
+        else
+            m.transpose();
+
+        double misses_per_item = (double) m.stat_cache_misses / (N*(N-1));
+        printf("%d\t%.6f\n", N, misses_per_item);
+
+        m.check_result();
+    }
+}
+
+vector<pair<string, function<void(bool n)>>> tests = {
+//                                                    M     B
+    { "m1024-b16",    [](bool n) { simulated_test( 1024,   16, n); } },
+    { "m8192-b64",    [](bool n) { simulated_test( 8192,   64, n); } },
+    { "m65536-b256",  [](bool n) { simulated_test(65536,  256, n); } },
+    { "m65536-b4096", [](bool n) { simulated_test(65536, 4096, n); } },
+};
+
+int main(int argc, char **argv)
+{
+    if (argc != 3) {
+        fprintf(stderr, "Usage: %s <test> (smart|naive)\n", argv[0]);
+        return 1;
+    }
+
+    std::string which_test = argv[1];
+    std::string mode = argv[2];
+
+    bool naive;
+    if (mode == "smart")
+      naive = false;
+    else if (mode == "naive")
+      naive = true;
+    else
+      {
+        fprintf(stderr, "Last argument must be either 'smart' or 'naive'\n");
+        return 1;
+      }
+
+    for (const auto& test : tests) {
+        if (test.first == which_test) {
+            test.second(naive);
+            return 0;
+        }
+    }
+
+    fprintf(stderr, "Unknown test %s\n", which_test.c_str());
+    return 1;
+}
diff --git a/08-matrix_experiment/cpp/matrix_tests.h b/08-matrix_experiment/cpp/matrix_tests.h
new file mode 100644
index 0000000..6110f8e
--- /dev/null
+++ b/08-matrix_experiment/cpp/matrix_tests.h
@@ -0,0 +1,213 @@
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+/* A matrix stored in a simulated cache */
+
+class CachedMatrix {
+    unsigned B;             // Block size
+    unsigned mem_blocks;    // Memory size in blocks
+    unsigned cache_blocks;  // Cache size in blocks
+    unsigned cache_used;    // How many blocks of cache we already used
+
+    // We store the matrix as a one-dimensional array
+    vector<unsigned> items;
+    unsigned pos(unsigned i, unsigned j) { return i*N + j; }
+
+    /*
+     *  For each memory block, we keep the following structure.
+     *  If the block is currently cached, we set cached == true
+     *  and lru_prev/next point to neighboring blocks in the cyclic LRU list.
+     *  Otherwise, cached == false and the block is not in the LRU.
+     */
+
+    class MemoryBlock {
+      public:
+        unsigned lru_prev, lru_next;
+        bool cached;
+        MemoryBlock()
+        {
+            lru_prev = lru_next = 0;
+            cached = false;
+        }
+    };
+
+    vector<MemoryBlock> blocks;
+
+    // One block at the end of "blocks" serves as a head of the LRU list.
+    unsigned lru_head;
+
+  public:
+    // Number of rows and columns of the matrix
+    unsigned N;
+
+    int debug_level;        // Verbosity
+
+    CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
+    {
+        EXPECT(N > 0, "CachedMatrix must be non-empty.");
+        EXPECT(B > 0, "Blocks must be non-empty.");
+        EXPECT(!(M % B), "Cache size must be divisible by block size.");
+        EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
+
+        unsigned NN = N*N;
+        items.resize(NN, 0);
+
+        this->N = N;
+        this->B = B;
+        this->debug_level = debug_level;
+        mem_blocks = (NN+B-1) / B;
+        cache_blocks = M / B;
+        cache_used = 0;
+
+        // Initialize the LRU list
+        blocks.resize(mem_blocks + 1);
+        lru_head = mem_blocks;
+        blocks[lru_head].lru_prev = lru_head;
+        blocks[lru_head].lru_next = lru_head;
+
+        if (debug_level > 0)
+            cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
+    }
+
+    // Read value at position (i,j), used only in testing code
+    unsigned read(unsigned i, unsigned j)
+    {
+        EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
+        unsigned addr = pos(i, j);
+        access(addr);
+        return items[addr];
+    }
+
+    // Write value at position (i,j), used only in testing code
+    void write(unsigned i, unsigned j, unsigned data)
+    {
+        EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
+        unsigned addr = pos(i, j);
+        access(addr);
+        items[addr] = data;
+    }
+
+    // Swap items (i1,j1) and (i2,j2)
+    void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
+    {
+        EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
+        if (debug_level > 1)
+            cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
+        unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
+        access(addr1);
+        access(addr2);
+        std::swap(items[addr1], items[addr2]);
+    }
+
+    unsigned stat_cache_misses;
+    unsigned stat_accesses;
+
+    // Reset statistic counters.
+    void reset_stats()
+    {
+        stat_cache_misses = 0;
+        stat_accesses = 0;
+    }
+
+    static string coord_string(unsigned i, unsigned j)
+    {
+        return "(" + to_string(i) + "," + to_string(j) + ")";
+    }
+
+#include "matrix_transpose.h"
+
+  private:
+    // Bring the given address to the cache.
+    void access(unsigned addr)
+    {
+        int i = addr / B;           // Which block to bring
+        if (blocks[i].cached) {
+            lru_remove(i);
+        } else {
+            if (cache_used < cache_blocks) {
+                // We still have room in the cache.
+                cache_used++;
+                if (debug_level > 1)
+                  cout << "\t\tLoading block " << i << endl;
+            } else {
+                // We need to evict the least-recently used block to make space.
+                unsigned replace = blocks[lru_head].lru_prev;
+                lru_remove(replace);
+                EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
+                blocks[replace].cached = false;
+                if (debug_level > 1)
+                  cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
+            }
+            blocks[i].cached = true;
+            stat_cache_misses++;
+        }
+        lru_add_after(i, lru_head);
+        stat_accesses++;
+    }
+
+    // Remove block from the LRU list.
+    void lru_remove(unsigned i)
+    {
+        unsigned prev = blocks[i].lru_prev;
+        unsigned next = blocks[i].lru_next;
+        blocks[prev].lru_next = next;
+        blocks[next].lru_prev = prev;
+    }
+
+    // Add block at the given position in the LRU list.
+    void lru_add_after(unsigned i, unsigned after)
+    {
+        unsigned next = blocks[after].lru_next;
+        blocks[next].lru_prev = i;
+        blocks[after].lru_next = i;
+        blocks[i].lru_next = next;
+        blocks[i].lru_prev = after;
+    }
+};
+
+/* A cached matrix extended by methods for testing */
+
+class TestMatrix : public CachedMatrix {
+  public:
+    TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
+
+    // Fill matrix with a testing pattern.
+    void fill_matrix()
+    {
+        if (debug_level > 1)
+            cout << "\tInitializing\n";
+        for (unsigned i = 0; i < N; i++)
+            for (unsigned j = 0; j < N; j++)
+                write(i, j, i*N + j);
+    }
+
+    // Check that the pattern corresponds to the properly transposed matrix.
+    void check_result()
+    {
+        if (debug_level > 1)
+            cout << "\tChecking\n";
+        for (unsigned i = 0; i < N; i++) {
+            for (unsigned j = 0; j < N; j++) {
+                unsigned want = j*N + i;
+                unsigned found = read(i, j);
+                unsigned found_i = found / N;
+                unsigned found_j = found % N;
+                EXPECT(found == want,
+                       "Mismatch at position " + coord_string(i, j) +
+                       ": expected element from " + coord_string(j, i) +
+                       ", found element from " + coord_string(found_i, found_j) +
+                       ".");
+            }
+        }
+    }
+
+    // Transpose the matrix naively.
+    void naive_transpose()
+    {
+        for (unsigned i=0; i<N; i++)
+            for (unsigned j=0; j<i; j++)
+                swap(i, j, j, i);
+    }
+};
diff --git a/08-matrix_experiment/python/Makefile b/08-matrix_experiment/python/Makefile
new file mode 100644
index 0000000..6ae85a9
--- /dev/null
+++ b/08-matrix_experiment/python/Makefile
@@ -0,0 +1,17 @@
+TESTS=m1024-b16 m8192-b64 m65536-b256 m65536-b4096
+TESTFILES=$(addprefix out/t-sim-,$(TESTS))
+
+.PHONY: test
+test: $(addsuffix -smart,$(TESTFILES)) $(addsuffix -naive,$(TESTFILES))
+
+out/t-sim-%-naive:
+	@mkdir -p out
+	./matrix_experiment_sim.py $* naive >$@
+
+out/t-sim-%-smart:
+	@mkdir -p out
+	./matrix_experiment_sim.py $* smart >$@
+
+.PHONY: clean
+clean:
+	rm -rf out
diff --git a/08-matrix_experiment/python/matrix_experiment_sim.py b/08-matrix_experiment/python/matrix_experiment_sim.py
new file mode 100755
index 0000000..5f326f1
--- /dev/null
+++ b/08-matrix_experiment/python/matrix_experiment_sim.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+import sys
+
+from matrix_tests import TestMatrix
+
+def simulated_test(M, B, naive):
+    for e in range(10, 25):
+        N = int(2 ** (e/2))
+        print("    ", N, M, B, file=sys.stderr)
+        m = TestMatrix(N, M, B, 0)
+        m.fill_matrix()
+        m.reset_stats()
+        if naive:
+            m.naive_transpose()
+        else:
+            m.transpose()
+        misses_per_item = m.stat_cache_misses / (N*(N-1))
+        print(N, misses_per_item, flush=True)
+        m.check_result()
+
+tests = {
+#                                                M     B
+    "m1024-b16":    lambda n: simulated_test( 1024,   16, n),
+    "m8192-b64":    lambda n: simulated_test( 8192,   64, n),
+    "m65536-b256":  lambda n: simulated_test(65536,  256, n),
+    "m65536-b4096": lambda n: simulated_test(65536, 4096, n),
+}
+
+if len(sys.argv) == 3:
+    test = sys.argv[1]
+    if sys.argv[2] == "smart":
+        naive = False
+    elif sys.argv[2] == "naive":
+        naive = True
+    else:
+        raise ValueError("Last argument must be either 'smart' or 'naive'")
+    if test in tests:
+        tests[test](naive)
+    else:
+        raise ValueError("Unknown test {}".format(test))
+else:
+    raise ValueError("Usage: {} <test> (smart|naive)".format(sys.argv[0]))
diff --git a/08-matrix_experiment/python/matrix_tests.py b/08-matrix_experiment/python/matrix_tests.py
new file mode 100644
index 0000000..f002701
--- /dev/null
+++ b/08-matrix_experiment/python/matrix_tests.py
@@ -0,0 +1,165 @@
+from matrix_transpose import Matrix
+import numpy
+
+# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
+# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
+# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
+# is not in the LRU.
+B_CACHED = 0
+B_LRU_NEXT = 1
+B_LRU_PREV = 2
+
+class CachedMatrix(Matrix):
+    """A matrix stored in simulated cache"""
+
+    def __init__(self, N, M, B, debug_level=0):
+        assert N>0, "CachedMatrix must be non-empty."
+        assert B>0, "Blocks must be non-empty."
+        assert M%B == 0, "Cache size must be divisible by block size."
+        assert M >= 2*B, "Cache must have at least 2 blocks."
+
+        Matrix.__init__(self, N)
+
+        NN = N*N
+        self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
+
+        self.B = B
+        self.M = M
+        self.debug_level = debug_level
+        self.mem_blocks = (NN+B-1) // B
+        self.cache_blocks = M//B
+        self.cache_used = 0
+
+        # Initialize the LRU list. There is a virtual block right after the last real block,
+        # which serves as a head of the cyclic LRU list.
+        self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
+        self.lru_head = self.mem_blocks
+        self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
+        self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
+
+        self.reset_stats()
+
+        if debug_level > 0:
+            print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
+
+    def _pos(self, i, j):
+        """Convert position in matrix to linear address in simulated memory."""
+
+        return i*self.N + j
+
+    def read(self, i, j):
+        """Read value at position (i,j), used only in testing code."""
+
+        assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
+        addr = self._pos(i, j)
+        self._access(addr)
+        return self.items[addr]
+
+    def write(self, i, j, value):
+        """Write value at position (i,j), used only in testing code."""
+
+        assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
+        addr = self._pos(i, j)
+        self._access(addr)
+        self.items[addr] = value
+
+    def swap(self, i1, j1, i2, j2):
+        """Swap items (i1,j1) and (i2,j2)."""
+
+        assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
+               i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
+               "Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
+        if self.debug_level > 1:
+            print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
+        addr1 = self._pos(i1, j1)
+        addr2 = self._pos(i2, j2)
+        self._access(addr1)
+        self._access(addr2)
+        items = self.items
+        items[addr1], items[addr2] = items[addr2], items[addr1]
+
+    def reset_stats(self):
+        """Reset statistic counters."""
+
+        self.stat_cache_misses = 0
+        self.stat_accesses = 0
+
+    def _access(self, addr):
+        """Bring the given address to the cache."""
+
+        blocks = self.blocks
+        i = addr // self.B      # Which block to bring
+        if blocks[i, B_CACHED] > 0:
+            self._lru_remove(i)
+        else:
+            if self.cache_used < self.cache_blocks:
+                # We still have room in the cache.
+                self.cache_used += 1
+                if self.debug_level > 1:
+                    print("\t\tLoading block {}".format(i))
+            else:
+                # We need to evict the least-recently used block to make space.
+                replace = blocks[self.lru_head, B_LRU_PREV]
+                self._lru_remove(replace)
+                assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
+                blocks[replace, B_CACHED] = 0
+                if self.debug_level > 1:
+                    print("\t\tLoading block {}, replacing {}".format(i, replace))
+            blocks[i, B_CACHED] = 1
+            self.stat_cache_misses += 1
+        self._lru_add_after(i, self.lru_head)
+        self.stat_accesses += 1
+
+    def _lru_remove(self, i):
+        """Remove block from the LRU list."""
+
+        blocks = self.blocks
+        prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
+        blocks[prev, B_LRU_NEXT] = next
+        blocks[next, B_LRU_PREV] = prev
+
+    def _lru_add_after(self, i, after):
+        """Add block at the given position in the LRU list."""
+
+        blocks = self.blocks
+        next = blocks[after, B_LRU_NEXT]
+        blocks[after, B_LRU_NEXT] = i
+        blocks[next, B_LRU_PREV] = i
+        blocks[i, B_LRU_NEXT] = next
+        blocks[i, B_LRU_PREV] = after
+
+class TestMatrix(CachedMatrix):
+    """A cached matrix extended by methods for testing."""
+
+    # Constructor is inherited
+
+    def fill_matrix(self):
+        """Fill matrix with a testing pattern."""
+
+        if self.debug_level > 1:
+            print("\tInitializing")
+        N = self.N
+        for i in range(N):
+            for j in range(N):
+                self.write(i, j, i*N + j)
+
+    def check_result(self):
+        """Check that the pattern corresponds to the properly transposed matrix."""
+
+        if self.debug_level > 1:
+            print("\tChecking")
+        N = self.N
+        for i in range(N):
+            for j in range(N):
+                want = j*N + i
+                have = self.read(i, j)
+                have_i = have // N
+                have_j = have % N
+                assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
+
+    def naive_transpose(self):
+        """Transpose the matrix naively."""
+
+        for i in range(self.N):
+            for j in range(i):
+                self.swap(i, j, j, i)
diff --git a/08-matrix_experiment/task.md b/08-matrix_experiment/task.md
new file mode 100644
index 0000000..4e36fb5
--- /dev/null
+++ b/08-matrix_experiment/task.md
@@ -0,0 +1,87 @@
+## Goal
+
+The goal of this assignment is to evaluate your implementation of cache-oblivious
+matrix transposition experimentally and to compare it with the trivial algorithm
+which transposes by definition.
+
+You are given a test program (`matrix_experiment_sim`) which evaluates your
+implementation from the previous assignment on different simulated caches and
+matrices of different sizes. For each experiment, the average number of cache
+misses per item is reported (the diagonal items which do not move are not
+counted). The program also evaluates performance of the trivial transposition algorithm.
+
+You should run these experiments and write a report, which contains one plot of
+the measured data for each cache type, showing dependency of the average number of
+misses on the matrix size. There should be two curves in each plot: one for your
+algorithm, another for the trivial one.
+
+The report should discuss the experimental results and try to explain the observed
+behavior (including any strange anomalies) using theory from the lectures.
+If you want, you can carry out further experiments to gain better understanding
+of the algorithm and include these in the report.
+
+You should submit a PDF file with the report (and no source code).
+You will get 1 temporary point upon submission if the file is syntactically correct;
+proper points will be assigned later.
+
+## Test program
+
+The test program is given two arguments:
+- Cache type:
+    - `m1024b16` – cache of 1024 items organized in 16-item blocks
+    - `m8192b64` – cache of 8192 items organized in 64-item blocks
+    - `m65536b256` – cache of 65536 items organized on 256-item blocks
+    - `m65536b4096` – cache of 65536 items organized in 4096-item blocks
+- The implementation to test (`smart` or `naive`).
+
+The output of the program contains one line per experiment, which consists of
+the matrix size and the average number of cache misses.
+
+*Warning:* The Python tests are slow, even though they use only a subset of the
+matrix sizes. They can take about one hour to complete.
+If your machine has multiple processors or cores, you can try `make -j`
+to run the tests in parallel.
+
+## Optional: Tests on real hardware (for 5 extra points)
+
+You can also test your transposition algorithm on real hardware
+using the `matrix_experiment_real` program. The matrix is stored in row-major
+order, each item takes 4 bytes.
+
+The program takes one parameter, the implementation to test: `smart` or `naive`.
+Its output contains one line per experiment, which consists of the matrix size
+and the average time per item in nanoseconds.
+
+However, the program is available only for C++, because general slowness of
+Python completely hides all cache effects.
+
+Again, your report should show a plot of the measured data and discuss the observed
+effects. You should also include the configuration of caches in your machine.
+(If you are using Linux, you can try the `machinfo` script from
+[this repository](https://gitlab.kam.mff.cuni.cz/mj/aim.git).)
+
+## Hints
+
+The following tools can be useful for producing nice plots:
+- [pandas](https://pandas.pydata.org/)
+- [matplotlib](https://matplotlib.org/)
+- [gnuplot](http://www.gnuplot.info/)
+
+A quick checklist for plots:
+- Is there a caption explaining what is plotted?
+- Are the axes clearly labelled? Do they have value ranges and units?
+- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs
+  are more fitting in some cases, but you should tell.)
+- Is it clear which curve means what?
+- Is it clear what are the measured points and what is an interpolated
+  curve between them?
+- Are there any overlaps? (E.g., the most interesting part of the curve
+  hidden underneath a label?)
+- **Is the graph distorted by compression artifacts?** (No, you shouldn't use JPEG for plots!)
+
+In your discussion, please distinguish the following kinds of claims.
+It should be always clear which is which:
+- Experimental results (i.e., the raw data you obtained from the experiments)
+- Theoretical facts (i.e., claims we have proved mathematically)
+- Your hypotheses (e.g., when you claim that the graph looks like something is true,
+  but you are not able to prove rigorously that it always holds)
-- 
GitLab