matrix transpose

faf92e46 · David Mareček · 363f118d · faf92e46 · faf92e46 · faf92e46
Commit faf92e46 authored 1 year ago by David Mareček
--- a/06-matrix_transpose/cpp/Makefile
+++ b/06-matrix_transpose/cpp/Makefile
+test: matrix_transpose_test
+	./$<
+
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare
+
+matrix_transpose_test: matrix_transpose_test.cpp matrix_transpose.h matrix_tests.h test_main.cpp
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@
+
+clean:
+	rm -f matrix_transpose_test
+
+.PHONY: clean test
--- a/06-matrix_transpose/cpp/matrix_tests.h
+++ b/06-matrix_transpose/cpp/matrix_tests.h
+#include <string>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+
+/* A matrix stored in a simulated cache */
+
+class CachedMatrix {
+    unsigned B;             // Block size
+    unsigned mem_blocks;    // Memory size in blocks
+    unsigned cache_blocks;  // Cache size in blocks
+    unsigned cache_used;    // How many blocks of cache we already used
+
+    // We store the matrix as a one-dimensional array
+    vector<unsigned> items;
+    unsigned pos(unsigned i, unsigned j) { return i*N + j; }
+
+    /*
+     *  For each memory block, we keep the following structure.
+     *  If the block is currently cached, we set cached == true
+     *  and lru_prev/next point to neighboring blocks in the cyclic LRU list.
+     *  Otherwise, cached == false and the block is not in the LRU.
+     */
+
+    class MemoryBlock {
+      public:
+        unsigned lru_prev, lru_next;
+        bool cached;
+        MemoryBlock()
+        {
+            lru_prev = lru_next = 0;
+            cached = false;
+        }
+    };
+
+    vector<MemoryBlock> blocks;
+
+    // One block at the end of "blocks" serves as a head of the LRU list.
+    unsigned lru_head;
+
+  public:
+    // Number of rows and columns of the matrix
+    unsigned N;
+
+    int debug_level;        // Verbosity
+
+    CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
+    {
+        EXPECT(N > 0, "CachedMatrix must be non-empty.");
+        EXPECT(B > 0, "Blocks must be non-empty.");
+        EXPECT(!(M % B), "Cache size must be divisible by block size.");
+        EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
+
+        unsigned NN = N*N;
+        items.resize(NN, 0);
+
+        this->N = N;
+        this->B = B;
+        this->debug_level = debug_level;
+        mem_blocks = (NN+B-1) / B;
+        cache_blocks = M / B;
+        cache_used = 0;
+
+        // Initialize the LRU list
+        blocks.resize(mem_blocks + 1);
+        lru_head = mem_blocks;
+        blocks[lru_head].lru_prev = lru_head;
+        blocks[lru_head].lru_next = lru_head;
+
+        if (debug_level > 0)
+            cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
+    }
+
+    // Read value at position (i,j), used only in testing code
+    unsigned read(unsigned i, unsigned j)
+    {
+        EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
+        unsigned addr = pos(i, j);
+        access(addr);
+        return items[addr];
+    }
+
+    // Write value at position (i,j), used only in testing code
+    void write(unsigned i, unsigned j, unsigned data)
+    {
+        EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
+        unsigned addr = pos(i, j);
+        access(addr);
+        items[addr] = data;
+    }
+
+    // Swap items (i1,j1) and (i2,j2)
+    void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
+    {
+        EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
+        if (debug_level > 1)
+            cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
+        unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
+        access(addr1);
+        access(addr2);
+        std::swap(items[addr1], items[addr2]);
+    }
+
+    unsigned stat_cache_misses;
+    unsigned stat_accesses;
+
+    // Reset statistic counters.
+    void reset_stats()
+    {
+        stat_cache_misses = 0;
+        stat_accesses = 0;
+    }
+
+    static string coord_string(unsigned i, unsigned j)
+    {
+        return "(" + to_string(i) + "," + to_string(j) + ")";
+    }
+
+#include "matrix_transpose.h"
+
+  private:
+    // Bring the given address to the cache.
+    void access(unsigned addr)
+    {
+        int i = addr / B;           // Which block to bring
+        if (blocks[i].cached) {
+            lru_remove(i);
+        } else {
+            if (cache_used < cache_blocks) {
+                // We still have room in the cache.
+                cache_used++;
+                if (debug_level > 1)
+                  cout << "\t\tLoading block " << i << endl;
+            } else {
+                // We need to evict the least-recently used block to make space.
+                unsigned replace = blocks[lru_head].lru_prev;
+                lru_remove(replace);
+                EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
+                blocks[replace].cached = false;
+                if (debug_level > 1)
+                  cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
+            }
+            blocks[i].cached = true;
+            stat_cache_misses++;
+        }
+        lru_add_after(i, lru_head);
+        stat_accesses++;
+    }
+
+    // Remove block from the LRU list.
+    void lru_remove(unsigned i)
+    {
+        unsigned prev = blocks[i].lru_prev;
+        unsigned next = blocks[i].lru_next;
+        blocks[prev].lru_next = next;
+        blocks[next].lru_prev = prev;
+    }
+
+    // Add block at the given position in the LRU list.
+    void lru_add_after(unsigned i, unsigned after)
+    {
+        unsigned next = blocks[after].lru_next;
+        blocks[next].lru_prev = i;
+        blocks[after].lru_next = i;
+        blocks[i].lru_next = next;
+        blocks[i].lru_prev = after;
+    }
+};
+
+/* A cached matrix extended by methods for testing */
+
+class TestMatrix : public CachedMatrix {
+  public:
+    TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
+
+    // Fill matrix with a testing pattern.
+    void fill_matrix()
+    {
+        if (debug_level > 1)
+            cout << "\tInitializing\n";
+        for (unsigned i = 0; i < N; i++)
+            for (unsigned j = 0; j < N; j++)
+                write(i, j, i*N + j);
+    }
+
+    // Check that the pattern corresponds to the properly transposed matrix.
+    void check_result()
+    {
+        if (debug_level > 1)
+            cout << "\tChecking\n";
+        for (unsigned i = 0; i < N; i++) {
+            for (unsigned j = 0; j < N; j++) {
+                unsigned want = j*N + i;
+                unsigned found = read(i, j);
+                unsigned found_i = found / N;
+                unsigned found_j = found % N;
+                EXPECT(found == want,
+                       "Mismatch at position " + coord_string(i, j) +
+                       ": expected element from " + coord_string(j, i) +
+                       ", found element from " + coord_string(found_i, found_j) +
+                       ".");
+            }
+        }
+    }
+
+    // Transpose the matrix naively.
+    void naive_transpose()
+    {
+        for (unsigned i=0; i<N; i++)
+            for (unsigned j=0; j<i; j++)
+                swap(i, j, j, i);
+    }
+};
--- a/06-matrix_transpose/cpp/matrix_transpose.h
+++ b/06-matrix_transpose/cpp/matrix_transpose.h
+/*
+ *  This file is #include'd inside the definition of a matrix class
+ *  like this:
+ *
+ *  	class ClassName {
+ *          // Number of rows and columns of the matrix
+ *          unsigned N;
+ *
+ *          // Swap elements (i1,j1) and (i2,j2)
+ *          void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2);
+ *
+ *          // Your code
+ *          #include "matrix_transpose.h"
+ *      }
+ */
+
+void transpose()
+{
+    // TODO: Implement this efficiently
+
+    for (unsigned i=0; i<N; i++)
+        for (unsigned j=0; j<i; j++)
+            swap(i, j, j, i);
+}
--- a/06-matrix_transpose/cpp/matrix_transpose_test.cpp
+++ b/06-matrix_transpose/cpp/matrix_transpose_test.cpp
+#include <functional>
+#include <vector>
+#include <iostream>
+#include <iomanip>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+#include "matrix_tests.h"
+
+void generic_test(unsigned N, unsigned M, unsigned B, double max_ratio, int debug_level)
+{
+    TestMatrix m(N, M, B, debug_level);
+    m.fill_matrix();
+    m.reset_stats();
+    m.transpose();
+
+    cout << "\t" << m.stat_cache_misses << " misses in " << m.stat_accesses << " accesses\n";
+    if (m.stat_accesses) {
+        double swaps = N * (N-1) / 2;
+        double mpa = (double) m.stat_cache_misses / swaps;
+        double lb = 2. / B;
+        double ratio = mpa / lb;
+        cout << "\t" <<
+		std::fixed << std::setprecision(6) <<
+		mpa << " misses/swap (lower bound is " << lb <<
+		" => ratio " << ratio <<
+		", limit " << max_ratio << ")\n";
+        EXPECT(ratio <= max_ratio, "Algorithm did too many I/O operations.");
+    }
+
+    m.check_result();
+}
+
+/*** A list of all tests ***/
+
+vector<pair<string, function<void()>>> tests = {
+//    name                                N      M     B  max_ratio  debug_level
+    { "small2k",     [] { generic_test(   8,    32,    8,         8,     2 ); } },
+    { "small",       [] { generic_test(  13,    64,    8,         5,     2 ); } },
+    { "n100b16",     [] { generic_test( 100,  1024,   16,         3,     1 ); } },
+    { "n1000b16",    [] { generic_test(1000,  1024,   16,         3,     1 ); } },
+    { "n1000b64",    [] { generic_test(1000,  8192,   64,         4,     1 ); } },
+    { "n1000b256",   [] { generic_test(1000, 65536,  256,         7,     1 ); } },
+    { "n1000b4096",  [] { generic_test(1000, 65536, 4096,        60,     1 ); } },
+};
--- a/06-matrix_transpose/cpp/test_main.cpp
+++ b/06-matrix_transpose/cpp/test_main.cpp
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+int main(int argc, char* argv[]) {
+    vector<string> required_tests;
+
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
--- a/06-matrix_transpose/python/matrix_tests.py
+++ b/06-matrix_transpose/python/matrix_tests.py
+from matrix_transpose import Matrix
+import numpy
+
+# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
+# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
+# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
+# is not in the LRU.
+B_CACHED = 0
+B_LRU_NEXT = 1
+B_LRU_PREV = 2
+
+class CachedMatrix(Matrix):
+    """A matrix stored in simulated cache"""
+
+    def __init__(self, N, M, B, debug_level=0):
+        assert N>0, "CachedMatrix must be non-empty."
+        assert B>0, "Blocks must be non-empty."
+        assert M%B == 0, "Cache size must be divisible by block size."
+        assert M >= 2*B, "Cache must have at least 2 blocks."
+
+        Matrix.__init__(self, N)
+
+        NN = N*N
+        self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
+
+        self.B = B
+        self.M = M
+        self.debug_level = debug_level
+        self.mem_blocks = (NN+B-1) // B
+        self.cache_blocks = M//B
+        self.cache_used = 0
+
+        # Initialize the LRU list. There is a virtual block right after the last real block,
+        # which serves as a head of the cyclic LRU list.
+        self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
+        self.lru_head = self.mem_blocks
+        self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
+        self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
+
+        self.reset_stats()
+
+        if debug_level > 0:
+            print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
+
+    def _pos(self, i, j):
+        """Convert position in matrix to linear address in simulated memory."""
+
+        return i*self.N + j
+
+    def read(self, i, j):
+        """Read value at position (i,j), used only in testing code."""
+
+        assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
+        addr = self._pos(i, j)
+        self._access(addr)
+        return self.items[addr]
+
+    def write(self, i, j, value):
+        """Write value at position (i,j), used only in testing code."""
+
+        assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
+        addr = self._pos(i, j)
+        self._access(addr)
+        self.items[addr] = value
+
+    def swap(self, i1, j1, i2, j2):
+        """Swap items (i1,j1) and (i2,j2)."""
+
+        assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
+               i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
+               "Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
+        if self.debug_level > 1:
+            print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
+        addr1 = self._pos(i1, j1)
+        addr2 = self._pos(i2, j2)
+        self._access(addr1)
+        self._access(addr2)
+        items = self.items
+        items[addr1], items[addr2] = items[addr2], items[addr1]
+
+    def reset_stats(self):
+        """Reset statistic counters."""
+
+        self.stat_cache_misses = 0
+        self.stat_accesses = 0
+
+    def _access(self, addr):
+        """Bring the given address to the cache."""
+
+        blocks = self.blocks
+        i = addr // self.B      # Which block to bring
+        if blocks[i, B_CACHED] > 0:
+            self._lru_remove(i)
+        else:
+            if self.cache_used < self.cache_blocks:
+                # We still have room in the cache.
+                self.cache_used += 1
+                if self.debug_level > 1:
+                    print("\t\tLoading block {}".format(i))
+            else:
+                # We need to evict the least-recently used block to make space.
+                replace = blocks[self.lru_head, B_LRU_PREV]
+                self._lru_remove(replace)
+                assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
+                blocks[replace, B_CACHED] = 0
+                if self.debug_level > 1:
+                    print("\t\tLoading block {}, replacing {}".format(i, replace))
+            blocks[i, B_CACHED] = 1
+            self.stat_cache_misses += 1
+        self._lru_add_after(i, self.lru_head)
+        self.stat_accesses += 1
+
+    def _lru_remove(self, i):
+        """Remove block from the LRU list."""
+
+        blocks = self.blocks
+        prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
+        blocks[prev, B_LRU_NEXT] = next
+        blocks[next, B_LRU_PREV] = prev
+
+    def _lru_add_after(self, i, after):
+        """Add block at the given position in the LRU list."""
+
+        blocks = self.blocks
+        next = blocks[after, B_LRU_NEXT]
+        blocks[after, B_LRU_NEXT] = i
+        blocks[next, B_LRU_PREV] = i
+        blocks[i, B_LRU_NEXT] = next
+        blocks[i, B_LRU_PREV] = after
+
+class TestMatrix(CachedMatrix):
+    """A cached matrix extended by methods for testing."""
+
+    # Constructor is inherited
+
+    def fill_matrix(self):
+        """Fill matrix with a testing pattern."""
+
+        if self.debug_level > 1:
+            print("\tInitializing")
+        N = self.N
+        for i in range(N):
+            for j in range(N):
+                self.write(i, j, i*N + j)
+
+    def check_result(self):
+        """Check that the pattern corresponds to the properly transposed matrix."""
+
+        if self.debug_level > 1:
+            print("\tChecking")
+        N = self.N
+        for i in range(N):
+            for j in range(N):
+                want = j*N + i
+                have = self.read(i, j)
+                have_i = have // N
+                have_j = have % N
+                assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
+
+    def naive_transpose(self):
+        """Transpose the matrix naively."""
+
+        for i in range(self.N):
+            for j in range(i):
+                self.swap(i, j, j, i)
--- a/06-matrix_transpose/python/matrix_transpose.py
+++ b/06-matrix_transpose/python/matrix_transpose.py
+class Matrix:
+    """Interface of a matrix.
+
+    This class provides only the matrix size N and a method for swapping
+    two items. The actual storage of the matrix in memory is provided by
+    subclasses in testing code.
+    """
+
+    def __init__(self, N):
+        self.N = N
+
+    def swap(self, i1, j1, i2, j2):
+        """Swap elements (i1,j1) and (i2,j2)."""
+
+        # Overridden in subclasses
+        raise NotImplementedError
+
+    def transpose(self):
+        """Transpose the matrix."""
+
+        # TODO: Implement more efficiently
+        for i in range(self.N):
+            for j in range(i):
+                self.swap(i, j, j, i)
--- a/06-matrix_transpose/python/matrix_transpose_test.py
+++ b/06-matrix_transpose/python/matrix_transpose_test.py
+#!/usr/bin/env python3
+import math
+import sys
+
+from matrix_tests import TestMatrix
+
+def generic_test(N, M, B, max_ratio, debug_level):
+    m = TestMatrix(N, M, B, debug_level)
+    m.fill_matrix()
+    m.reset_stats()
+    m.transpose()
+
+    print("\t{} misses in {} accesses".format(m.stat_cache_misses, m.stat_accesses))
+    if m.stat_accesses:
+        swaps = N * (N-1) / 2
+        mpa = m.stat_cache_misses / swaps
+        lb = 2. / B
+        ratio = mpa / lb
+        print("\t{:.6f} misses/swap (lower bound is {:.6f} => ratio {:.6f}, limit {:.6f})".format(mpa, lb, ratio, max_ratio))
+        assert ratio <= max_ratio, "Algorithm did too many I/O operations."
+
+    m.check_result()
+
+# A list of all tests
+tests = [
+    # name                                  N      M     B  max_ratio  debug_level
+    ("small2k",     lambda: generic_test(   8,    32,    8,         8,     2 )),
+    ("small",       lambda: generic_test(  13,    64,    8,         5,     2 )),
+    ("n100b16",     lambda: generic_test( 100,  1024,   16,         3,     1 )),
+    ("n1000b16",    lambda: generic_test(1000,  1024,   16,         3,     1 )),
+    ("n1000b64",    lambda: generic_test(1000,  8192,   64,         4,     1 )),
+    ("n1000b256",   lambda: generic_test(1000, 65536,  256,         7,     1 )),
+    ("n1000b4096",  lambda: generic_test(1000, 65536, 4096,        60,     1 )),
+]
+
+if __name__ == "__main__":
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:
+            raise ValueError("Unknown test {}".format(name))
--- a/06-matrix_transpose/task.md
+++ b/06-matrix_transpose/task.md
+Implement cache-oblivious transposition of square matrices.
+
+The Python version requires the NumPy module for storing the matrix
+in memory efficiently.
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).