Skip to content
Snippets Groups Projects
Commit a36c6d6f authored by Tung Anh Vu's avatar Tung Anh Vu
Browse files

Publish matrix_experiment

parent 2316da99
No related branches found
No related tags found
No related merge requests found
.PHONY: test
test: matrix_experiment_sim matrix_experiment_real
@rm -rf out && mkdir out
@for exp in m1024-b16 m8192-b64 m65536-b256 m65536-b4096 ; do \
for impl in smart naive ; do \
echo "t-sim-$$exp-$$impl" ; \
./matrix_experiment_sim $$exp $$impl >out/t-sim-$$exp-$$impl ; \
done ; \
done
@for impl in smart naive ; do \
echo "t-real-$$impl" ; \
./matrix_experiment_real $$impl >out/t-real-$$impl ; \
done
CXXFLAGS=-std=c++11 -O3 -Wall -Wextra -g -Wno-sign-compare
matrix_experiment_sim: matrix_transpose.h matrix_tests.h matrix_experiment_sim.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_sim.cpp -o $@
matrix_experiment_real: matrix_transpose.h matrix_tests.h matrix_experiment_real.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_real.cpp -o $@
.PHONY: clean
clean::
rm -f matrix_experiment_sim matrix_experiment_real
rm -rf out
#include <functional>
#include <string>
#include <vector>
#include <cstdio>
#include <cmath>
#include <iostream>
#include <time.h>
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
class Matrix {
vector<unsigned> items;
unsigned &item(unsigned i, unsigned j) { return items[i*N + j]; }
public:
unsigned N;
Matrix(unsigned N) { this->N = N; items.resize(N*N, 0); }
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
// EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
std::swap(item(i1, j1), item(i2, j2));
}
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
#include "matrix_transpose.h"
};
void real_test(bool naive)
{
for (int e=40; e<=112; e++) {
unsigned N = (unsigned) pow(2, e/8.);
Matrix m(N);
clock_t start_time, stop_time;
unsigned tries = 1;
do {
start_time = clock();
for (unsigned t=0; t < tries; t++) {
if (naive)
m.naive_transpose();
else
m.transpose();
}
stop_time = clock();
tries *= 2;
} while (stop_time - start_time < CLOCKS_PER_SEC/10);
// It is guaranteed that the total number of tries is odd :)
double ns_per_item = (double)(stop_time - start_time) / CLOCKS_PER_SEC / (N*(N-1)) / tries * 1e9;
printf("%d\t%.6f\n", N, ns_per_item);
}
}
int main(int argc, char **argv)
{
if (argc != 2) {
fprintf(stderr, "Usage: %s (smart|naive)\n", argv[0]);
return 1;
}
std::string mode = argv[1];
bool naive;
if (mode == "smart")
naive = false;
else if (mode == "naive")
naive = true;
else {
fprintf(stderr, "The argument must be either 'smart' or 'naive'\n");
return 1;
}
real_test(naive);
return 0;
}
#include <functional>
#include <string>
#include <vector>
#include <cstdio>
#include <cmath>
#include <string>
#include <iostream>
#include <time.h>
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
#include "matrix_tests.h"
void simulated_test(unsigned M, unsigned B, bool naive)
{
for (int e=20; e<=52; e++) {
unsigned N = (unsigned) pow(2, e/4.);
TestMatrix m(N, M, B, 0);
m.fill_matrix();
m.reset_stats();
if (naive)
m.naive_transpose();
else
m.transpose();
double misses_per_item = (double) m.stat_cache_misses / (N*(N-1));
printf("%d\t%.6f\n", N, misses_per_item);
m.check_result();
}
}
vector<pair<string, function<void(bool n)>>> tests = {
// M B
{ "m1024-b16", [](bool n) { simulated_test( 1024, 16, n); } },
{ "m8192-b64", [](bool n) { simulated_test( 8192, 64, n); } },
{ "m65536-b256", [](bool n) { simulated_test(65536, 256, n); } },
{ "m65536-b4096", [](bool n) { simulated_test(65536, 4096, n); } },
};
int main(int argc, char **argv)
{
if (argc != 3) {
fprintf(stderr, "Usage: %s <test> (smart|naive)\n", argv[0]);
return 1;
}
std::string which_test = argv[1];
std::string mode = argv[2];
bool naive;
if (mode == "smart")
naive = false;
else if (mode == "naive")
naive = true;
else
{
fprintf(stderr, "Last argument must be either 'smart' or 'naive'\n");
return 1;
}
for (const auto& test : tests) {
if (test.first == which_test) {
test.second(naive);
return 0;
}
}
fprintf(stderr, "Unknown test %s\n", which_test.c_str());
return 1;
}
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
/* A matrix stored in a simulated cache */
class CachedMatrix {
unsigned B; // Block size
unsigned mem_blocks; // Memory size in blocks
unsigned cache_blocks; // Cache size in blocks
unsigned cache_used; // How many blocks of cache we already used
// We store the matrix as a one-dimensional array
vector<unsigned> items;
unsigned pos(unsigned i, unsigned j) { return i*N + j; }
/*
* For each memory block, we keep the following structure.
* If the block is currently cached, we set cached == true
* and lru_prev/next point to neighboring blocks in the cyclic LRU list.
* Otherwise, cached == false and the block is not in the LRU.
*/
class MemoryBlock {
public:
unsigned lru_prev, lru_next;
bool cached;
MemoryBlock()
{
lru_prev = lru_next = 0;
cached = false;
}
};
vector<MemoryBlock> blocks;
// One block at the end of "blocks" serves as a head of the LRU list.
unsigned lru_head;
public:
// Number of rows and columns of the matrix
unsigned N;
int debug_level; // Verbosity
CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
{
EXPECT(N > 0, "CachedMatrix must be non-empty.");
EXPECT(B > 0, "Blocks must be non-empty.");
EXPECT(!(M % B), "Cache size must be divisible by block size.");
EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
unsigned NN = N*N;
items.resize(NN, 0);
this->N = N;
this->B = B;
this->debug_level = debug_level;
mem_blocks = (NN+B-1) / B;
cache_blocks = M / B;
cache_used = 0;
// Initialize the LRU list
blocks.resize(mem_blocks + 1);
lru_head = mem_blocks;
blocks[lru_head].lru_prev = lru_head;
blocks[lru_head].lru_next = lru_head;
if (debug_level > 0)
cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
}
// Read value at position (i,j), used only in testing code
unsigned read(unsigned i, unsigned j)
{
EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
return items[addr];
}
// Write value at position (i,j), used only in testing code
void write(unsigned i, unsigned j, unsigned data)
{
EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
items[addr] = data;
}
// Swap items (i1,j1) and (i2,j2)
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
if (debug_level > 1)
cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
access(addr1);
access(addr2);
std::swap(items[addr1], items[addr2]);
}
unsigned stat_cache_misses;
unsigned stat_accesses;
// Reset statistic counters.
void reset_stats()
{
stat_cache_misses = 0;
stat_accesses = 0;
}
static string coord_string(unsigned i, unsigned j)
{
return "(" + to_string(i) + "," + to_string(j) + ")";
}
#include "matrix_transpose.h"
private:
// Bring the given address to the cache.
void access(unsigned addr)
{
int i = addr / B; // Which block to bring
if (blocks[i].cached) {
lru_remove(i);
} else {
if (cache_used < cache_blocks) {
// We still have room in the cache.
cache_used++;
if (debug_level > 1)
cout << "\t\tLoading block " << i << endl;
} else {
// We need to evict the least-recently used block to make space.
unsigned replace = blocks[lru_head].lru_prev;
lru_remove(replace);
EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
blocks[replace].cached = false;
if (debug_level > 1)
cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
}
blocks[i].cached = true;
stat_cache_misses++;
}
lru_add_after(i, lru_head);
stat_accesses++;
}
// Remove block from the LRU list.
void lru_remove(unsigned i)
{
unsigned prev = blocks[i].lru_prev;
unsigned next = blocks[i].lru_next;
blocks[prev].lru_next = next;
blocks[next].lru_prev = prev;
}
// Add block at the given position in the LRU list.
void lru_add_after(unsigned i, unsigned after)
{
unsigned next = blocks[after].lru_next;
blocks[next].lru_prev = i;
blocks[after].lru_next = i;
blocks[i].lru_next = next;
blocks[i].lru_prev = after;
}
};
/* A cached matrix extended by methods for testing */
class TestMatrix : public CachedMatrix {
public:
TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
// Fill matrix with a testing pattern.
void fill_matrix()
{
if (debug_level > 1)
cout << "\tInitializing\n";
for (unsigned i = 0; i < N; i++)
for (unsigned j = 0; j < N; j++)
write(i, j, i*N + j);
}
// Check that the pattern corresponds to the properly transposed matrix.
void check_result()
{
if (debug_level > 1)
cout << "\tChecking\n";
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
unsigned want = j*N + i;
unsigned found = read(i, j);
unsigned found_i = found / N;
unsigned found_j = found % N;
EXPECT(found == want,
"Mismatch at position " + coord_string(i, j) +
": expected element from " + coord_string(j, i) +
", found element from " + coord_string(found_i, found_j) +
".");
}
}
}
// Transpose the matrix naively.
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
};
TESTS=m1024-b16 m8192-b64 m65536-b256 m65536-b4096
TESTFILES=$(addprefix out/t-sim-,$(TESTS))
.PHONY: test
test: $(addsuffix -smart,$(TESTFILES)) $(addsuffix -naive,$(TESTFILES))
out/t-sim-%-naive:
@mkdir -p out
./matrix_experiment_sim.py $* naive >$@
out/t-sim-%-smart:
@mkdir -p out
./matrix_experiment_sim.py $* smart >$@
.PHONY: clean
clean::
rm -rf out __pycache__
#!/usr/bin/env python3
import sys
from matrix_tests import TestMatrix
def simulated_test(M, B, naive):
for e in range(10, 25):
N = int(2 ** (e/2))
print(" ", N, M, B, file=sys.stderr)
m = TestMatrix(N, M, B, 0)
m.fill_matrix()
m.reset_stats()
if naive:
m.naive_transpose()
else:
m.transpose()
misses_per_item = m.stat_cache_misses / (N*(N-1))
print(N, misses_per_item, flush=True)
m.check_result()
tests = {
# M B
"m1024-b16": lambda n: simulated_test( 1024, 16, n),
"m8192-b64": lambda n: simulated_test( 8192, 64, n),
"m65536-b256": lambda n: simulated_test(65536, 256, n),
"m65536-b4096": lambda n: simulated_test(65536, 4096, n),
}
if len(sys.argv) == 3:
test = sys.argv[1]
if sys.argv[2] == "smart":
naive = False
elif sys.argv[2] == "naive":
naive = True
else:
raise ValueError("Last argument must be either 'smart' or 'naive'")
if test in tests:
tests[test](naive)
else:
raise ValueError("Unknown test {}".format(test))
else:
raise ValueError("Usage: {} <test> (smart|naive)".format(sys.argv[0]))
from matrix_transpose import Matrix
import numpy
# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
# is not in the LRU.
B_CACHED = 0
B_LRU_NEXT = 1
B_LRU_PREV = 2
class CachedMatrix(Matrix):
"""A matrix stored in simulated cache"""
def __init__(self, N, M, B, debug_level=0):
assert N>0, "CachedMatrix must be non-empty."
assert B>0, "Blocks must be non-empty."
assert M%B == 0, "Cache size must be divisible by block size."
assert M >= 2*B, "Cache must have at least 2 blocks."
Matrix.__init__(self, N)
NN = N*N
self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
self.B = B
self.M = M
self.debug_level = debug_level
self.mem_blocks = (NN+B-1) // B
self.cache_blocks = M//B
self.cache_used = 0
# Initialize the LRU list. There is a virtual block right after the last real block,
# which serves as a head of the cyclic LRU list.
self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
self.lru_head = self.mem_blocks
self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
self.reset_stats()
if debug_level > 0:
print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
def _pos(self, i, j):
"""Convert position in matrix to linear address in simulated memory."""
return i*self.N + j
def read(self, i, j):
"""Read value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
return self.items[addr]
def write(self, i, j, value):
"""Write value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
self.items[addr] = value
def swap(self, i1, j1, i2, j2):
"""Swap items (i1,j1) and (i2,j2)."""
assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
"Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
if self.debug_level > 1:
print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
addr1 = self._pos(i1, j1)
addr2 = self._pos(i2, j2)
self._access(addr1)
self._access(addr2)
items = self.items
items[addr1], items[addr2] = items[addr2], items[addr1]
def reset_stats(self):
"""Reset statistic counters."""
self.stat_cache_misses = 0
self.stat_accesses = 0
def _access(self, addr):
"""Bring the given address to the cache."""
blocks = self.blocks
i = addr // self.B # Which block to bring
if blocks[i, B_CACHED] > 0:
self._lru_remove(i)
else:
if self.cache_used < self.cache_blocks:
# We still have room in the cache.
self.cache_used += 1
if self.debug_level > 1:
print("\t\tLoading block {}".format(i))
else:
# We need to evict the least-recently used block to make space.
replace = blocks[self.lru_head, B_LRU_PREV]
self._lru_remove(replace)
assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
blocks[replace, B_CACHED] = 0
if self.debug_level > 1:
print("\t\tLoading block {}, replacing {}".format(i, replace))
blocks[i, B_CACHED] = 1
self.stat_cache_misses += 1
self._lru_add_after(i, self.lru_head)
self.stat_accesses += 1
def _lru_remove(self, i):
"""Remove block from the LRU list."""
blocks = self.blocks
prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
blocks[prev, B_LRU_NEXT] = next
blocks[next, B_LRU_PREV] = prev
def _lru_add_after(self, i, after):
"""Add block at the given position in the LRU list."""
blocks = self.blocks
next = blocks[after, B_LRU_NEXT]
blocks[after, B_LRU_NEXT] = i
blocks[next, B_LRU_PREV] = i
blocks[i, B_LRU_NEXT] = next
blocks[i, B_LRU_PREV] = after
class TestMatrix(CachedMatrix):
"""A cached matrix extended by methods for testing."""
# Constructor is inherited
def fill_matrix(self):
"""Fill matrix with a testing pattern."""
if self.debug_level > 1:
print("\tInitializing")
N = self.N
for i in range(N):
for j in range(N):
self.write(i, j, i*N + j)
def check_result(self):
"""Check that the pattern corresponds to the properly transposed matrix."""
if self.debug_level > 1:
print("\tChecking")
N = self.N
for i in range(N):
for j in range(N):
want = j*N + i
have = self.read(i, j)
have_i = have // N
have_j = have % N
assert have == want, "Mismatch at position ({},{}): expected element from ({},{}), found element from ({},{})".format(i, j, j, i, have_i, have_j)
def naive_transpose(self):
"""Transpose the matrix naively."""
for i in range(self.N):
for j in range(i):
self.swap(i, j, j, i)
## Goal
The goal of this assignment is to evaluate your implementation of cache-oblivious
matrix transposition experimentally and to compare it with the trivial algorithm
which transposes by definition.
You are given a test program (`matrix_experiment_sim`) which evaluates your
implementation from the previous assignment on different simulated caches and
matrices of different sizes. For each experiment, the average number of cache
misses per item is reported (the diagonal items which do not move are not
counted). The program also evaluates performance of the trivial transposition algorithm.
The simulated cache is fully associative and uses LRU replacement strategy.
You should run these experiments and write a report, which contains one plot of
the measured data for each cache type, showing dependency of the average number of
misses per item on the matrix size. There should be two curves in each plot: one for your
algorithm, another for the trivial one.
The report should discuss the experimental results and try to explain the observed
behavior (including any strange anomalies) using theory from the lectures.
If you want, you can carry out further experiments to gain better understanding
of the algorithm and include these in the report.
You should submit a PDF file with the report (and no source code).
You will get 1 temporary point upon submission if the file is syntactically correct;
proper points will be assigned later.
## Test program
The test program is given two arguments:
- Cache type:
- `m1024-b16` – cache of 1024 items organized in 16-item blocks
- `m8192-b64` – cache of 8192 items organized in 64-item blocks
- `m65536-b256` – cache of 65536 items organized on 256-item blocks
- `m65536-b4096` – cache of 65536 items organized in 4096-item blocks
- The implementation to test (`smart` or `naive`).
The output of the program contains one line per experiment, which consists of
the matrix size and the average number of cache misses per item.
*Warning:* The Python tests are slow, even though they use only a subset of the
matrix sizes. They can take about one hour to complete.
If your machine has multiple processors or cores, you can try `make -j`
to run the tests in parallel.
## Optional: Tests on real hardware (for 5 extra points)
You can also test your transposition algorithm on real hardware
using the `matrix_experiment_real` program. The matrix is stored in row-major
order, each item takes 4 bytes.
The program takes one parameter, the implementation to test: `smart` or `naive`.
Its output contains one line per experiment, which consists of the matrix size
and the average time per item in nanoseconds.
However, the program is available only for C++, because general slowness of
Python completely hides all cache effects.
Again, your report should show a plot of the measured data and discuss the observed
effects. You should also include the configuration of caches in your machine.
(If you are using Linux, you can try the `machinfo` script from
[this repository](https://gitlab.kam.mff.cuni.cz/mj/aim.git).)
## Hints
The following tools can be useful for producing nice plots:
- [pandas](https://pandas.pydata.org/)
- [matplotlib](https://matplotlib.org/)
- [gnuplot](http://www.gnuplot.info/)
A quick checklist for plots:
- Is there a caption explaining what is plotted?
- Are the axes clearly labelled? Do they have value ranges and units?
- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs
are more fitting in some cases, but you should tell.)
- Is it clear which curve means what?
- Is it clear what are the measured points and what is an interpolated
curve between them?
- Are there any overlaps? (E.g., the most interesting part of the curve
hidden underneath a label?)
- **Is the graph distorted by compression artifacts?** (No, you shouldn't use JPEG for plots!)
In your discussion, please distinguish the following kinds of claims.
It should be always clear which is which:
- Experimental results (i.e., the raw data you obtained from the experiments)
- Theoretical facts (i.e., claims we have proved mathematically)
- Your hypotheses (e.g., when you claim that the graph looks like something is true,
but you are not able to prove rigorously that it always holds)
Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment