Commit 50660fce authored by Martin Mareš's avatar Martin Mareš

Matrix experiment

parent 282392b2
.PHONY: test
test: matrix_experiment_sim matrix_experiment_real
@rm -rf out && mkdir out
@for exp in m1024-b16 m8192-b64 m65536-b256 m65536-b4096 ; do \
for impl in smart naive ; do \
echo "t-sim-$$exp-$$impl" ; \
./matrix_experiment_sim $$exp $$impl >out/t-sim-$$exp-$$impl ; \
done ; \
done
@for impl in smart naive ; do \
echo "t-real-$$impl" ; \
./matrix_experiment_real $$impl >out/t-real-$$impl ; \
done
CXXFLAGS=-std=c++11 -O3 -Wall -Wextra -g -Wno-sign-compare
matrix_experiment_sim: matrix_transpose.h matrix_tests.h matrix_experiment_sim.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_sim.cpp -o $@
matrix_experiment_real: matrix_transpose.h matrix_tests.h matrix_experiment_real.cpp
$(CXX) $(CPPFLAGS) $(CXXFLAGS) matrix_experiment_real.cpp -o $@
.PHONY: clean
clean:
rm -f matrix_experiment_sim matrix_experiment_real
rm -rf out
#include <functional>
#include <string>
#include <vector>
#include <cstdio>
#include <cmath>
#include <iostream>
#include <time.h> // FIXME
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
class Matrix {
vector<unsigned> items;
unsigned &item(unsigned i, unsigned j) { return items[i*N + j]; }
public:
unsigned N;
Matrix(unsigned N) { this->N = N; items.resize(N*N, 0); }
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
// EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
std::swap(item(i1, j1), item(i2, j2));
}
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
#include "matrix_transpose.h"
};
void real_test(bool naive)
{
for (int e=40; e<=112; e++) {
unsigned N = (unsigned) pow(2, e/8.);
Matrix m(N);
clock_t start_time, stop_time;
unsigned tries = 1;
do {
start_time = clock();
for (unsigned t=0; t < tries; t++) {
if (naive)
m.naive_transpose();
else
m.transpose();
}
stop_time = clock();
tries *= 2;
} while (stop_time - start_time < CLOCKS_PER_SEC/10);
// It is guaranteed that the total number of tries is odd :)
double ns_per_item = (double)(stop_time - start_time) / CLOCKS_PER_SEC / (N*(N-1)) / tries * 1e9;
printf("%d\t%.6f\n", N, ns_per_item);
}
}
int main(int argc, char **argv)
{
if (argc != 2) {
fprintf(stderr, "Usage: %s (smart|naive)\n", argv[0]);
return 1;
}
std::string mode = argv[1];
bool naive;
if (mode == "smart")
naive = false;
else if (mode == "naive")
naive = true;
else {
fprintf(stderr, "The argument must be either 'smart' or 'naive'\n");
return 1;
}
real_test(naive);
return 0;
}
#include <functional>
#include <string>
#include <vector>
#include <cstdio>
#include <cmath>
#include <string>
#include <iostream>
#include <time.h> // FIXME
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
#include "matrix_tests.h"
void simulated_test(unsigned M, unsigned B, bool naive)
{
for (int e=20; e<=52; e++) {
unsigned N = (unsigned) pow(2, e/4.);
TestMatrix m(N, M, B, 0);
m.fill_matrix();
m.reset_stats();
if (naive)
m.naive_transpose();
else
m.transpose();
double misses_per_item = (double) m.stat_cache_misses / (N*(N-1));
printf("%d\t%.6f\n", N, misses_per_item);
m.check_result();
}
}
vector<pair<string, function<void(bool n)>>> tests = {
// M B
{ "m1024-b16", [](bool n) { simulated_test( 1024, 16, n); } },
{ "m8192-b64", [](bool n) { simulated_test( 8192, 64, n); } },
{ "m65536-b256", [](bool n) { simulated_test(65536, 256, n); } },
{ "m65536-b4096", [](bool n) { simulated_test(65536, 4096, n); } },
};
int main(int argc, char **argv)
{
if (argc != 3) {
fprintf(stderr, "Usage: %s <test> (smart|naive)\n", argv[0]);
return 1;
}
std::string which_test = argv[1];
std::string mode = argv[2];
bool naive;
if (mode == "smart")
naive = false;
else if (mode == "naive")
naive = true;
else
{
fprintf(stderr, "Last argument must be either 'smart' or 'naive'\n");
return 1;
}
for (const auto& test : tests) {
if (test.first == which_test) {
test.second(naive);
return 0;
}
}
fprintf(stderr, "Unknown test %s\n", which_test.c_str());
return 1;
}
#include <string>
#include <vector>
#include <iostream>
#include <algorithm>
/* A matrix stored in a simulated cache */
class CachedMatrix {
unsigned B; // Block size
unsigned mem_blocks; // Memory size in blocks
unsigned cache_blocks; // Cache size in blocks
unsigned cache_used; // How many blocks of cache we already used
// We store the matrix as a one-dimensional array
vector<unsigned> items;
unsigned pos(unsigned i, unsigned j) { return i*N + j; }
/*
* For each memory block, we keep the following structure.
* If the block is currently cached, we set cached == true
* and lru_prev/next point to neighboring blocks in the cyclic LRU list.
* Otherwise, cached == false and the block is not in the LRU.
*/
class MemoryBlock {
public:
unsigned lru_prev, lru_next;
bool cached;
MemoryBlock()
{
lru_prev = lru_next = 0;
cached = false;
}
};
vector<MemoryBlock> blocks;
// One block at the end of "blocks" serves as a head of the LRU list.
unsigned lru_head;
public:
// Number of rows and columns of the matrix
unsigned N;
int debug_level; // Verbosity
CachedMatrix(unsigned N, unsigned M, unsigned B, int debug_level=0)
{
EXPECT(N > 0, "CachedMatrix must be non-empty.");
EXPECT(B > 0, "Blocks must be non-empty.");
EXPECT(!(M % B), "Cache size must be divisible by block size.");
EXPECT(M >= 2*B, "Cache must have at least 2 blocks.");
unsigned NN = N*N;
items.resize(NN, 0);
this->N = N;
this->B = B;
this->debug_level = debug_level;
mem_blocks = (NN+B-1) / B;
cache_blocks = M / B;
cache_used = 0;
// Initialize the LRU list
blocks.resize(mem_blocks + 1);
lru_head = mem_blocks;
blocks[lru_head].lru_prev = lru_head;
blocks[lru_head].lru_next = lru_head;
if (debug_level > 0)
cout << "\tMemory: " << mem_blocks << " blocks of " << B << " items, " << cache_blocks << " cached\n";
}
// Read value at position (i,j), used only in testing code
unsigned read(unsigned i, unsigned j)
{
EXPECT(i < N && j < N, "Read out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
return items[addr];
}
// Write value at position (i,j), used only in testing code
void write(unsigned i, unsigned j, unsigned data)
{
EXPECT(i < N && j < N, "Write out of range: " + coord_string(i, j) + ".");
unsigned addr = pos(i, j);
access(addr);
items[addr] = data;
}
// Swap items (i1,j1) and (i2,j2)
void swap(unsigned i1, unsigned j1, unsigned i2, unsigned j2)
{
EXPECT(i1 < N && j1 < N && i2 < N && j2 < N, "Swap out of range: " + coord_string(i1, j1) + " with " + coord_string(i2, j2) + ".");
if (debug_level > 1)
cout << "\tSwap " << coord_string(i1, j1) << " " << coord_string(i2, j2) << endl;
unsigned addr1 = pos(i1, j1), addr2 = pos(i2, j2);
access(addr1);
access(addr2);
std::swap(items[addr1], items[addr2]);
}
unsigned stat_cache_misses;
unsigned stat_accesses;
// Reset statistic counters.
void reset_stats()
{
stat_cache_misses = 0;
stat_accesses = 0;
}
static string coord_string(unsigned i, unsigned j)
{
return "(" + to_string(i) + "," + to_string(j) + ")";
}
#include "matrix_transpose.h"
private:
// Bring the given address to the cache.
void access(unsigned addr)
{
int i = addr / B; // Which block to bring
if (blocks[i].cached) {
lru_remove(i);
} else {
if (cache_used < cache_blocks) {
// We still have room in the cache.
cache_used++;
if (debug_level > 1)
cout << "\t\tLoading block " << i << endl;
} else {
// We need to evict the least-recently used block to make space.
unsigned replace = blocks[lru_head].lru_prev;
lru_remove(replace);
EXPECT(blocks[replace].cached, "Internal error: Buggy LRU list.");
blocks[replace].cached = false;
if (debug_level > 1)
cout << "\t\tLoading block " << i << ", replacing " << replace << endl;
}
blocks[i].cached = true;
stat_cache_misses++;
}
lru_add_after(i, lru_head);
stat_accesses++;
}
// Remove block from the LRU list.
void lru_remove(unsigned i)
{
unsigned prev = blocks[i].lru_prev;
unsigned next = blocks[i].lru_next;
blocks[prev].lru_next = next;
blocks[next].lru_prev = prev;
}
// Add block at the given position in the LRU list.
void lru_add_after(unsigned i, unsigned after)
{
unsigned next = blocks[after].lru_next;
blocks[next].lru_prev = i;
blocks[after].lru_next = i;
blocks[i].lru_next = next;
blocks[i].lru_prev = after;
}
};
/* A cached matrix extended by methods for testing */
class TestMatrix : public CachedMatrix {
public:
TestMatrix(unsigned N, unsigned M, unsigned B, int debug_level = 0) : CachedMatrix(N, M, B, debug_level) { }
// Fill matrix with a testing pattern.
void fill_matrix()
{
if (debug_level > 1)
cout << "\tInitializing\n";
for (unsigned i = 0; i < N; i++)
for (unsigned j = 0; j < N; j++)
write(i, j, i*N + j);
}
// Check that the pattern corresponds to the properly transposed matrix.
void check_result()
{
if (debug_level > 1)
cout << "\tChecking\n";
for (unsigned i = 0; i < N; i++) {
for (unsigned j = 0; j < N; j++) {
unsigned want = j*N + i;
unsigned found = read(i, j);
unsigned found_i = found / N;
unsigned found_j = found % N;
EXPECT(found == want,
"Mismatch at position " + coord_string(i, j) +
": expected element from " + coord_string(j, i) +
", found element from " + coord_string(found_i, found_j) +
".");
}
}
}
// Transpose the matrix naively.
void naive_transpose()
{
for (unsigned i=0; i<N; i++)
for (unsigned j=0; j<i; j++)
swap(i, j, j, i);
}
};
TESTS=m1024-b16 m8192-b64 m65536-b256 m65536-b4096
TESTFILES=$(addprefix out/t-sim-,$(TESTS))
.PHONY: test
test: $(addsuffix -smart,$(TESTFILES)) $(addsuffix -naive,$(TESTFILES))
out/t-sim-%-naive:
@mkdir -p out
./matrix_experiment_sim.py $* naive >$@
out/t-sim-%-smart:
@mkdir -p out
./matrix_experiment_sim.py $* smart >$@
.PHONY: clean
clean:
rm -rf out
#!/usr/bin/env python3
import sys
from matrix_tests import TestMatrix
def simulated_test(M, B, naive):
for e in range(10, 25):
N = int(2 ** (e/2))
print(" ", N, M, B, file=sys.stderr)
m = TestMatrix(N, M, B, 0)
m.fill_matrix()
m.reset_stats()
if naive:
m.naive_transpose()
else:
m.transpose()
misses_per_item = m.stat_cache_misses / (N*(N-1))
print(N, misses_per_item, flush=True)
m.check_result()
tests = {
# M B
"m1024-b16": lambda n: simulated_test( 1024, 16, n),
"m8192-b64": lambda n: simulated_test( 8192, 64, n),
"m65536-b256": lambda n: simulated_test(65536, 256, n),
"m65536-b4096": lambda n: simulated_test(65536, 4096, n),
}
if len(sys.argv) == 3:
test = sys.argv[1]
if sys.argv[2] == "smart":
naive = False
elif sys.argv[2] == "naive":
naive = True
else:
raise ValueError("Last argument must be either 'smart' or 'naive'")
if test in tests:
tests[test](naive)
else:
raise ValueError("Unknown test {}".format(test))
else:
raise ValueError("Usage: {} <test> (smart|naive)".format(sys.argv[0]))
from matrix_transpose import Matrix
import numpy
# Description of memory blocks is stored in an array blocks[block_index][B_xxx].
# If the block is cached, B_CACHED is 1 and B_LRU_NEXT and B_LRU_PREV point to
# neighboring blocks in the cyclic LRU list. Otherwise, B_CACHED is 0 and the block
# is not in the LRU.
B_CACHED = 0
B_LRU_NEXT = 1
B_LRU_PREV = 2
class CachedMatrix(Matrix):
"""A matrix stored in simulated cache"""
def __init__(self, N, M, B, debug_level=0):
assert N>0, "CachedMatrix must be non-empty."
assert B>0, "Blocks must be non-empty."
assert M%B == 0, "Cache size must be divisible by block size."
assert M >= 2*B, "Cache must have at least 2 blocks."
Matrix.__init__(self, N)
NN = N*N
self.items = numpy.zeros(shape=NN, dtype=numpy.int32, order="C")
self.B = B
self.M = M
self.debug_level = debug_level
self.mem_blocks = (NN+B-1) // B
self.cache_blocks = M//B
self.cache_used = 0
# Initialize the LRU list. There is a virtual block right after the last real block,
# which serves as a head of the cyclic LRU list.
self.blocks = numpy.zeros(shape=(self.mem_blocks+1, 3), dtype=numpy.int32, order="C")
self.lru_head = self.mem_blocks
self.blocks[self.lru_head, B_LRU_NEXT] = self.lru_head
self.blocks[self.lru_head, B_LRU_PREV] = self.lru_head
self.reset_stats()
if debug_level > 0:
print("\tMemory: {} blocks of {} items, {} cached".format(self.mem_blocks, self.B, self.cache_blocks))
def _pos(self, i, j):
"""Convert position in matrix to linear address in simulated memory."""
return i*self.N + j
def read(self, i, j):
"""Read value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Read out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
return self.items[addr]
def write(self, i, j, value):
"""Write value at position (i,j), used only in testing code."""
assert i >= 0 and i < self.N and j >= 0 and j < self.N, "Write out of range: ({},{})".format(i, j)
addr = self._pos(i, j)
self._access(addr)
self.items[addr] = value
def swap(self, i1, j1, i2, j2):
"""Swap items (i1,j1) and (i2,j2)."""
assert i1 >= 0 and i1 < self.N and j1 >= 0 and j1 < self.N and \
i2 >= 0 and i2 < self.N and j2 >= 0 and j2 < self.N, \
"Swap out of range: ({},{}) with ({},{})".format(i1, j1, i2, j2)
if self.debug_level > 1:
print("\tSwap ({},{}) ({},{})".format(i1, j1, i2, j2))
addr1 = self._pos(i1, j1)
addr2 = self._pos(i2, j2)
self._access(addr1)
self._access(addr2)
items = self.items
items[addr1], items[addr2] = items[addr2], items[addr1]
def reset_stats(self):
"""Reset statistic counters."""
self.stat_cache_misses = 0
self.stat_accesses = 0
def _access(self, addr):
"""Bring the given address to the cache."""
blocks = self.blocks
i = addr // self.B # Which block to bring
if blocks[i, B_CACHED] > 0:
self._lru_remove(i)
else:
if self.cache_used < self.cache_blocks:
# We still have room in the cache.
self.cache_used += 1
if self.debug_level > 1:
print("\t\tLoading block {}".format(i))
else:
# We need to evict the least-recently used block to make space.
replace = blocks[self.lru_head, B_LRU_PREV]
self._lru_remove(replace)
assert blocks[replace, B_CACHED] > 0, "Internal error: Buggy LRU list"
blocks[replace, B_CACHED] = 0
if self.debug_level > 1:
print("\t\tLoading block {}, replacing {}".format(i, replace))
blocks[i, B_CACHED] = 1
self.stat_cache_misses += 1
self._lru_add_after(i, self.lru_head)
self.stat_accesses += 1
def _lru_remove(self, i):
"""Remove block from the LRU list."""
blocks = self.blocks
prev, next = blocks[i, B_LRU_PREV], blocks[i, B_LRU_NEXT]
blocks[prev, B_LRU_NEXT] = next
blocks[next, B_LRU_PREV] = prev
def _lru_add_after(self, i, after):
"""Add block at the given position in the LRU list."""
blocks = self.blocks
next = blocks[after, B_LRU_NEXT]
blocks[after, B_LRU_NEXT] = i
blocks[next, B_LRU_PREV] = i
blocks[i, B_LRU_NEXT] = next
blocks[i, B_LRU_PREV] = after
class TestMatrix(CachedMatrix):
"""A cached matrix extended by methods for testing."""
# Constructor is inherited
def fill_matrix(self):
"""Fill matrix with a testing pattern."""
if self.debug_level > 1:
print("\tInitializing")
N = self.N
for i in range(N):
for j in range(N):
self.write(i, j, i*N + j)