From 439146835a24d7725a908d84bced84666d70db84 Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Wed, 21 Apr 2021 18:14:49 +0200
Subject: [PATCH] Cuckoo hash and Hash experiment

---
 07-cuckoo_hash/cpp/Makefile                  |  13 +
 07-cuckoo_hash/cpp/cuckoo_hash.h             | 103 ++++++
 07-cuckoo_hash/cpp/cuckoo_hash_test.cpp      |  35 +++
 07-cuckoo_hash/cpp/random.h                  |  59 ++++
 07-cuckoo_hash/cpp/test_main.cpp             |  43 +++
 07-cuckoo_hash/python/cuckoo_hash.py         |  56 ++++
 07-cuckoo_hash/python/cuckoo_hash_test.py    |  41 +++
 07-cuckoo_hash/task.md                       |  12 +
 08-hash_experiment/cpp/Makefile              |  20 ++
 08-hash_experiment/cpp/hash_experiment.cpp   | 314 +++++++++++++++++++
 08-hash_experiment/cpp/random.h              |  61 ++++
 08-hash_experiment/python/Makefile           |  14 +
 08-hash_experiment/python/hash_experiment.py | 217 +++++++++++++
 08-hash_experiment/task.md                   |  74 +++++
 14 files changed, 1062 insertions(+)
 create mode 100644 07-cuckoo_hash/cpp/Makefile
 create mode 100644 07-cuckoo_hash/cpp/cuckoo_hash.h
 create mode 100644 07-cuckoo_hash/cpp/cuckoo_hash_test.cpp
 create mode 100644 07-cuckoo_hash/cpp/random.h
 create mode 100644 07-cuckoo_hash/cpp/test_main.cpp
 create mode 100644 07-cuckoo_hash/python/cuckoo_hash.py
 create mode 100755 07-cuckoo_hash/python/cuckoo_hash_test.py
 create mode 100644 07-cuckoo_hash/task.md
 create mode 100644 08-hash_experiment/cpp/Makefile
 create mode 100644 08-hash_experiment/cpp/hash_experiment.cpp
 create mode 100644 08-hash_experiment/cpp/random.h
 create mode 100644 08-hash_experiment/python/Makefile
 create mode 100644 08-hash_experiment/python/hash_experiment.py
 create mode 100644 08-hash_experiment/task.md

diff --git a/07-cuckoo_hash/cpp/Makefile b/07-cuckoo_hash/cpp/Makefile
new file mode 100644
index 0000000..f32e87a
--- /dev/null
+++ b/07-cuckoo_hash/cpp/Makefile
@@ -0,0 +1,13 @@
+test: cuckoo_hash_test
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+cuckoo_hash_test: cuckoo_hash_test.cpp cuckoo_hash.h test_main.cpp $(INCLUDE)/random.h
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+	rm -f cuckoo_hash_test
+
+.PHONY: clean test
diff --git a/07-cuckoo_hash/cpp/cuckoo_hash.h b/07-cuckoo_hash/cpp/cuckoo_hash.h
new file mode 100644
index 0000000..32a5660
--- /dev/null
+++ b/07-cuckoo_hash/cpp/cuckoo_hash.h
@@ -0,0 +1,103 @@
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <iostream>
+
+#include "random.h"
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+class TabulationHash {
+    /*
+     * Simple tabulation hashing of 32-bit keys.
+     *
+     * A key is cut into four bytes. Byte number i selects one of the 256
+     * random 32-bit words stored in tables[i]. The four selected words are
+     * XORed and the result is reduced modulo the number of buckets.
+     */
+
+    unsigned num_buckets;
+    uint32_t tables[4][256];
+
+public:
+    TabulationHash(unsigned num_buckets, RandomGen *random_gen)
+    {
+        this->num_buckets = num_buckets;
+        for (auto &row : tables)          // rows are filled in order 0..3,
+            for (uint32_t &cell : row)    // each of them from index 0 to 255
+                cell = random_gen->next_u32();
+    }
+
+    uint32_t hash(uint32_t key)
+    {
+        uint32_t mixed = 0;
+        for (int part = 0; part < 4; part++)
+            mixed ^= tables[part][(key >> (8 * part)) & 0xff];
+        // Reduce the XOR of the four table entries to a bucket index.
+        return mixed % num_buckets;
+    }
+};
+
+class CuckooTable {
+    /*
+     * Hash table with Cuckoo hashing.
+     *
+     * We have two hash functions, which map 32-bit keys to buckets of a common
+     * hash table. Unused buckets contain 0xffffffff.
+     */
+
+    const uint32_t UNUSED = 0xffffffff;
+
+    // The array of buckets
+    vector<uint32_t> table;
+    unsigned num_buckets;
+
+    // Hash functions and the random generator used to create them
+    TabulationHash *hashes[2];
+    RandomGen *random_gen;
+
+public:
+    CuckooTable(unsigned num_buckets)
+    {
+        // Initialize the table with the given number of buckets.
+        // The number of buckets is expected to stay constant.
+        this->num_buckets = num_buckets;
+        table.resize(num_buckets, UNUSED);
+        // Obtain two fresh hash functions.
+        random_gen = new RandomGen(42);
+        for (int i=0; i<2; i++)
+            hashes[i] = new TabulationHash(num_buckets, random_gen);
+    }
+
+    // Copying is disabled: a copy would delete the owned pointers a second time.
+    CuckooTable(const CuckooTable&) = delete;
+    CuckooTable& operator=(const CuckooTable&) = delete;
+
+    ~CuckooTable()
+    {
+        for (int i=0; i<2; i++)
+            delete hashes[i];
+        delete random_gen;
+    }
+
+    bool lookup(uint32_t key)
+    {
+        // Check if the table contains the given key. Returns true or false.
+        unsigned h0 = hashes[0]->hash(key);
+        unsigned h1 = hashes[1]->hash(key);
+        return (table[h0] == key || table[h1] == key);
+    }
+
+    void insert(uint32_t key)
+    {
+        // Insert a new key to the table. Assumes that the key is not present yet.
+        EXPECT(key != UNUSED, "Keys must differ from UNUSED.");
+
+        // TODO: Implement
+    }
+};
diff --git a/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp
new file mode 100644
index 0000000..84ececb
--- /dev/null
+++ b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp
@@ -0,0 +1,35 @@
+#include <functional>
+#include <cstdlib>
+#include <vector>
+
+#include "cuckoo_hash.h"
+
+void simple_test(unsigned n, unsigned table_size_percentage)
+{
+    CuckooTable table(n * table_size_percentage / 100);
+
+    for (unsigned i=0; i < n; i++)
+        table.insert(37*i);
+
+    for (unsigned i=0; i < n; i++) {
+        EXPECT(table.lookup(37*i), "Item not present in table, but it should be.");
+        EXPECT(!table.lookup(37*i+1), "Item present in table, even though it should not be.");
+    }
+}
+
+void multiple_test(unsigned min_n, unsigned max_n, unsigned step_n, unsigned table_size_percentage)
+{
+    for (unsigned n=min_n; n < max_n; n += step_n) {
+        printf("\tn=%u\n", n);
+        simple_test(n, table_size_percentage);
+    }
+}
+
+/*** A list of all tests ***/
+
+vector<pair<string, function<void()>>> tests = {
+    { "small",     [] { simple_test(100, 400); } },
+    { "middle",    [] { simple_test(31415, 300); } },
+    { "big",       [] { simple_test(1000000, 300); } },
+    { "tight",     [] { multiple_test(20000, 40000, 500, 205); } },
+};
diff --git a/07-cuckoo_hash/cpp/random.h b/07-cuckoo_hash/cpp/random.h
new file mode 100644
index 0000000..7d18ab6
--- /dev/null
+++ b/07-cuckoo_hash/cpp/random.h
@@ -0,0 +1,59 @@
+#ifndef DS1_RANDOM_H
+#define DS1_RANDOM_H
+#include <cstdint>
+
+/*
+ * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
+ * and Sebastiano Vigna, distributed under the CC-0 license. For more details,
+ * see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+
+class RandomGen {
+    uint64_t state[2];
+
+    uint64_t rotl(uint64_t x, int k)
+    {
+        return (x << k) | (x >> (64 - k));
+    }
+
+  public:
+    // Initialize the generator, set its seed and warm it up.
+    RandomGen(unsigned int seed)
+    {
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        for (int i=0; i<100; i++)
+            next_u64();
+    }
+
+    // Generate a random 64-bit number.
+    uint64_t next_u64(void)
+    {
+        uint64_t s0 = state[0], s1 = state[1];
+        uint64_t result = s0 + s1;
+        s1 ^= s0;
+        state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+        state[1] = rotl(s1, 36);
+        return result;
+    }
+
+    // Generate a random 32-bit number.
+    uint32_t next_u32(void)
+    {
+        return next_u64() >> 11;
+    }
+
+    // Generate a number between 0 and range-1.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * This is not perfectly uniform, unless the range is a power of two.
+         * However, for 64-bit random values and 32-bit ranges, the bias is
+         * insignificant.
+         */
+        return next_u64() % range;
+    }
+};
+#endif
diff --git a/07-cuckoo_hash/cpp/test_main.cpp b/07-cuckoo_hash/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/07-cuckoo_hash/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+int main(int argc, char* argv[]) {
+    vector<string> required_tests;
+
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/07-cuckoo_hash/python/cuckoo_hash.py b/07-cuckoo_hash/python/cuckoo_hash.py
new file mode 100644
index 0000000..72a415c
--- /dev/null
+++ b/07-cuckoo_hash/python/cuckoo_hash.py
@@ -0,0 +1,56 @@
+import random
+import math
+
+class TabulationHash:
+    """Hash function for hashing by tabulation.
+
+    The 32-bit key is split to four 8-bit parts. Each part indexes
+    a separate table of 256 randomly generated values. Obtained values
+    are XORed together.
+    """
+
+    def __init__(self, num_buckets):
+        self.tables = [None] * 4
+        for i in range(4):
+            self.tables[i] = [random.randint(0, 0xffffffff) for _ in range(256)]
+        self.num_buckets = num_buckets
+
+    def hash(self, key):
+        h0 = key & 0xff
+        h1 = (key >> 8) & 0xff
+        h2 = (key >> 16) & 0xff
+        h3 = (key >> 24) & 0xff
+        t = self.tables
+        return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets
+
+class CuckooTable:
+    """Hash table with Cuckoo hashing.
+
+    We have two hash functions, which map 32-bit keys to buckets of a common
+    hash table. Unused buckets contain None.
+    """
+
+    def __init__(self, num_buckets):
+        """Initialize the table with the given number of buckets.
+        The number of buckets is expected to stay constant."""
+
+        # The array of buckets
+        self.num_buckets = num_buckets
+        self.table = [None] * num_buckets
+
+        # Create two fresh hash functions
+        self.hashes = [TabulationHash(num_buckets), TabulationHash(num_buckets)]
+
+    def lookup(self, key):
+        """Check if the table contains the given key. Returns True or False."""
+
+        b0 = self.hashes[0].hash(key)
+        b1 = self.hashes[1].hash(key)
+        # print("## Lookup key={} b0={} b1={}".format(key, b0, b1))
+        return self.table[b0] == key or self.table[b1] == key
+
+    def insert(self, key):
+        """Insert a new key to the table. Assumes that the key is not present yet."""
+
+        # TODO: Implement
+        raise NotImplementedError
diff --git a/07-cuckoo_hash/python/cuckoo_hash_test.py b/07-cuckoo_hash/python/cuckoo_hash_test.py
new file mode 100755
index 0000000..f9137c4
--- /dev/null
+++ b/07-cuckoo_hash/python/cuckoo_hash_test.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+import sys
+import random
+
+from cuckoo_hash import CuckooTable
+
+def simple_test(n, table_size_percentage):
+    random.seed(42)
+    table = CuckooTable(n*table_size_percentage//100)
+
+    # Insert an arithmetic progression
+    for i in range(n):
+        table.insert(37*i)
+
+    # Verify contents of the table
+    for i in range(n):
+        assert table.lookup(37*i), "Item not present in table, but it should be."
+        assert not table.lookup(37*i+1), "Item present in table, even though it should not be."
+
+def multiple_test(min_n, max_n, step_n, table_size_percentage):
+    for n in range(min_n, max_n, step_n):
+        print("\tn={}".format(n))
+        simple_test(n, table_size_percentage)
+
+# A list of all tests
+tests = [
+    ("small",       lambda: simple_test(100, 400)),
+    ("middle",      lambda: simple_test(31415, 300)),
+    ("big",         lambda: simple_test(1000000, 300)),
+    ("tight",       lambda: multiple_test(20000, 40000, 500, 205)),
+]
+
+if __name__ == "__main__":
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:  # the inner loop ended without finding the requested name
+            raise ValueError("Unknown test {}".format(required_test))
diff --git a/07-cuckoo_hash/task.md b/07-cuckoo_hash/task.md
new file mode 100644
index 0000000..dc71991
--- /dev/null
+++ b/07-cuckoo_hash/task.md
@@ -0,0 +1,12 @@
+Implement a Cuckoo hash table with simple tabulation hashing.
+
+You are given a skeleton code which defines the table, implements
+`lookup()`, and provides hash functions. You have to add an `insert()`
+method.
+
+If too many elements are moved during a single insert, the table must
+be rehashed with new hash functions. See lecture notes for the particular
+bounds.
+
+The size of the table should stay constant
+throughout the existence of the data structure.
diff --git a/08-hash_experiment/cpp/Makefile b/08-hash_experiment/cpp/Makefile
new file mode 100644
index 0000000..8745327
--- /dev/null
+++ b/08-hash_experiment/cpp/Makefile
@@ -0,0 +1,20 @@
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+
+HASHFUNCS=ms-low ms-high poly-1 poly-2 tab
+
+.PHONY: test
+test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))
+
+out/t-%: hash_experiment
+	@mkdir -p out
+	./hash_experiment $* $(STUDENT_ID) >$@
+
+hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
+
+.PHONY: clean
+clean:
+	rm -f hash_experiment
+	rm -rf out
diff --git a/08-hash_experiment/cpp/hash_experiment.cpp b/08-hash_experiment/cpp/hash_experiment.cpp
new file mode 100644
index 0000000..7eac12f
--- /dev/null
+++ b/08-hash_experiment/cpp/hash_experiment.cpp
@@ -0,0 +1,314 @@
+#include <vector>
+#include <functional>
+#include <algorithm>
+#include <utility>
+#include <stdexcept>
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include "random.h"
+
+using namespace std;
+
+RandomGen rng(42);
+
+typedef uint32_t uint;
+
+typedef function<uint(uint)> HashFunction;
+typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
+
+/*
+ * Hash function for hashing by tabulation.
+ *
+ * The 32-bit key is split to four 8-bit parts. Each part indexes
+ * a separate table of 256 randomly generated values. Obtained values
+ * are XORed together.
+ */
+class TabulationHash {
+    unsigned num_buckets;
+    vector<uint> tables;
+
+    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
+        for (uint& x : tables) x = rng.next_u32();
+    }
+
+  public:
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(TabulationHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        return (
+            tables[key & 0xff] ^
+            tables[((key >> 8) & 0xff) | 0x100] ^
+            tables[((key >> 16) & 0xff) | 0x200] ^
+            tables[((key >> 24) & 0xff) | 0x300]
+        ) % num_buckets;
+    }
+};
+
+// Hash function using polynomial modulo a prime.
+template < int degree, uint prime = 2147483647 >
+class PolynomialHash {
+    unsigned num_buckets;
+    vector<uint> coefs;
+
+    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
+        for (uint& x : coefs) x = rng.next_u32();
+    }
+
+  public:
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(PolynomialHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        uint64_t acc = 0;
+        for (uint c : coefs) acc = (acc * key + c) % prime;
+        return (uint)(acc % num_buckets);
+    }
+};
+
+typedef PolynomialHash<1> LinearHash;
+typedef PolynomialHash<2> QuadraticHash;
+
+// Multiply-shift hash function taking top bits of 32-bit word
+class MultiplyShiftLowHash {
+    uint mult;
+    uint mask;
+    int shift = 0;
+
+    MultiplyShiftLowHash(unsigned num_buckets) {
+        mult = rng.next_u32() | 0x1;   // force the multiplier to be odd
+        mask = num_buckets - 1;
+
+        if (num_buckets == 0 || (mask & num_buckets))
+            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
+
+        // Count the leading zero bits of the mask. A zero mask (a single
+        // bucket) is skipped, as the loop could never end on it; every key
+        // then hashes to bucket 0 because of the final "& mask".
+        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
+            shift++;
+    }
+
+  public:
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftLowHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        return ((key * mult) >> shift) & mask;
+    }
+};
+
+// Multiply-shift hash function taking low bits of upper half of 64-bit word
+class MultiplyShiftHighHash {
+    uint mask;
+    uint64_t mult;
+
+    MultiplyShiftHighHash(unsigned num_buckets) {
+        mult = rng.next_u64() | 0x1;   // force the multiplier to be odd
+        mask = num_buckets - 1;
+
+        if (num_buckets == 0 || (mask & num_buckets))   // zero is not a power of 2
+            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
+    }
+
+  public:
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftHighHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        return ((key * mult) >> 32) & mask;
+    }
+};
+
+
+// Hash table with linear probing
+class HashTable {
+    HashFunction hash;
+    vector<uint> table;
+    unsigned size = 0;
+
+    unsigned ops;
+    unsigned max_;
+    uint64_t steps;
+
+  public:
+    // We reserve one integer to mark unused buckets. This integer
+    // cannot be stored in the table.
+    static constexpr uint UNUSED = ~((uint)0);
+
+    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
+        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
+        reset_counter();
+    }
+
+    // Check whether key is present in the table.
+    bool lookup(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");
+
+        bool ret = false;
+        unsigned steps = 1;
+
+        uint b = hash(key);
+        while (table[b] != UNUSED) {
+            if (table[b] == key) {
+                ret = true;
+                break;
+            }
+            steps++;
+            b = next_bucket(b);
+        }
+
+        update_counter(steps);
+        return ret;
+    }
+
+    // Add the key in the table.
+    void insert(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
+        if (size >= table.size()) throw runtime_error("Insert: Table is full");
+
+        unsigned steps = 1;
+        uint b = hash(key);
+
+        while (table[b] != UNUSED) {
+            if (table[b] == key) goto key_found;
+            steps++;
+            b = next_bucket(b);
+        }
+
+        table[b] = key;
+        size++;
+
+      key_found:
+        update_counter(steps);
+    }
+
+    void reset_counter() { ops = steps = max_ = 0; }
+    double report_avg() { return ((double)steps) / max(1U, ops); }
+    double report_max() { return max_; }
+
+  private:
+    void update_counter(unsigned steps) {
+        ops++;
+        this->steps += steps;
+        max_ = max(steps, max_);
+    }
+
+    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
+};
+
+void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+
+    vector<uint> elements(N);
+    for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+        for (unsigned i = 0; i < N-1; i++)
+            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert(elements[s*step_size + i]);
+
+            avg[s] += H.report_avg();
+            avg2[s] += H.report_avg() * H.report_avg();
+        }
+    }
+
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        double std_dev = sqrt(avg2[i] - avg[i]*avg[i]);
+
+        printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev);
+    }
+}
+
+
+void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
+               int begin = 7, int end = 22) {
+
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        unsigned N = 1 << n;
+
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+            for (unsigned i = 0; i < N-1; i++)
+                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+            for (unsigned i = 0; i < ((uint64_t)N) * usage / 100; i++)
+                H.insert(elements[i]);
+
+            for (unsigned i = 0; i < N; i++)
+                H.lookup(i);
+
+            avg += H.report_avg();
+            avg2 += H.report_avg() * H.report_avg();
+        }
+
+        avg /= retry;
+        avg2 /= retry;
+        double std_dev = sqrt(avg2 - avg*avg);
+
+        printf("%i %.03lf %.03lf\n", N, avg, std_dev);
+    }
+}
+
+int main(int argc, char** argv) {
+    vector<pair<string, HashFunctionFactory>> grow_tests = {
+        {"grow-ms-low", MultiplyShiftLowHash::factory},
+        {"grow-ms-high", MultiplyShiftHighHash::factory},
+        {"grow-poly-1", LinearHash::factory},
+        {"grow-poly-2", QuadraticHash::factory},
+        {"grow-tab", TabulationHash::factory}
+    };
+    vector<pair<string, HashFunctionFactory>> usage_tests = {
+        {"usage-ms-low", MultiplyShiftLowHash::factory},
+        {"usage-ms-high", MultiplyShiftHighHash::factory},
+        {"usage-poly-1", LinearHash::factory},
+        {"usage-poly-2", QuadraticHash::factory},
+        {"usage-tab", TabulationHash::factory}
+    };
+
+    if (argc != 3) goto fail;
+
+    rng = RandomGen(atoi(argv[2]));
+
+    for (auto t : grow_tests) {
+        if (t.first == argv[1]) {
+            grow_test(t.second);
+            return 0;
+        }
+    }
+
+    for (auto t : usage_tests) {
+        if (t.first == argv[1]) {
+            usage_test(t.second);
+            return 0;
+        }
+    }
+
+  fail:
+    printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]);
+    for (auto t : grow_tests) printf(" %s", t.first.c_str());
+    for (auto t : usage_tests) printf(" %s", t.first.c_str());
+    return 1;
+}
+
diff --git a/08-hash_experiment/cpp/random.h b/08-hash_experiment/cpp/random.h
new file mode 100644
index 0000000..5ef10ae
--- /dev/null
+++ b/08-hash_experiment/cpp/random.h
@@ -0,0 +1,61 @@
+#ifndef DS1_RANDOM_H
+#define DS1_RANDOM_H
+
+#include <cstdint>
+
+/*
+ * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
+ * and Sebastiano Vigna, distributed under the CC-0 license. For more details,
+ * see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+
+class RandomGen {
+    uint64_t state[2];
+
+    uint64_t rotl(uint64_t x, int k)
+    {
+        return (x << k) | (x >> (64 - k));
+    }
+
+  public:
+    // Initialize the generator, set its seed and warm it up.
+    RandomGen(unsigned int seed)
+    {
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        for (int i=0; i<100; i++)
+            next_u64();
+    }
+
+    // Generate a random 64-bit number.
+    uint64_t next_u64(void)
+    {
+        uint64_t s0 = state[0], s1 = state[1];
+        uint64_t result = s0 + s1;
+        s1 ^= s0;
+        state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+        state[1] = rotl(s1, 36);
+        return result;
+    }
+
+    // Generate a random 32-bit number.
+    uint32_t next_u32(void)
+    {
+      return next_u64() >> 11;
+    }
+
+    // Generate a number between 0 and range-1.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * This is not perfectly uniform, unless the range is a power of two.
+         * However, for 64-bit random values and 32-bit ranges, the bias is
+         * insignificant.
+         */
+        return next_u64() % range;
+    }
+};
+
+#endif
diff --git a/08-hash_experiment/python/Makefile b/08-hash_experiment/python/Makefile
new file mode 100644
index 0000000..e9373dc
--- /dev/null
+++ b/08-hash_experiment/python/Makefile
@@ -0,0 +1,14 @@
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+
+HASHFUNCS=ms-low ms-high poly-1 poly-2 tab
+
+.PHONY: test
+test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))
+
+out/t-%: hash_experiment.py
+	@mkdir -p out
+	python3 hash_experiment.py $* $(STUDENT_ID) >$@
+
+.PHONY: clean
+clean:
+	rm -rf out
diff --git a/08-hash_experiment/python/hash_experiment.py b/08-hash_experiment/python/hash_experiment.py
new file mode 100644
index 0000000..9de266e
--- /dev/null
+++ b/08-hash_experiment/python/hash_experiment.py
@@ -0,0 +1,217 @@
+#!/usr/bin/env python3
+
+import random, sys
+from math import sqrt
+
+# Our wrapper of random so we can substitute it with another random generator
+rng_init = lambda x: random.seed(x)
+rng_next_u32 = lambda: random.randint(0, 2**32 - 1)
+
+class TabulationHash:
+    """Hash function for hashing by tabulation.
+
+    The 32-bit key is split to four 8-bit parts. Each part indexes
+    a separate table of 256 randomly generated values. Obtained values
+    are XORed together.
+    """
+
+    def __init__(self, num_buckets):
+        self.num_buckets = num_buckets
+        self.tables = [None] * 4
+        for i in range(4):
+            self.tables[i] = [ rng_next_u32() for _ in range(256) ]
+
+    def __call__(self, key):
+        h0 = key & 0xff;
+        h1 = (key >> 8) & 0xff;
+        h2 = (key >> 16) & 0xff;
+        h3 = (key >> 24) & 0xff;
+        t = self.tables
+        return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets
+
+class PolynomialHash:
+    """Hash function using polynomial modulo a prime."""
+
+    def __init__(self, num_buckets, degree, prime = 2147483647):
+        self.num_buckets = num_buckets
+        self.prime = prime
+        self.coefs = [ rng_next_u32() for _ in range(degree + 1) ]
+
+    def __call__(self, key):
+        acc = 0
+        for c in self.coefs:
+            acc = (acc * key + c) % self.prime
+        return acc % self.num_buckets
+
+LinearHash = lambda num_buckets: PolynomialHash(num_buckets, 1)
+QuadraticHash = lambda num_buckets: PolynomialHash(num_buckets, 2)
+
+class MultiplyShiftLowHash:
+    """Multiply-shift hash function taking top bits of 32-bit word"""
+
+    def __init__(self, num_buckets):
+        self.mask = num_buckets - 1
+        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
+            "MultiplyShiftLowHash: num_buckets must be power of 2"
+
+        self.mult = rng_next_u32() | 0x1
+        self.shift = 0
+        tmp = self.mask
+        # A zero mask (single bucket) would loop forever, so it leaves shift at 0.
+        while tmp != 0 and (0x80000000 & tmp) == 0:
+            tmp <<= 1
+            self.shift += 1
+    def __call__(self, key):
+        return ((key * self.mult) >> self.shift) & self.mask
+
+class MultiplyShiftHighHash:
+    """Multiply-shift hash function taking low bits of upper half of 64-bit word"""
+
+    def __init__(self, num_buckets):
+        self.mask = num_buckets - 1
+        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
+            "MultiplyShiftHighHash: num_buckets must be power of 2"
+        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1
+
+    def __call__(self, key):
+        return ((key * self.mult) >> 32) & self.mask
+
+class HashTable:
+    """Hash table with linear probing"""
+
+    def __init__(self, hash_fun_factory, num_buckets):
+        self._hash = hash_fun_factory(num_buckets)
+        self._num_buckets = num_buckets
+        self._table = [None] * num_buckets
+        self._size = 0
+        self.reset_counter()
+
+    def _next_bucket(self, b):
+        return (b + 1) % self._num_buckets
+
+    def lookup(self, key):
+        """Check whether key is present in the table."""
+        ret = False
+        steps = 1
+
+        b = self._hash(key)
+        while self._table[b] is not None:
+            if self._table[b] == key:
+                ret = True
+                break
+            steps += 1
+            b = self._next_bucket(b)
+
+        self._update_counter(steps)
+        return ret
+
+    def insert(self, key):
+        """Add the key in the table."""
+        assert self._size < self._num_buckets, "Cannot insert into a full table."
+        steps = 1
+        b = self._hash(key)
+        while self._table[b] is not None:
+            if self._table[b] == key: break
+            steps += 1
+            b = self._next_bucket(b)
+        else:
+            self._table[b] = key
+            self._size += 1  # count stored keys so the assertion above can fire
+
+        self._update_counter(steps)
+
+    def _update_counter(self, steps):
+        self._ops += 1
+        self._steps += steps
+        self._max = max(self._max, steps)
+
+    def reset_counter(self):
+        self._steps = 0
+        self._ops = 0
+        self._max = 0
+
+    def report_avg(self): return self._steps / max(1, self._ops)
+    def report_max(self): return self._max
+
+def permute_list(l):
+    N = len(l)
+    for i in range(N - 1):
+        dst = i + (rng_next_u32() % (N-i))
+        l[i], l[dst] = l[dst], l[i]
+
+def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
+    """Print the average insert cost and its deviation for each 1% of usage."""
+    avg = [0.0] * max_usage
+    avg2 = [0.0] * max_usage
+    N = 2**19
+    step_size = N // 100
+    elements = list(range(N))
+
+    for _ in range(retry):
+        H = HashTable(hash_fun_factory, N)
+        permute_list(elements)
+        # Keys are taken from the shuffled list, i.e. inserted in random order.
+        for s in range(max_usage):
+            H.reset_counter()
+            for i in range(step_size):
+                H.insert(elements[s*step_size + i])
+            avg[s] += H.report_avg()
+            avg2[s] += H.report_avg() ** 2
+
+    for i in range(max_usage):
+        avg[i] /= retry
+        avg2[i] /= retry
+        std_dev = sqrt(max(0.0, avg2[i] - avg[i]**2))  # rounding may dip below 0
+
+        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
+
+def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 21):
+    for n in range(begin, end):
+        avg = 0.0
+        avg2 = 0.0
+        N = 2 ** n
+        elements = list(range(N))
+
+        for _ in range(retry):
+            H = HashTable(hash_fun_factory, N)
+            permute_list(elements)
+
+            for x in elements[:N * usage // 100]:
+                H.insert(x)
+
+            for i in range(N):
+                H.lookup(i)
+
+            avg += H.report_avg()
+            avg2 += H.report_avg() ** 2
+
+        avg /= retry
+        avg2 /= retry
+        std_dev = sqrt(avg2 - avg**2)
+
+        print("%i %.03f %.03f" % (N, avg, std_dev))
+
+tests = {
+    "usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
+    "usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
+    "usage-poly-1": lambda: usage_test(LinearHash),
+    "usage-poly-2": lambda: usage_test(QuadraticHash),
+    "usage-tab": lambda: usage_test(TabulationHash),
+
+    "grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
+    "grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
+    "grow-poly-1": lambda: grow_test(LinearHash),
+    "grow-poly-2": lambda: grow_test(QuadraticHash),
+    "grow-tab": lambda: grow_test(TabulationHash),
+}
+
+if len(sys.argv) == 3:
+    test, student_id = sys.argv[1], sys.argv[2]
+    rng_init(int(student_id))
+    if test in tests:
+        tests[test]()
+    else:
+        raise ValueError("Unknown test {}".format(test))
+else:
+    raise ValueError("Usage: {} <test> <student-id>".format(sys.argv[0]))
+
diff --git a/08-hash_experiment/task.md b/08-hash_experiment/task.md
new file mode 100644
index 0000000..8616811
--- /dev/null
+++ b/08-hash_experiment/task.md
@@ -0,0 +1,74 @@
+## Goal
+
+The goal of this assignment is to experimentally evaluate a linear probing
+hash table with different systems of hash functions.
+
+You are given a test program (`hash_experiment`) which implements everything
+needed to perform the following experiments:
+
+- _Grow experiment:_ This experiment tries different sizes $N$ of the hash table and for each size
+  it inserts small keys in random order until 60% of the table is used
+  and then it performs lookup operation for keys $0,\ldots,N-1$.
+- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions
+  to increase the usage of the table by 1%, reports the efficiency of the insert operation,
+  and repeats until the usage of the table reaches 90%.
+
+Both experiments measure the number of probed buckets per operation, are repeated 40 times,
+and report the average and the standard deviation. Note that even with 40 repetitions
+the reported numbers still depend quite a lot on the random seed used.
+
+You should perform these experiments for 5 different classes of hash functions –
+tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`),
+multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
+and polynomial hash function of degree 1 and 2 – and write a report, which contains two
+plots of the measured data for each experiment. The first plot should contain average
+complexity of operations and the second one the standard deviation.
+
+Each plot should show the dependence of the plotted quantity (the average or the standard
+deviation of the number of probed buckets) either on the size of the hash table
+(the grow experiment) or on the usage of the hash table (the usage experiment).
+
+The report should discuss the experimental results and try to explain the observed
+behavior using theory from the lectures. (If you want, you can carry out further
+experiments to gain better understanding of the data structure and include these
+in the report. This is strictly optional.)
+
+You should submit a PDF file with the report (and no source code).
+You will get 1 temporary point upon submission if the file is syntactically correct;
+proper points will be assigned later.
+
+## Test program
+
+The test program is given two arguments:
+- The name of the test (`{grow,usage}-{ms-low,ms-high,poly-1,poly-2,tab}`).
+- The random seed: you should use the last 2 digits of your student ID (you can find
+  it in the Study Information System – just click on the Personal data icon). Please
+  include the random seed in your report.
+
+The output of the program contains one line per table size (grow tests) or usage percentage (usage tests):
+that value, followed by the average and the standard deviation of the number of probed buckets per operation.
+
+## Hints
+
+The following tools can be useful for producing nice plots:
+- [pandas](https://pandas.pydata.org/)
+- [matplotlib](https://matplotlib.org/)
+- [gnuplot](http://www.gnuplot.info/)
+
+A quick checklist for plots:
+- Is there a caption explaining what is plotted?
+- Are the axes clearly labelled? Do they have value ranges and units?
+- If an axis has a logarithmic scale, have you mentioned it? (Logarithmic graphs
+  are more fitting in some cases, but you should say so.)
+- Is it clear which curve means what?
+- Is it clear what are the measured points and what is an interpolated
+  curve between them?
+- Are there any overlaps? (E.g., the most interesting part of the curve
+  hidden underneath a label?)
+
+In your discussion, please distinguish the following kinds of claims.
+It should be always clear which is which:
+- Experimental results (i.e., the raw data you obtained from the experiments)
+- Theoretical facts (i.e., claims we have proved mathematically)
+- Your hypotheses (e.g., when you claim that the graph looks like something is true,
+  but you are not able to prove rigorously that it always holds)
-- 
GitLab