From dc065fd65e8b235c995ab573b55d0bff47af3205 Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Wed, 24 Apr 2019 18:01:29 +0200
Subject: [PATCH] Cuckoo hashing

---
 09-cuckoo_hash/cpp/Makefile               |  13 +++
 09-cuckoo_hash/cpp/cuckoo_hash.h          | 102 ++++++++++++++++++++++
 09-cuckoo_hash/cpp/cuckoo_hash_test.cpp   |  35 ++++++++
 09-cuckoo_hash/cpp/random.h               |  59 +++++++++++++
 09-cuckoo_hash/cpp/test_main.cpp          |  43 +++++++++
 09-cuckoo_hash/python/cuckoo_hash.py      |  56 ++++++++++++
 09-cuckoo_hash/python/cuckoo_hash_test.py |  41 +++++++++
 09-cuckoo_hash/task.md                    |   9 ++
 8 files changed, 358 insertions(+)
 create mode 100644 09-cuckoo_hash/cpp/Makefile
 create mode 100644 09-cuckoo_hash/cpp/cuckoo_hash.h
 create mode 100644 09-cuckoo_hash/cpp/cuckoo_hash_test.cpp
 create mode 100644 09-cuckoo_hash/cpp/random.h
 create mode 100644 09-cuckoo_hash/cpp/test_main.cpp
 create mode 100644 09-cuckoo_hash/python/cuckoo_hash.py
 create mode 100755 09-cuckoo_hash/python/cuckoo_hash_test.py
 create mode 100644 09-cuckoo_hash/task.md

diff --git a/09-cuckoo_hash/cpp/Makefile b/09-cuckoo_hash/cpp/Makefile
new file mode 100644
index 0000000..f32e87a
--- /dev/null
+++ b/09-cuckoo_hash/cpp/Makefile
@@ -0,0 +1,13 @@
+test: cuckoo_hash_test
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+cuckoo_hash_test: cuckoo_hash_test.cpp cuckoo_hash.h test_main.cpp $(INCLUDE)/random.h
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+	rm -f cuckoo_hash_test
+
+.PHONY: clean test
diff --git a/09-cuckoo_hash/cpp/cuckoo_hash.h b/09-cuckoo_hash/cpp/cuckoo_hash.h
new file mode 100644
index 0000000..6323dff
--- /dev/null
+++ b/09-cuckoo_hash/cpp/cuckoo_hash.h
@@ -0,0 +1,102 @@
+#include <string>
+#include <vector>
+#include <cstdint>
+#include <iostream>
+
+#include "random.h"
+
+using namespace std;
+
+// If the condition is false, report the message through expect_failed() and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);  // defined by the test driver, test_main.cpp
+
+class TabulationHash {
+    /*
+     * Simple tabulation hashing of 32-bit keys.
+     *
+     * Each of the four key bytes indexes its own table of 256 random 32-bit
+     * words; the hash is the XOR of those four words modulo num_buckets.
+     */
+
+    unsigned num_buckets;
+    uint32_t tables[4][256];
+
+public:
+    TabulationHash(unsigned num_buckets, RandomGen *random_gen)
+        : num_buckets(num_buckets)
+    {
+        // Fill the tables in row-major order, one random word per entry.
+        for (auto &row : tables)
+            for (uint32_t &entry : row)
+                entry = random_gen->next_u32();
+    }
+
+    uint32_t hash(uint32_t key)
+    {
+        // Fold in the table entry selected by each byte, lowest byte first.
+        uint32_t mixed = 0;
+        for (int part = 0; part < 4; part++)
+            mixed ^= tables[part][(key >> (8 * part)) & 0xff];
+        return mixed % num_buckets;
+    }
+};
+
+class CuckooTable {
+    /*
+     * Hash table with Cuckoo hashing.
+     *
+     * We have two hash functions, which map 32-bit keys to buckets of a common
+     * hash table. Unused buckets contain 0xffffffff, so that value is never a key.
+     */
+    const uint32_t UNUSED = 0xffffffff;
+
+    // The array of buckets
+    vector<uint32_t> table;
+    unsigned num_buckets;
+
+    // Hash functions and the random generator used to create them (owned by the table)
+    TabulationHash *hashes[2];
+    RandomGen *random_gen;
+
+public:
+    // Not copyable: a copy would delete the same hash functions and generator again.
+    CuckooTable(const CuckooTable&) = delete;
+    CuckooTable& operator=(const CuckooTable&) = delete;
+
+    // Initialize the table with the given number of buckets.
+    CuckooTable(unsigned num_buckets)
+    {
+        this->num_buckets = num_buckets;
+        table.resize(num_buckets, UNUSED);
+
+        // Obtain two fresh hash functions.
+        random_gen = new RandomGen(42);
+        for (int i=0; i<2; i++)
+            hashes[i] = new TabulationHash(num_buckets, random_gen);
+    }
+
+    ~CuckooTable()
+    {
+        for (int i=0; i<2; i++)
+            delete hashes[i];
+        delete random_gen;
+    }
+
+    bool lookup(uint32_t key)
+    {
+        // Check if the table contains the given key (UNUSED is never a key). Returns true or false.
+        unsigned h0 = hashes[0]->hash(key);
+        unsigned h1 = hashes[1]->hash(key);
+        return key != UNUSED && (table[h0] == key || table[h1] == key);
+    }
+
+    void insert(uint32_t key)
+    {
+        // Insert a new key to the table. Assumes that the key is not present yet.
+        EXPECT(key != UNUSED, "Keys must differ from UNUSED.");
+
+        // TODO: Implement
+    }
+};
diff --git a/09-cuckoo_hash/cpp/cuckoo_hash_test.cpp b/09-cuckoo_hash/cpp/cuckoo_hash_test.cpp
new file mode 100644
index 0000000..84ececb
--- /dev/null
+++ b/09-cuckoo_hash/cpp/cuckoo_hash_test.cpp
@@ -0,0 +1,35 @@
+#include <functional>
+#include <cstdlib>
+#include <vector>
+
+#include "cuckoo_hash.h"
+
+void simple_test(unsigned n, unsigned table_size_percentage)
+{
+    // The table gets table_size_percentage percent of n buckets.
+    CuckooTable table(n * table_size_percentage / 100);
+    for (unsigned k = 0; k != n; ++k)
+        table.insert(37 * k);
+    // Every inserted multiple of 37 must be found; its successor was never inserted.
+    for (unsigned k = 0; k != n; ++k) {
+        EXPECT(table.lookup(37 * k), "Item not present in table, but it should be.");
+        EXPECT(!table.lookup(37 * k + 1), "Item present in table, even though it should not be.");
+    }
+}
+
+void multiple_test(unsigned min_n, unsigned max_n, unsigned step_n, unsigned table_size_percentage)
+{
+    for (unsigned n=min_n; n < max_n; n += step_n) {
+        cout << "\tn=" << n << '\n';  // progress line; <iostream> arrives via cuckoo_hash.h
+        simple_test(n, table_size_percentage);
+    }
+}
+
+/*** A list of all tests: (name, function) pairs that test_main.cpp looks up by name ***/
+
+vector<pair<string, function<void()>>> tests = {
+    { "small",     [] { simple_test(100, 400); } },                   // 100 keys, 400 buckets
+    { "middle",    [] { simple_test(31415, 300); } },                 // 31415 keys, 94245 buckets
+    { "big",       [] { simple_test(1000000, 300); } },               // 1000000 keys, 3000000 buckets
+    { "tight",     [] { multiple_test(20000, 40000, 500, 205); } },   // n = 20000, 20500, ..., 39500 at 205 %
+};
diff --git a/09-cuckoo_hash/cpp/random.h b/09-cuckoo_hash/cpp/random.h
new file mode 100644
index 0000000..7d18ab6
--- /dev/null
+++ b/09-cuckoo_hash/cpp/random.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <cstdint>
+
+/*
+ * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
+ * and Sebastiano Vigna, distributed under the CC-0 license. For more details,
+ * see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+
+class RandomGen {
+    uint64_t state[2];  // the generator state: two 64-bit words
+
+    uint64_t rotl(uint64_t x, int k)
+    {
+        return (x << k) | (x >> (64 - k));  // rotate left; only valid for 0 < k < 64
+    }
+
+  public:
+    // Initialize the generator, set its seed and warm it up.
+    RandomGen(unsigned int seed)
+    {
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        for (int i=0; i<100; i++)  // warm-up: discard the first 100 outputs
+            next_u64();
+    }
+
+    // Generate a random 64-bit number.
+    uint64_t next_u64(void)
+    {
+        uint64_t s0 = state[0], s1 = state[1];
+        uint64_t result = s0 + s1;  // the output is computed before the state advances
+        s1 ^= s0;
+        state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+        state[1] = rotl(s1, 36);
+        return result;
+    }
+
+    // Generate a random 32-bit number (the 64-bit output shifted right by 11, truncated).
+    uint32_t next_u32(void)
+    {
+      return next_u64() >> 11;
+    }
+
+    // Generate a number between 0 and range-1. The range must be non-zero.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * This is not perfectly uniform, unless the range is a power of two.
+         * However, for 64-bit random values and 32-bit ranges, the bias is
+         * insignificant.
+         */
+        return next_u64() % range;
+    }
+};
+
diff --git a/09-cuckoo_hash/cpp/test_main.cpp b/09-cuckoo_hash/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/09-cuckoo_hash/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);  // a failed expectation ends the whole run with exit status 1
+}
+
+int main(int argc, char* argv[]) {
+    vector<string> required_tests;
+    // Run the tests named on the command line, or every registered test if none are given.
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+    // Run each requested test in order; an unknown name stops the run with exit status 1.
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/09-cuckoo_hash/python/cuckoo_hash.py b/09-cuckoo_hash/python/cuckoo_hash.py
new file mode 100644
index 0000000..73f223e
--- /dev/null
+++ b/09-cuckoo_hash/python/cuckoo_hash.py
@@ -0,0 +1,56 @@
+import random
+import math
+
+class TabulationHash:
+    """Simple tabulation hashing of 32-bit keys.
+
+    Each of the four bytes of the key indexes its own table of 256 random
+    32-bit values; the hash is the XOR of the four values modulo the
+    number of buckets.
+    """
+
+    def __init__(self, num_buckets):
+        """Draw four random tables for hashing into num_buckets buckets."""
+        self.num_buckets = num_buckets
+        self.tables = [[random.randint(0, 0xffffffff) for _ in range(256)]
+                       for _ in range(4)]
+
+    def hash(self, key):
+        """Return the bucket index of the given key."""
+        # Byte i of the key (lowest byte first) selects an entry of table i.
+        mixed = 0
+        for shift, table in zip((0, 8, 16, 24), self.tables):
+            mixed ^= table[(key >> shift) & 0xff]
+        return mixed % self.num_buckets
+
+class CuckooTable:
+    """Cuckoo hash table for 32-bit keys.
+
+    Two hash functions map every key to two candidate buckets of one shared
+    array. A bucket holds either a key or None when it is unused.
+    """
+
+    def __init__(self, num_buckets):
+        """Create an empty table with num_buckets buckets."""
+
+        # Bucket array, initially all unused
+        self.num_buckets = num_buckets
+        self.table = [None] * num_buckets
+
+        # Two fresh hash functions, created one after the other
+        self.hashes = [TabulationHash(num_buckets) for _ in range(2)]
+
+    def lookup(self, key):
+        """Return True if the key is stored in one of its two buckets, else False."""
+
+        for hash_function in self.hashes:
+            stored = self.table[hash_function.hash(key)]
+            if stored is not None and stored == key:
+                return True
+        return False
+
+    def insert(self, key):
+        """Insert a new key to the table. Assumes that the key is not present yet."""
+
+        # TODO: Implement
+        raise NotImplementedError
diff --git a/09-cuckoo_hash/python/cuckoo_hash_test.py b/09-cuckoo_hash/python/cuckoo_hash_test.py
new file mode 100755
index 0000000..f9137c4
--- /dev/null
+++ b/09-cuckoo_hash/python/cuckoo_hash_test.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python3
+import sys
+import random
+
+from cuckoo_hash import CuckooTable
+
+def simple_test(n, table_size_percentage):
+    random.seed(42)  # fixed seed, set before the table is created
+    table = CuckooTable(n * table_size_percentage // 100)
+
+    # Insert the first n multiples of 37
+    for key in range(0, 37 * n, 37):
+        table.insert(key)
+
+    # Each inserted key must be found, its successor must not
+    for key in range(0, 37 * n, 37):
+        assert table.lookup(key), "Item not present in table, but it should be."
+        assert not table.lookup(key + 1), "Item present in table, even though it should not be."
+
+def multiple_test(min_n, max_n, step_n, table_size_percentage):
+    for size in range(min_n, max_n, step_n):  # one simple_test run per size
+        print("\tn=" + str(size))
+        simple_test(size, table_size_percentage)
+
+# A list of all tests as (name, callable) pairs; the main block looks them up by name
+tests = [
+    ("small",       lambda: simple_test(100, 400)),
+    ("middle",      lambda: simple_test(31415, 300)),
+    ("big",         lambda: simple_test(1000000, 300)),
+    ("tight",       lambda: multiple_test(20000, 40000, 500, 205)),
+]
+
+if __name__ == "__main__":
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:  # default: all tests
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:  # no registered test has the requested name
+            raise ValueError("Unknown test {}".format(required_test))
diff --git a/09-cuckoo_hash/task.md b/09-cuckoo_hash/task.md
new file mode 100644
index 0000000..e568f72
--- /dev/null
+++ b/09-cuckoo_hash/task.md
@@ -0,0 +1,9 @@
+Implement a Cuckoo hash table with simple tabulation hashing.
+
+You are given skeleton code which defines the table, implements
+`lookup()`, and provides hash functions. You have to add an `insert()`
+method.
+
+If too many elements are moved during a single insert, the table must
+be rehashed with new hash functions. See lecture notes for the particular
+bounds.
-- 
GitLab