From 4daa7ed48f9e25e3ff33540b71c7017cf45c663f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Ondr=C3=A1=C4=8Dek?= <ondracek.lukas@gmail.com> Date: Mon, 23 Nov 2020 17:35:53 +0100 Subject: [PATCH] Cuckoo hash --- 07-cuckoo_hash/cpp/Makefile | 13 +++ 07-cuckoo_hash/cpp/cuckoo_hash.h | 103 ++++++++++++++++++++++ 07-cuckoo_hash/cpp/cuckoo_hash_test.cpp | 35 ++++++++ 07-cuckoo_hash/cpp/random.h | 59 +++++++++++++ 07-cuckoo_hash/cpp/test_main.cpp | 43 +++++++++ 07-cuckoo_hash/python/cuckoo_hash.py | 56 ++++++++++++ 07-cuckoo_hash/python/cuckoo_hash_test.py | 41 +++++++++ 07-cuckoo_hash/task.md | 12 +++ 8 files changed, 362 insertions(+) create mode 100644 07-cuckoo_hash/cpp/Makefile create mode 100644 07-cuckoo_hash/cpp/cuckoo_hash.h create mode 100644 07-cuckoo_hash/cpp/cuckoo_hash_test.cpp create mode 100644 07-cuckoo_hash/cpp/random.h create mode 100644 07-cuckoo_hash/cpp/test_main.cpp create mode 100644 07-cuckoo_hash/python/cuckoo_hash.py create mode 100755 07-cuckoo_hash/python/cuckoo_hash_test.py create mode 100644 07-cuckoo_hash/task.md diff --git a/07-cuckoo_hash/cpp/Makefile b/07-cuckoo_hash/cpp/Makefile new file mode 100644 index 0000000..f32e87a --- /dev/null +++ b/07-cuckoo_hash/cpp/Makefile @@ -0,0 +1,13 @@ +test: cuckoo_hash_test + ./$< + +INCLUDE ?= . +CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE) + +cuckoo_hash_test: cuckoo_hash_test.cpp cuckoo_hash.h test_main.cpp $(INCLUDE)/random.h + $(CXX) $(CXXFLAGS) $^ -o $@ + +clean: + rm -f cuckoo_hash_test + +.PHONY: clean test diff --git a/07-cuckoo_hash/cpp/cuckoo_hash.h b/07-cuckoo_hash/cpp/cuckoo_hash.h new file mode 100644 index 0000000..32a5660 --- /dev/null +++ b/07-cuckoo_hash/cpp/cuckoo_hash.h @@ -0,0 +1,103 @@ +#include <string> +#include <vector> +#include <cstdint> +#include <iostream> + +#include "random.h" + +using namespace std; + +// If the condition is not true, report an error and halt. 
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0) + +void expect_failed(const string& message); + +class TabulationHash { + /* + * Hash function for hashing by tabulation. + * + * The 32-bit key is split into four 8-bit parts. Each part indexes + * a separate table of 256 randomly generated values. Obtained values + * are XORed together and the result is reduced modulo num_buckets. + */ + + unsigned num_buckets; + uint32_t tables[4][256]; + +public: + TabulationHash(unsigned num_buckets, RandomGen *random_gen) + { + this->num_buckets = num_buckets; + for (int i=0; i<4; i++) + for (int j=0; j<256; j++) + tables[i][j] = random_gen->next_u32(); + } + + uint32_t hash(uint32_t key) + { + unsigned h0 = key & 0xff; + unsigned h1 = (key >> 8) & 0xff; + unsigned h2 = (key >> 16) & 0xff; + unsigned h3 = (key >> 24) & 0xff; + return (tables[0][h0] ^ tables[1][h1] ^ tables[2][h2] ^ tables[3][h3]) % num_buckets; + } +}; + +class CuckooTable { + /* + * Hash table with Cuckoo hashing. + * + * We have two hash functions, which map 32-bit keys to buckets of a common + * hash table. Unused buckets contain 0xffffffff, so that value cannot be a key. + */ + + const uint32_t UNUSED = 0xffffffff; + + // The array of buckets (num_buckets entries, each holding a key or UNUSED) + vector<uint32_t> table; + unsigned num_buckets; + + // Hash functions and the random generator used to create them + TabulationHash *hashes[2]; + RandomGen *random_gen; + +public: + + CuckooTable(unsigned num_buckets) + { + // Initialize the table with the given number of buckets. + // The number of buckets is expected to stay constant. + + this->num_buckets = num_buckets; + table.resize(num_buckets, UNUSED); + + // Obtain two fresh hash functions from one generator seeded with 42. + random_gen = new RandomGen(42); + for (int i=0; i<2; i++) + hashes[i] = new TabulationHash(num_buckets, random_gen); + } + + ~CuckooTable() + { + for (int i=0; i<2; i++) + delete hashes[i]; + delete random_gen; + } + + bool lookup(uint32_t key) + { + // Check if the table contains the given key. Returns true or false. 
+ unsigned h0 = hashes[0]->hash(key); + unsigned h1 = hashes[1]->hash(key); + return (table[h0] == key || table[h1] == key); + } + + void insert(uint32_t key) + { + // Insert a new key into the table. Assumes that the key is not present yet. + EXPECT(key != UNUSED, "Keys must differ from UNUSED."); + + // TODO: Implement + } + +}; diff --git a/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp new file mode 100644 index 0000000..84ececb --- /dev/null +++ b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp @@ -0,0 +1,35 @@ +#include <functional> +#include <cstdlib> +#include <vector> + +#include "cuckoo_hash.h" + +void simple_test(unsigned n, unsigned table_size_percentage) +{ + CuckooTable table(n * table_size_percentage / 100); + + for (unsigned i=0; i < n; i++) + table.insert(37*i); + + for (unsigned i=0; i < n; i++) { + EXPECT(table.lookup(37*i), "Item not present in table, but it should be."); + EXPECT(!table.lookup(37*i+1), "Item present in table, even though it should not be."); + } +} + +void multiple_test(unsigned min_n, unsigned max_n, unsigned step_n, unsigned table_size_percentage) +{ + for (unsigned n=min_n; n < max_n; n += step_n) { + printf("\tn=%u\n", n); + simple_test(n, table_size_percentage); + } +} + +/*** A list of all tests, as (name, function) pairs ***/ + +vector<pair<string, function<void()>>> tests = { + { "small", [] { simple_test(100, 400); } }, + { "middle", [] { simple_test(31415, 300); } }, + { "big", [] { simple_test(1000000, 300); } }, + { "tight", [] { multiple_test(20000, 40000, 500, 205); } }, +}; diff --git a/07-cuckoo_hash/cpp/random.h b/07-cuckoo_hash/cpp/random.h new file mode 100644 index 0000000..7d18ab6 --- /dev/null +++ b/07-cuckoo_hash/cpp/random.h @@ -0,0 +1,59 @@ +#define DS1_RANDOM_H + +#include <cstdint> + +/* + * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman + * and Sebastiano Vigna, distributed under the CC-0 license. For more details, + * see http://vigna.di.unimi.it/xorshift/. 
+ * + * Rewritten to C++ by Martin Mares, also placed under CC-0. + */ + +class RandomGen { + uint64_t state[2]; + + uint64_t rotl(uint64_t x, int k) + { + return (x << k) | (x >> (64 - k)); + } + + public: + // Initialize the generator, set its seed and warm it up by discarding the first 100 outputs. + RandomGen(unsigned int seed) + { + state[0] = seed * 0xdeadbeef; + state[1] = seed ^ 0xc0de1234; + for (int i=0; i<100; i++) + next_u64(); + } + + // Generate a random 64-bit number and advance the generator state. + uint64_t next_u64(void) + { + uint64_t s0 = state[0], s1 = state[1]; + uint64_t result = s0 + s1; + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + return result; + } + + // Generate a random 32-bit number: the next 64-bit output shifted right by 11 bits and truncated. + uint32_t next_u32(void) + { + return next_u64() >> 11; + } + + // Generate a number between 0 and range-1 (range must be nonzero, the result is taken modulo range). + unsigned int next_range(unsigned int range) + { + /* + * This is not perfectly uniform, unless the range is a power of two. + * However, for 64-bit random values and 32-bit ranges, the bias is + * insignificant. 
+ */ + return next_u64() % range; + } +}; + diff --git a/07-cuckoo_hash/cpp/test_main.cpp b/07-cuckoo_hash/cpp/test_main.cpp new file mode 100644 index 0000000..3f4aff0 --- /dev/null +++ b/07-cuckoo_hash/cpp/test_main.cpp @@ -0,0 +1,43 @@ +#include <cstdlib> +#include <functional> +#include <iostream> +#include <string> +#include <utility> +#include <vector> + +using namespace std; + +extern vector<pair<string, function<void()>>> tests; + +void expect_failed(const string& message) { + cerr << "Test error: " << message << endl; + exit(1); +} + +int main(int argc, char* argv[]) { + vector<string> required_tests; + + if (argc > 1) { + required_tests.assign(argv + 1, argv + argc); + } else { + for (const auto& test : tests) + required_tests.push_back(test.first); + } + + for (const auto& required_test : required_tests) { + bool found = false; + for (const auto& test : tests) + if (required_test == test.first) { + cerr << "Running test " << required_test << endl; + test.second(); + found = true; + break; + } + if (!found) { + cerr << "Unknown test " << required_test << endl; + return 1; + } + } + + return 0; +} diff --git a/07-cuckoo_hash/python/cuckoo_hash.py b/07-cuckoo_hash/python/cuckoo_hash.py new file mode 100644 index 0000000..72a415c --- /dev/null +++ b/07-cuckoo_hash/python/cuckoo_hash.py @@ -0,0 +1,56 @@ +import random +import math + +class TabulationHash: + """Hash function for hashing by tabulation. + + The 32-bit key is split into four 8-bit parts. Each part indexes + a separate table of 256 randomly generated values. Obtained values + are XORed together and reduced modulo num_buckets. 
+ """ + + def __init__(self, num_buckets): + self.tables = [None] * 4 + for i in range(4): + self.tables[i] = [random.randint(0, 0xffffffff) for _ in range(256)] + self.num_buckets = num_buckets + + def hash(self, key): + h0 = key & 0xff + h1 = (key >> 8) & 0xff + h2 = (key >> 16) & 0xff + h3 = (key >> 24) & 0xff + t = self.tables + return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets + +class CuckooTable: + """Hash table with Cuckoo hashing. + + We have two hash functions, which map 32-bit keys to buckets of a common + hash table. Unused buckets contain None. + """ + + def __init__(self, num_buckets): + """Initialize the table with the given number of buckets. + The number of buckets is expected to stay constant.""" + + # The array of buckets (None marks an unused bucket) + self.num_buckets = num_buckets + self.table = [None] * num_buckets + + # Create two fresh hash functions (their tables come from the global random module) + self.hashes = [TabulationHash(num_buckets), TabulationHash(num_buckets)] + + def lookup(self, key): + """Check if the table contains the given key. Returns True or False.""" + + b0 = self.hashes[0].hash(key) + b1 = self.hashes[1].hash(key) + # print("## Lookup key={} b0={} b1={}".format(key, b0, b1)) + return self.table[b0] == key or self.table[b1] == key + + def insert(self, key): + """Insert a new key into the table. 
Assumes that the key is not present yet.""" + + # TODO: Implement + raise NotImplementedError diff --git a/07-cuckoo_hash/python/cuckoo_hash_test.py b/07-cuckoo_hash/python/cuckoo_hash_test.py new file mode 100755 index 0000000..f9137c4 --- /dev/null +++ b/07-cuckoo_hash/python/cuckoo_hash_test.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +import sys +import random + +from cuckoo_hash import CuckooTable + +def simple_test(n, table_size_percentage): + random.seed(42) + table = CuckooTable(n*table_size_percentage//100) + + # Insert an arithmetic progression of n keys (multiples of 37) + for i in range(n): + table.insert(37*i) + + # Verify contents of the table: inserted keys are found, their successors are not + for i in range(n): + assert table.lookup(37*i), "Item not present in table, but it should be." + assert not table.lookup(37*i+1), "Item present in table, even though it should not be." + +def multiple_test(min_n, max_n, step_n, table_size_percentage): + for n in range(min_n, max_n, step_n): + print("\tn={}".format(n)) + simple_test(n, table_size_percentage) + +# A list of all tests as (name, callable) pairs +tests = [ + ("small", lambda: simple_test(100, 400)), + ("middle", lambda: simple_test(31415, 300)), + ("big", lambda: simple_test(1000000, 300)), + ("tight", lambda: multiple_test(20000, 40000, 500, 205)), +] + +if __name__ == "__main__": + for required_test in sys.argv[1:] or [name for name, _ in tests]: + for name, test in tests: + if name == required_test: + print("Running test {}".format(name), file=sys.stderr) + test() + break + else: + raise ValueError("Unknown test {}".format(required_test)) diff --git a/07-cuckoo_hash/task.md b/07-cuckoo_hash/task.md new file mode 100644 index 0000000..dc71991 --- /dev/null +++ b/07-cuckoo_hash/task.md @@ -0,0 +1,12 @@ +Implement a Cuckoo hash table with simple tabulation hashing. + +You are given skeleton code which defines the table, implements +`lookup()`, and provides hash functions. You have to add an `insert()` +method. 
+ +If too many elements are moved during a single insert, the table must +be rehashed with new hash functions. See lecture notes for the particular +bounds. + +The size of the table should stay constant +throughout the existence of the data structure. -- GitLab