From 172652cff185b40da6dd8056d17f524efa24020e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Mare=C4=8Dek?= <marecek@ufal.mff.cuni.cz> Date: Tue, 23 Apr 2024 09:16:16 +0200 Subject: [PATCH] cuckoo hashing --- 08-cuckoo_hash/cpp/Makefile | 13 ++++ 08-cuckoo_hash/cpp/cuckoo_hash.h | 60 +++++++++++++++ 08-cuckoo_hash/cpp/cuckoo_hash_test.cpp | 72 ++++++++++++++++++ 08-cuckoo_hash/cpp/hash_functions.h | 92 +++++++++++++++++++++++ 08-cuckoo_hash/cpp/random.h | 59 +++++++++++++++ 08-cuckoo_hash/cpp/test_main.cpp | 43 +++++++++++ 08-cuckoo_hash/python/cuckoo_hash.py | 42 +++++++++++ 08-cuckoo_hash/python/cuckoo_hash_test.py | 71 +++++++++++++++++ 08-cuckoo_hash/python/hash_functions.py | 71 +++++++++++++++++ 08-cuckoo_hash/task.md | 14 ++++ 10 files changed, 537 insertions(+) create mode 100644 08-cuckoo_hash/cpp/Makefile create mode 100644 08-cuckoo_hash/cpp/cuckoo_hash.h create mode 100644 08-cuckoo_hash/cpp/cuckoo_hash_test.cpp create mode 100644 08-cuckoo_hash/cpp/hash_functions.h create mode 100644 08-cuckoo_hash/cpp/random.h create mode 100644 08-cuckoo_hash/cpp/test_main.cpp create mode 100644 08-cuckoo_hash/python/cuckoo_hash.py create mode 100755 08-cuckoo_hash/python/cuckoo_hash_test.py create mode 100644 08-cuckoo_hash/python/hash_functions.py create mode 100644 08-cuckoo_hash/task.md diff --git a/08-cuckoo_hash/cpp/Makefile b/08-cuckoo_hash/cpp/Makefile new file mode 100644 index 0000000..de5d48e --- /dev/null +++ b/08-cuckoo_hash/cpp/Makefile @@ -0,0 +1,13 @@ +test: cuckoo_hash_test + ./$< + +INCLUDE ?= . 
+CXXFLAGS=-std=c++20 -O2 -Wall -Wextra -g -Wno-sign-compare -Wno-array-bounds -I$(INCLUDE) + +cuckoo_hash_test: cuckoo_hash_test.cpp test_main.cpp cuckoo_hash.h hash_functions.h random.h + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ + +clean: + rm -f cuckoo_hash_test + +.PHONY: clean test diff --git a/08-cuckoo_hash/cpp/cuckoo_hash.h b/08-cuckoo_hash/cpp/cuckoo_hash.h new file mode 100644 index 0000000..94df4be --- /dev/null +++ b/08-cuckoo_hash/cpp/cuckoo_hash.h @@ -0,0 +1,60 @@ +#include <vector> +#include <cstdint> +#include <iostream> + +const uint32_t UNUSED = 0xffffffff; + +template<class Hash> +class CuckooTable { + /* + * Hash table with Cuckoo hashing. + * + * We have two hash functions, which map 32-bit keys to buckets of a common + * hash table. Unused buckets contain 0xffffffff. + */ + + // The array of buckets + vector<uint32_t> table; + uint32_t num_buckets; + + // The two hash functions, held by reference (the array is passed in by the caller) + array<Hash,2> &hashes; + +public: + + CuckooTable(uint32_t num_buckets, array<Hash,2> &hashes) : num_buckets{num_buckets}, hashes{hashes} + { + // Initialize the table with the given number of buckets. + // The number of buckets is expected to stay constant. + + table.resize(num_buckets, UNUSED); + + } + + const vector<uint32_t>& get_table() const { + return table; + } + + bool lookup(uint32_t key) const { + // Check if the table contains the given key. Returns true or false. + uint32_t h0 = hashes[0].hash(key); + uint32_t h1 = hashes[1].hash(key); + return (table[h0] == key || table[h1] == key); + } + + void insert(uint32_t key) { + // Insert a new key into the table. Assumes that the key is not present yet. + EXPECT(key != UNUSED, "Keys must differ from UNUSED."); + + // TODO: Implement + } + + uint32_t rehash(uint32_t key) { + // Relocate all items using new hash functions and insert a given key. 
+ for (int i=0; i<2; i++) + hashes[i].regenerate(); + + // TODO: Implement + return key; + } +}; diff --git a/08-cuckoo_hash/cpp/cuckoo_hash_test.cpp b/08-cuckoo_hash/cpp/cuckoo_hash_test.cpp new file mode 100644 index 0000000..2778181 --- /dev/null +++ b/08-cuckoo_hash/cpp/cuckoo_hash_test.cpp @@ -0,0 +1,72 @@ +#include <functional> +#include <cstdlib> +#include <vector> +#include <string> +#include <array> +#include <iostream> + +#include "hash_functions.h" +#include "cuckoo_hash.h" + +template<class Hash> +void inspect_table(const CuckooTable<Hash> &cuckoo, const array<Hash,2> &hashes, uint32_t n, uint32_t table_size, uint32_t step) { + const vector<uint32_t> &table = cuckoo.get_table(); + EXPECT(table.size() == table_size, "The size of table is given and it is expected not to be changed."); + for (uint32_t i = 0; i < n; i++) { + uint32_t k = step*i; + uint32_t h0 = hashes[0].hash(k), h1 = hashes[1].hash(k);; + EXPECT(table[h0] == k || table[h1] == k, "Item should be stored on one of two positions given by hash functions."); + EXPECT(h0 == h1 || table[h0] != k || table[h1] != k, "Item should be stored only on one position."); + } + for (uint32_t t = 0; t < table_size; t++) { + uint32_t k = table[t]; + if (k != UNUSED) { + EXPECT(k % step == 0 && k < step * n, "Only inserted items should be stored."); + EXPECT(hashes[0].hash(k) == t || hashes[1].hash(k) == t, "Item should be stored on one of two positions given by hash functions."); + } + } +} + +void simple_test(uint32_t n, uint32_t table_size_percentage) { + const uint32_t table_size = n * table_size_percentage / 100; + RandomGen random_gen(42); + array<TabulationHash,2> hashes{TabulationHash(table_size, random_gen), TabulationHash(table_size, random_gen)}; + CuckooTable cuckoo(table_size, hashes); + + for (uint32_t i=0; i < n; i++) + cuckoo.insert(37*i); + + for (uint32_t i=0; i < n; i++) { + EXPECT(cuckoo.lookup(37*i), "Item not present in table, but it should be."); + EXPECT(!cuckoo.lookup(37*i+1), "Item 
present in table, even though it should not be."); + } + + inspect_table(cuckoo, hashes, n, table_size, 37); +} + +void multiple_test(uint32_t min_n, uint32_t max_n, uint32_t step_n, uint32_t table_size_percentage) { + for (uint32_t n = min_n; n < max_n; n += step_n) { + printf("\tn=%u\n", n); + simple_test(n, table_size_percentage); + } +} + +void fixed_test() { + const uint32_t table_size = FixedHash::table_size; + array<FixedHash,2> hashes{FixedHash(0), FixedHash(1)}; + CuckooTable cuckoo(table_size, hashes); + for (uint32_t k = 0; k < FixedHash::keys; k++) { + cuckoo.insert(k); + } + inspect_table(cuckoo, hashes, FixedHash::keys, table_size, 1); +} + +/*** A list of all tests ***/ + +vector<pair<string, function<void()>>> tests = { + { "small", [] { simple_test(100, 400); } }, + { "middle", [] { simple_test(31415, 300); } }, + { "big", [] { simple_test(1000000, 300); } }, + { "tight", [] { multiple_test(20000, 40000, 500, 205); } }, + { "fixed", fixed_test } +}; diff --git a/08-cuckoo_hash/cpp/hash_functions.h b/08-cuckoo_hash/cpp/hash_functions.h new file mode 100644 index 0000000..08995d1 --- /dev/null +++ b/08-cuckoo_hash/cpp/hash_functions.h @@ -0,0 +1,92 @@ +#include <cstdlib> +#include <string> + +#include "random.h" + +using namespace std; + +// If the condition is not true, report an error and halt. +#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0) + +void expect_failed(const string& message); + +class TabulationHash { + /* + * Hash function for hashing by tabulation. + * + * The 32-bit key is split to four 8-bit parts. Each part indexes + * a separate table of 256 randomly generated values. Obtained values + * are XORed together. 
+ */ + + size_t num_buckets; + RandomGen &random_gen; + uint32_t tables[4][256]; + +public: + TabulationHash(size_t num_buckets, RandomGen &random_gen) : num_buckets(num_buckets), random_gen(random_gen) { + regenerate(); + } + + void regenerate() { + for (int i=0; i<4; i++) + for (int j=0; j<256; j++) + tables[i][j] = random_gen.next_u32(); + } + + uint32_t hash(uint32_t key) const { + uint32_t h0 = key & 0xff; + uint32_t h1 = (key >> 8) & 0xff; + uint32_t h2 = (key >> 16) & 0xff; + uint32_t h3 = (key >> 24) & 0xff; + return (tables[0][h0] ^ tables[1][h1] ^ tables[2][h2] ^ tables[3][h3]) % num_buckets; + } +}; + +class FixedHash { +public: + static constexpr uint32_t keys = 5, max_regenerations = 6, table_size = 16; + +private: + static constexpr uint32_t hashes[max_regenerations][2][keys] { + { // Two items hashed into the same bucket by both functions + { 1, 7, 3, 7, 10 }, + { 2, 7, 4, 7, 11 } + }, + { // Three items stored in two positions + { 1, 7, 3, 8, 7 }, + { 2, 8, 4, 7, 8 } + }, + { // Four items stored in three positions + { 1, 7, 7, 8, 9 }, + { 2, 8, 9, 7, 8 } + }, + { // Five should be possible to store in five positions, but the cuckoo's insert operation may not find the proper locations + { 1, 2, 3, 4, 5 }, + { 2, 3, 4, 5, 1 } + }, + { // Five should be possible to store in six positions, the timeout in the insert may not be sufficient + { 1, 2, 3, 4, 5 }, + { 2, 3, 4, 5, 6 } + }, + { // This should be easy + { 8, 7, 7, 8, 12 }, + { 11, 6, 7, 9, 0 } + } + }; + + size_t regenerations, id; + +public: + FixedHash(size_t id) : regenerations{0}, id{id} {} + + uint32_t hash(uint32_t key) const { + EXPECT(key < keys, "Invalid key"); + return hashes[regenerations][id][key]; + } + + void regenerate() { + regenerations++; + EXPECT(regenerations < max_regenerations, "Too many rehashes"); + } +}; diff --git a/08-cuckoo_hash/cpp/random.h b/08-cuckoo_hash/cpp/random.h new file mode 100644 index 0000000..7d18ab6 --- /dev/null +++ b/08-cuckoo_hash/cpp/random.h @@ 
-0,0 +1,59 @@ +#define DS1_RANDOM_H + +#include <cstdint> + +/* + * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman + * and Sebastiano Vigna, distributed under the CC-0 license. For more details, + * see http://vigna.di.unimi.it/xorshift/. + * + * Rewritten to C++ by Martin Mares, also placed under CC-0. + */ + +class RandomGen { + uint64_t state[2]; + + uint64_t rotl(uint64_t x, int k) + { + return (x << k) | (x >> (64 - k)); + } + + public: + // Initialize the generator, set its seed and warm it up. + RandomGen(unsigned int seed) + { + state[0] = seed * 0xdeadbeef; + state[1] = seed ^ 0xc0de1234; + for (int i=0; i<100; i++) + next_u64(); + } + + // Generate a random 64-bit number. + uint64_t next_u64(void) + { + uint64_t s0 = state[0], s1 = state[1]; + uint64_t result = s0 + s1; + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + return result; + } + + // Generate a random 32-bit number. + uint32_t next_u32(void) + { + return next_u64() >> 11; + } + + // Generate a number between 0 and range-1. + unsigned int next_range(unsigned int range) + { + /* + * This is not perfectly uniform, unless the range is a power of two. + * However, for 64-bit random values and 32-bit ranges, the bias is + * insignificant. 
+ */ + return next_u64() % range; + } +}; + diff --git a/08-cuckoo_hash/cpp/test_main.cpp b/08-cuckoo_hash/cpp/test_main.cpp new file mode 100644 index 0000000..3f4aff0 --- /dev/null +++ b/08-cuckoo_hash/cpp/test_main.cpp @@ -0,0 +1,43 @@ +#include <cstdlib> +#include <functional> +#include <iostream> +#include <string> +#include <utility> +#include <vector> + +using namespace std; + +extern vector<pair<string, function<void()>>> tests; + +void expect_failed(const string& message) { + cerr << "Test error: " << message << endl; + exit(1); +} + +int main(int argc, char* argv[]) { + vector<string> required_tests; + + if (argc > 1) { + required_tests.assign(argv + 1, argv + argc); + } else { + for (const auto& test : tests) + required_tests.push_back(test.first); + } + + for (const auto& required_test : required_tests) { + bool found = false; + for (const auto& test : tests) + if (required_test == test.first) { + cerr << "Running test " << required_test << endl; + test.second(); + found = true; + break; + } + if (!found) { + cerr << "Unknown test " << required_test << endl; + return 1; + } + } + + return 0; +} diff --git a/08-cuckoo_hash/python/cuckoo_hash.py b/08-cuckoo_hash/python/cuckoo_hash.py new file mode 100644 index 0000000..8af26b1 --- /dev/null +++ b/08-cuckoo_hash/python/cuckoo_hash.py @@ -0,0 +1,42 @@ +import math + +class CuckooTable: + """Hash table with Cuckoo hashing. + + We have two hash functions, which map 32-bit keys to buckets of a common + hash table. Unused buckets contain None. + """ + + def __init__(self, num_buckets, hashes): + """Initialize the table with the given number of buckets. + The number of buckets is expected to stay constant.""" + + # The array of buckets + self.num_buckets = num_buckets + self.table = [None] * num_buckets + self.hashes = hashes + + def get_table(self): + return self.table + + def lookup(self, key): + """Check if the table contains the given key. 
Returns True or False.""" + + b0 = self.hashes[0].hash(key) + b1 = self.hashes[1].hash(key) + # print("## Lookup key={} b0={} b1={}".format(key, b0, b1)) + return self.table[b0] == key or self.table[b1] == key + + def insert(self, key): + """Insert a new key to the table. Assumes that the key is not present yet.""" + + # TODO: Implement + raise NotImplementedError + + def rehash(self, key): + """ Relocate all items using new hash functions and insert a given key. """ + # Obtain new hash functions + for i in range(2): + self.hashes[i].regenerate() + + # TODO: Implement diff --git a/08-cuckoo_hash/python/cuckoo_hash_test.py b/08-cuckoo_hash/python/cuckoo_hash_test.py new file mode 100755 index 0000000..dfbfd4d --- /dev/null +++ b/08-cuckoo_hash/python/cuckoo_hash_test.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import sys +import random + +from cuckoo_hash import CuckooTable +from hash_functions import TabulationHash, FixedHash + +def inspect_table(cuckoo, hashes, n, table_size, step): + table = cuckoo.get_table() + assert len(table) == table_size, "The size of table is given and it is expected not to be changed." + for i in range(n): + k = step*i + h0 = hashes[0].hash(k) + h1 = hashes[1].hash(k) + assert table[h0] == k or table[h1] == k, "Item should be stored on one of two positions given by hash functions." + assert h0 == h1 or table[h0] != k or table[h1] != k, "Item should be stored only on one position." + + for t in range(table_size): + k = table[t] + if k is not None: + assert k % step == 0 and k < step * n, "Only inserted items should be stored." + assert hashes[0].hash(k) == t or hashes[1].hash(k) == t, "Item should be stored on one of two positions given by hash functions." 
+ +def simple_test(n, table_size_percentage): + random.seed(42) + num_buckets = n*table_size_percentage//100 + hashes = [TabulationHash(num_buckets), TabulationHash(num_buckets)] + table = CuckooTable(num_buckets, hashes) + + # Insert an arithmetic progression + for i in range(n): + table.insert(37*i) + + # Verify contents of the table + for i in range(n): + assert table.lookup(37*i), "Item not present in table, but it should be." + assert not table.lookup(37*i+1), "Item present in table, even though it should not be." + + inspect_table(table, hashes, n, num_buckets, 37) + +def multiple_test(min_n, max_n, step_n, table_size_percentage): + for n in range(min_n, max_n, step_n): + print("\tn={}".format(n)) + simple_test(n, table_size_percentage) + +def fixed_test(): + table_size = FixedHash.table_size + hashes = [FixedHash(0), FixedHash(1) ] + cuckoo = CuckooTable(table_size, hashes) + for k in range(FixedHash.keys): + cuckoo.insert(k) + inspect_table(cuckoo, hashes, FixedHash.keys, table_size, 1) + +# A list of all tests +tests = [ + ("small", lambda: simple_test(100, 400)), + ("middle", lambda: simple_test(31415, 300)), + ("big", lambda: simple_test(1000000, 300)), + ("tight", lambda: multiple_test(20000, 40000, 500, 205)), + ("fixed", fixed_test) +] + +if __name__ == "__main__": + for required_test in sys.argv[1:] or [name for name, _ in tests]: + for name, test in tests: + if name == required_test: + print("Running test {}".format(name), file=sys.stderr) + test() + break + else: + raise ValueError("Unknown test {}".format(required_test)) diff --git a/08-cuckoo_hash/python/hash_functions.py b/08-cuckoo_hash/python/hash_functions.py new file mode 100644 index 0000000..a1abbc6 --- /dev/null +++ b/08-cuckoo_hash/python/hash_functions.py @@ -0,0 +1,71 @@ +import random + +class TabulationHash: + """Hash function for hashing by tabulation. + + The 32-bit key is split to four 8-bit parts. Each part indexes + a separate table of 256 randomly generated values. 
Obtained values + are XORed together. + """ + + def __init__(self, num_buckets): + self.tables = [None] * 4 + self.num_buckets = num_buckets + self.regenerate() + + def regenerate(self): + for i in range(4): + self.tables[i] = [random.randint(0, 0xffffffff) for _ in range(256)] + + def hash(self, key): + h0 = key & 0xff + h1 = (key >> 8) & 0xff + h2 = (key >> 16) & 0xff + h3 = (key >> 24) & 0xff + t = self.tables + return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets + + +class FixedHash: + keys = 5 + max_regenerations = 6 + table_size = 16 + + hashes = [ + [ # Two items hashed into the same bucket by both functions + [ 1, 7, 3, 7, 10 ], + [ 2, 7, 4, 7, 11 ] + ], + [ # Three items stored in two positions + [ 1, 7, 3, 8, 7 ], + [ 2, 8, 4, 7, 8 ] + ], + [ # Four items stored in three positions + [ 1, 7, 7, 8, 9 ], + [ 2, 8, 9, 7, 8 ] + ], + [ # Five should be possible to store in five positions, but the cuckoo's insert operation may not find the proper locations + [ 1, 2, 3, 4, 5 ], + [ 2, 3, 4, 5, 1 ] + ], + [ # Five should be possible to store in six positions, the timeout in the insert may not be sufficient + [ 1, 2, 3, 4, 5 ], + [ 2, 3, 4, 5, 6 ] + ], + [ # This should be easy + [ 8, 7, 7, 8, 12 ], + [ 11, 6, 7, 9, 0 ] + ] + ] + + def __init__(self, id): + self.id = id + self.regenerations = 0 + + def regenerate(self): + self.regenerations += 1 + assert self.regenerations < self.max_regenerations, "Too many rehashes" + + def hash(self, key): + assert 0 <= key and key < self.keys, "Invalid key" + return self.hashes[self.regenerations][self.id][key] diff --git a/08-cuckoo_hash/task.md b/08-cuckoo_hash/task.md new file mode 100644 index 0000000..87f378c --- /dev/null +++ b/08-cuckoo_hash/task.md @@ -0,0 +1,14 @@ +Implement Cuckoo hash table with simple tabulation hashing. + +You are given a skeleton code which defines the table, implements +`lookup()`, and provides hash functions. You have to add an `insert()` +method. 
+ +If too many elements are moved during a single insert, the table must +be rehashed with new hash functions. See the lecture notes for the particular +bounds. + +The size of the table should stay constant +throughout the existence of the data structure. + +Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master). \ No newline at end of file -- GitLab