diff --git a/07-cuckoo_hash/cpp/Makefile b/07-cuckoo_hash/cpp/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f32e87ad710a520cec3a5f5a211462fcb0b97fa1 --- /dev/null +++ b/07-cuckoo_hash/cpp/Makefile @@ -0,0 +1,13 @@ +test: cuckoo_hash_test + ./$< + +INCLUDE ?= . +CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE) + +cuckoo_hash_test: cuckoo_hash_test.cpp cuckoo_hash.h test_main.cpp $(INCLUDE)/random.h + $(CXX) $(CXXFLAGS) $^ -o $@ + +clean: + rm -f cuckoo_hash_test + +.PHONY: clean test diff --git a/07-cuckoo_hash/cpp/cuckoo_hash.h b/07-cuckoo_hash/cpp/cuckoo_hash.h new file mode 100644 index 0000000000000000000000000000000000000000..32a566040c729e2b52fe2e1208d2b4455a9d094c --- /dev/null +++ b/07-cuckoo_hash/cpp/cuckoo_hash.h @@ -0,0 +1,103 @@ +#include <string> +#include <vector> +#include <cstdint> +#include <iostream> + +#include "random.h" + +using namespace std; + +// If the condition is not true, report an error and halt. +#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0) + +void expect_failed(const string& message); + +class TabulationHash { + /* + * Hash function for hashing by tabulation. + * + * The 32-bit key is split to four 8-bit parts. Each part indexes + * a separate table of 256 randomly generated values. Obtained values + * are XORed together. + */ + + unsigned num_buckets; + uint32_t tables[4][256]; + +public: + TabulationHash(unsigned num_buckets, RandomGen *random_gen) + { + this->num_buckets = num_buckets; + for (int i=0; i<4; i++) + for (int j=0; j<256; j++) + tables[i][j] = random_gen->next_u32(); + } + + uint32_t hash(uint32_t key) + { + unsigned h0 = key & 0xff; + unsigned h1 = (key >> 8) & 0xff; + unsigned h2 = (key >> 16) & 0xff; + unsigned h3 = (key >> 24) & 0xff; + return (tables[0][h0] ^ tables[1][h1] ^ tables[2][h2] ^ tables[3][h3]) % num_buckets; + } +}; + +class CuckooTable { + /* + * Hash table with Cuckoo hashing. 
+ * + * We have two hash functions, which map 32-bit keys to buckets of a common + * hash table. Unused buckets contain 0xffffffff. + */ + + const uint32_t UNUSED = 0xffffffff; + + // The array of buckets + vector<uint32_t> table; + unsigned num_buckets; + + // Hash functions and the random generator used to create them + TabulationHash *hashes[2]; + RandomGen *random_gen; + +public: + + CuckooTable(unsigned num_buckets) + { + // Initialize the table with the given number of buckets. + // The number of buckets is expected to stay constant. + + this->num_buckets = num_buckets; + table.resize(num_buckets, UNUSED); + + // Obtain two fresh hash functions. + random_gen = new RandomGen(42); + for (int i=0; i<2; i++) + hashes[i] = new TabulationHash(num_buckets, random_gen); + } + + ~CuckooTable() + { + for (int i=0; i<2; i++) + delete hashes[i]; + delete random_gen; + } + + bool lookup(uint32_t key) + { + // Check if the table contains the given key. Returns True or False. + unsigned h0 = hashes[0]->hash(key); + unsigned h1 = hashes[1]->hash(key); + return (table[h0] == key || table[h1] == key); + } + + void insert(uint32_t key) + { + // Insert a new key to the table. Assumes that the key is not present yet. 
+ EXPECT(key != UNUSED, "Keys must differ from UNUSED."); + + // TODO: Implement + } + +}; diff --git a/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp new file mode 100644 index 0000000000000000000000000000000000000000..84ececb2de628938f855590f0f0b06fa60f73957 --- /dev/null +++ b/07-cuckoo_hash/cpp/cuckoo_hash_test.cpp @@ -0,0 +1,35 @@ +#include <functional> +#include <cstdlib> +#include <vector> + +#include "cuckoo_hash.h" + +void simple_test(unsigned n, unsigned table_size_percentage) +{ + CuckooTable table(n * table_size_percentage / 100); + + for (unsigned i=0; i < n; i++) + table.insert(37*i); + + for (unsigned i=0; i < n; i++) { + EXPECT(table.lookup(37*i), "Item not present in table, but it should be."); + EXPECT(!table.lookup(37*i+1), "Item present in table, even though it should not be."); + } +} + +void multiple_test(unsigned min_n, unsigned max_n, unsigned step_n, unsigned table_size_percentage) +{ + for (unsigned n=min_n; n < max_n; n += step_n) { + printf("\tn=%u\n", n); + simple_test(n, table_size_percentage); + } +} + +/*** A list of all tests ***/ + +vector<pair<string, function<void()>>> tests = { + { "small", [] { simple_test(100, 400); } }, + { "middle", [] { simple_test(31415, 300); } }, + { "big", [] { simple_test(1000000, 300); } }, + { "tight", [] { multiple_test(20000, 40000, 500, 205); } }, +}; diff --git a/07-cuckoo_hash/cpp/random.h b/07-cuckoo_hash/cpp/random.h new file mode 100644 index 0000000000000000000000000000000000000000..7d18ab60dfd6302a9261fc034f28e91d37eca78b --- /dev/null +++ b/07-cuckoo_hash/cpp/random.h @@ -0,0 +1,59 @@ +#define DS1_RANDOM_H + +#include <cstdint> + +/* + * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman + * and Sebastiano Vigna, distributed under the CC-0 license. For more details, + * see http://vigna.di.unimi.it/xorshift/. + * + * Rewritten to C++ by Martin Mares, also placed under CC-0. 
+ */ + +class RandomGen { + uint64_t state[2]; + + uint64_t rotl(uint64_t x, int k) + { + return (x << k) | (x >> (64 - k)); + } + + public: + // Initialize the generator, set its seed and warm it up. + RandomGen(unsigned int seed) + { + state[0] = seed * 0xdeadbeef; + state[1] = seed ^ 0xc0de1234; + for (int i=0; i<100; i++) + next_u64(); + } + + // Generate a random 64-bit number. + uint64_t next_u64(void) + { + uint64_t s0 = state[0], s1 = state[1]; + uint64_t result = s0 + s1; + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + return result; + } + + // Generate a random 32-bit number. + uint32_t next_u32(void) + { + return next_u64() >> 11; + } + + // Generate a number between 0 and range-1. + unsigned int next_range(unsigned int range) + { + /* + * This is not perfectly uniform, unless the range is a power of two. + * However, for 64-bit random values and 32-bit ranges, the bias is + * insignificant. + */ + return next_u64() % range; + } +}; + diff --git a/07-cuckoo_hash/cpp/test_main.cpp b/07-cuckoo_hash/cpp/test_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3f4aff0785f636b7fd0ea1a15aa69dafe06f290f --- /dev/null +++ b/07-cuckoo_hash/cpp/test_main.cpp @@ -0,0 +1,43 @@ +#include <cstdlib> +#include <functional> +#include <iostream> +#include <string> +#include <utility> +#include <vector> + +using namespace std; + +extern vector<pair<string, function<void()>>> tests; + +void expect_failed(const string& message) { + cerr << "Test error: " << message << endl; + exit(1); +} + +int main(int argc, char* argv[]) { + vector<string> required_tests; + + if (argc > 1) { + required_tests.assign(argv + 1, argv + argc); + } else { + for (const auto& test : tests) + required_tests.push_back(test.first); + } + + for (const auto& required_test : required_tests) { + bool found = false; + for (const auto& test : tests) + if (required_test == test.first) { + cerr << "Running test " << required_test << endl; + 
test.second(); + found = true; + break; + } + if (!found) { + cerr << "Unknown test " << required_test << endl; + return 1; + } + } + + return 0; +} diff --git a/07-cuckoo_hash/python/cuckoo_hash.py b/07-cuckoo_hash/python/cuckoo_hash.py new file mode 100644 index 0000000000000000000000000000000000000000..72a415ce0f8a0f6b643536a2a0c7e0861eb1026d --- /dev/null +++ b/07-cuckoo_hash/python/cuckoo_hash.py @@ -0,0 +1,56 @@ +import random +import math + +class TabulationHash: + """Hash function for hashing by tabulation. + + The 32-bit key is split to four 8-bit parts. Each part indexes + a separate table of 256 randomly generated values. Obtained values + are XORed together. + """ + + def __init__(self, num_buckets): + self.tables = [None] * 4 + for i in range(4): + self.tables[i] = [random.randint(0, 0xffffffff) for _ in range(256)] + self.num_buckets = num_buckets + + def hash(self, key): + h0 = key & 0xff + h1 = (key >> 8) & 0xff + h2 = (key >> 16) & 0xff + h3 = (key >> 24) & 0xff + t = self.tables + return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets + +class CuckooTable: + """Hash table with Cuckoo hashing. + + We have two hash functions, which map 32-bit keys to buckets of a common + hash table. Unused buckets contain None. + """ + + def __init__(self, num_buckets): + """Initialize the table with the given number of buckets. + The number of buckets is expected to stay constant.""" + + # The array of buckets + self.num_buckets = num_buckets + self.table = [None] * num_buckets + + # Create two fresh hash functions + self.hashes = [TabulationHash(num_buckets), TabulationHash(num_buckets)] + + def lookup(self, key): + """Check if the table contains the given key. Returns True or False.""" + + b0 = self.hashes[0].hash(key) + b1 = self.hashes[1].hash(key) + # print("## Lookup key={} b0={} b1={}".format(key, b0, b1)) + return self.table[b0] == key or self.table[b1] == key + + def insert(self, key): + """Insert a new key to the table. 
Assumes that the key is not present yet.""" + + # TODO: Implement + raise NotImplementedError diff --git a/07-cuckoo_hash/python/cuckoo_hash_test.py b/07-cuckoo_hash/python/cuckoo_hash_test.py new file mode 100755 index 0000000000000000000000000000000000000000..f9137c45d9a5cfd8e3f0f526626961dbffe78c30 --- /dev/null +++ b/07-cuckoo_hash/python/cuckoo_hash_test.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 +import sys +import random + +from cuckoo_hash import CuckooTable + +def simple_test(n, table_size_percentage): + random.seed(42) + table = CuckooTable(n*table_size_percentage//100) + + # Insert an arithmetic progression + for i in range(n): + table.insert(37*i) + + # Verify contents of the table + for i in range(n): + assert table.lookup(37*i), "Item not present in table, but it should be." + assert not table.lookup(37*i+1), "Item present in table, even though it should not be." + +def multiple_test(min_n, max_n, step_n, table_size_percentage): + for n in range(min_n, max_n, step_n): + print("\tn={}".format(n)) + simple_test(n, table_size_percentage) + +# A list of all tests +tests = [ + ("small", lambda: simple_test(100, 400)), + ("middle", lambda: simple_test(31415, 300)), + ("big", lambda: simple_test(1000000, 300)), + ("tight", lambda: multiple_test(20000, 40000, 500, 205)), +] + +if __name__ == "__main__": + for required_test in sys.argv[1:] or [name for name, _ in tests]: + for name, test in tests: + if name == required_test: + print("Running test {}".format(name), file=sys.stderr) + test() + break + else: + raise ValueError("Unknown test {}".format(name)) diff --git a/07-cuckoo_hash/task.md b/07-cuckoo_hash/task.md new file mode 100644 index 0000000000000000000000000000000000000000..dc71991be8aea6ee26f8b673dc0813af72bc913d --- /dev/null +++ b/07-cuckoo_hash/task.md @@ -0,0 +1,12 @@ +Implement Cuckoo hash table with simple tabulation hashing. + +You are given a skeleton code which defines the table, implements +`lookup()`, and provides hash functions. 
You have to add an `insert()` +method. + +If too many elements are moved during a single insert, the table must +be rehashed with new hash functions. See lecture notes for the particular +bounds. + +The size of the table should stay constant +throughout the existence of the data structure. diff --git a/08-hash_experiment/cpp/Makefile b/08-hash_experiment/cpp/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8745327329aeec9e1e0d2bbce44df5e3e717b4be --- /dev/null +++ b/08-hash_experiment/cpp/Makefile @@ -0,0 +1,20 @@ +INCLUDE ?= . +CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE) +STUDENT_ID ?= PLEASE_SET_STUDENT_ID + +HASHFUNCS=ms-low ms-high poly-1 poly-2 tab + +.PHONY: test +test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS)) + +out/t-%: hash_experiment + @mkdir -p out + ./hash_experiment $* $(STUDENT_ID) >$@ + +hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h + $(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@ + +.PHONY: clean +clean: + rm -f hash_experiment + rm -rf out diff --git a/08-hash_experiment/cpp/hash_experiment.cpp b/08-hash_experiment/cpp/hash_experiment.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7eac12f5a6eae0dbb753ebfa6ee71dd2804feb54 --- /dev/null +++ b/08-hash_experiment/cpp/hash_experiment.cpp @@ -0,0 +1,314 @@ +#include <vector> +#include <functional> +#include <algorithm> +#include <utility> +#include <stdexcept> +#include <stdio.h> +#include <stdint.h> +#include <math.h> +#include "random.h" + +using namespace std; + +RandomGen rng(42); + +typedef uint32_t uint; + +typedef function<uint(uint)> HashFunction; +typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory; + +/* + * Hash function for hashing by tabulation. + * + * The 32-bit key is split to four 8-bit parts. Each part indexes + * a separate table of 256 randomly generated values. Obtained values + * are XORed together. 
+ */ +class TabulationHash { + unsigned num_buckets; + vector<uint> tables; + + TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) { + for (uint& x : tables) x = rng.next_u32(); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(TabulationHash(num_buckets)); + } + + uint operator()(uint key) { + return ( + tables[key & 0xff] ^ + tables[((key >> 8) & 0xff) | 0x100] ^ + tables[((key >> 16) & 0xff) | 0x200] ^ + tables[((key >> 24) & 0xff) | 0x300] + ) % num_buckets; + } +}; + +// Hash function using polynomial modulo a prime. +template < int degree, uint prime = 2147483647 > +class PolynomialHash { + unsigned num_buckets; + vector<uint> coefs; + + PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) { + for (uint& x : coefs) x = rng.next_u32(); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(PolynomialHash(num_buckets)); + } + + uint operator()(uint key) { + uint64_t acc = 0; + for (uint c : coefs) acc = (acc * key + c) % prime; + return (uint)(acc % num_buckets); + } +}; + +typedef PolynomialHash<1> LinearHash; +typedef PolynomialHash<2> QuadraticHash; + +// Multiply-shift hash function taking top bits of 32-bit word +class MultiplyShiftLowHash { + uint mult; + uint mask; + int shift = 0; + + MultiplyShiftLowHash(unsigned num_buckets) { + mult = rng.next_u32() | 0x1; + mask = num_buckets - 1; + + if (mask & num_buckets) + throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2"); + + unsigned tmp = num_buckets - 1; + while ((0x80000000U & tmp) == 0) { + tmp <<= 1; + shift++; + } + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(MultiplyShiftLowHash(num_buckets)); + } + + uint operator()(uint key) { + return ((key * mult) >> shift) & mask; + } +}; + +// Multiply-shift hash function taking low bits of upper half of 64-bit word +class MultiplyShiftHighHash { + uint 
mask; + uint64_t mult; + + MultiplyShiftHighHash(unsigned num_buckets) { + mult = rng.next_u64() | 0x1; + mask = num_buckets - 1; + + if (mask & num_buckets) + throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2"); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(MultiplyShiftHighHash(num_buckets)); + } + + uint operator()(uint key) { + return ((key * mult) >> 32) & mask; + } +}; + + +// Hash table with linear probing +class HashTable { + HashFunction hash; + vector<uint> table; + unsigned size = 0; + + unsigned ops; + unsigned max_; + uint64_t steps; + + public: + // We reserve one integer to mark unused buckets. This integer + // cannot be stored in the table. + static constexpr uint UNUSED = ~((uint)0); + + HashTable(const HashFunctionFactory& factory, unsigned num_buckets) : + hash(factory(num_buckets)), table(num_buckets, +UNUSED) { + reset_counter(); + } + + // Check whether key is present in the table. + bool lookup(uint key) { + if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED"); + + bool ret = false; + unsigned steps = 1; + + uint b = hash(key); + while (table[b] != UNUSED) { + if (table[b] == key) { + ret = true; + break; + } + steps++; + b = next_bucket(b); + } + + update_counter(steps); + return ret; + } + + // Add the key in the table. 
+ void insert(uint key) { + if (key == UNUSED) throw runtime_error("Cannot insert UNUSED"); + if (size >= table.size()) throw runtime_error("Insert: Table is full"); + + unsigned steps = 1; + uint b = hash(key); + + while (table[b] != UNUSED) { + if (table[b] == key) goto key_found; + steps++; + b = next_bucket(b); + } + + table[b] = key; + size++; + + key_found: + update_counter(steps); + } + + void reset_counter() { ops = steps = max_ = 0; } + double report_avg() { return ((double)steps) / max(1U, ops); } + double report_max() { return max_; } + + private: + void update_counter(unsigned steps) { + ops++; + this->steps += steps; + max_ = max(steps, max_); + } + + unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); } +}; + +void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) { + vector<double> avg(max_usage, 0.0); + vector<double> avg2(max_usage, 0.0); + + unsigned N = 1 << 20; + unsigned step_size = N / 100; + + vector<uint> elements(N); + for (unsigned i = 0; i < N; i++) elements[i] = i; + + for (int t = 0; t < retry; t++) { + HashTable H(factory, N); + for (unsigned i = 0; i < N-1; i++) + swap(elements[i], elements[i + (rng.next_u32() % (N-i))]); + + for (int s = 0; s < max_usage; s++) { + H.reset_counter(); + for (unsigned i = 0; i < step_size; i++) + H.insert(elements[s*step_size + i]); + + avg[s] += H.report_avg(); + avg2[s] += H.report_avg() * H.report_avg(); + } + } + + for (int i = 0; i < max_usage; i++) { + avg[i] /= retry; + avg2[i] /= retry; + double std_dev = sqrt(avg2[i] - avg[i]*avg[i]); + + printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev); + } +} + + +void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40, + int begin = 7, int end = 22) { + + for (int n = begin; n < end; n++) { + double avg = 0; + double avg2 = 0; + unsigned N = 1 << n; + + vector<uint> elements(N); + for (unsigned i = 0; i < N; i++) elements[i] = i; + + for (int t = 0; t < retry; t++) { + HashTable H(factory, N); + 
for (unsigned i = 0; i < N-1; i++) + swap(elements[i], elements[i + (rng.next_u32() % (N-i))]); + + for (unsigned i = 0; i < ((uint64_t)N) * usage / 100; i++) + H.insert(elements[i]); + + for (unsigned i = 0; i < N; i++) + H.lookup(i); + + avg += H.report_avg(); + avg2 += H.report_avg() * H.report_avg(); + } + + avg /= retry; + avg2 /= retry; + double std_dev = sqrt(avg2 - avg*avg); + + printf("%i %.03lf %.03lf\n", N, avg, std_dev); + } +} + +int main(int argc, char** argv) { + vector<pair<string, HashFunctionFactory>> grow_tests = { + {"grow-ms-low", MultiplyShiftLowHash::factory}, + {"grow-ms-high", MultiplyShiftHighHash::factory}, + {"grow-poly-1", LinearHash::factory}, + {"grow-poly-2", QuadraticHash::factory}, + {"grow-tab", TabulationHash::factory} + }; + vector<pair<string, HashFunctionFactory>> usage_tests = { + {"usage-ms-low", MultiplyShiftLowHash::factory}, + {"usage-ms-high", MultiplyShiftHighHash::factory}, + {"usage-poly-1", LinearHash::factory}, + {"usage-poly-2", QuadraticHash::factory}, + {"usage-tab", TabulationHash::factory} + }; + + if (argc != 3) goto fail; + + rng = RandomGen(atoi(argv[2])); + + for (auto t : grow_tests) { + if (t.first == argv[1]) { + grow_test(t.second); + return 0; + } + } + + for (auto t : usage_tests) { + if (t.first == argv[1]) { + usage_test(t.second); + return 0; + } + } + + fail: + printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]); + for (auto t : grow_tests) printf(" %s", t.first.c_str()); + for (auto t : usage_tests) printf(" %s", t.first.c_str()); + return 1; +} + diff --git a/08-hash_experiment/cpp/random.h b/08-hash_experiment/cpp/random.h new file mode 100644 index 0000000000000000000000000000000000000000..5ef10aeb1fe7e58a48277fb3565169ec267d43d9 --- /dev/null +++ b/08-hash_experiment/cpp/random.h @@ -0,0 +1,61 @@ +#ifndef DS1_RANDOM_H +#define DS1_RANDOM_H + +#include <cstdint> + +/* + * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman + * and Sebastiano Vigna, 
distributed under the CC-0 license. For more details, + * see http://vigna.di.unimi.it/xorshift/. + * + * Rewritten to C++ by Martin Mares, also placed under CC-0. + */ + +class RandomGen { + uint64_t state[2]; + + uint64_t rotl(uint64_t x, int k) + { + return (x << k) | (x >> (64 - k)); + } + + public: + // Initialize the generator, set its seed and warm it up. + RandomGen(unsigned int seed) + { + state[0] = seed * 0xdeadbeef; + state[1] = seed ^ 0xc0de1234; + for (int i=0; i<100; i++) + next_u64(); + } + + // Generate a random 64-bit number. + uint64_t next_u64(void) + { + uint64_t s0 = state[0], s1 = state[1]; + uint64_t result = s0 + s1; + s1 ^= s0; + state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14); + state[1] = rotl(s1, 36); + return result; + } + + // Generate a random 32-bit number. + uint32_t next_u32(void) + { + return next_u64() >> 11; + } + + // Generate a number between 0 and range-1. + unsigned int next_range(unsigned int range) + { + /* + * This is not perfectly uniform, unless the range is a power of two. + * However, for 64-bit random values and 32-bit ranges, the bias is + * insignificant. 
+ */ + return next_u64() % range; + } +}; + +#endif diff --git a/08-hash_experiment/python/Makefile b/08-hash_experiment/python/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..e9373dc47eb7ca0b2e18db5cbfff4f95392d0164 --- /dev/null +++ b/08-hash_experiment/python/Makefile @@ -0,0 +1,14 @@ +STUDENT_ID ?= PLEASE_SET_STUDENT_ID + +HASHFUNCS=ms-low ms-high poly-1 poly-2 tab + +.PHONY: test +test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS)) + +out/t-%: hash_experiment.py + @mkdir -p out + ./hash_experiment.py $* $(STUDENT_ID) >$@ + +.PHONY: clean +clean: + rm -rf out diff --git a/08-hash_experiment/python/hash_experiment.py b/08-hash_experiment/python/hash_experiment.py new file mode 100644 index 0000000000000000000000000000000000000000..9de266ede9de09b87d209f29ff7bfca9e3ed6ec1 --- /dev/null +++ b/08-hash_experiment/python/hash_experiment.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 + +import random, sys +from math import sqrt + +# Our wrapper of random so we can substitute it with another random generator +rng_init = lambda x: random.seed(x) +rng_next_u32 = lambda: random.randint(0, 2**32 - 1) + +class TabulationHash: + """Hash function for hashing by tabulation. + + The 32-bit key is split to four 8-bit parts. Each part indexes + a separate table of 256 randomly generated values. Obtained values + are XORed together. 
+ """ + + def __init__(self, num_buckets): + self.num_buckets = num_buckets + self.tables = [None] * 4 + for i in range(4): + self.tables[i] = [ rng_next_u32() for _ in range(256) ] + + def __call__(self, key): + h0 = key & 0xff; + h1 = (key >> 8) & 0xff; + h2 = (key >> 16) & 0xff; + h3 = (key >> 24) & 0xff; + t = self.tables + return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets + +class PolynomialHash: + """Hash function using polynomial modulo a prime.""" + + def __init__(self, num_buckets, degree, prime = 2147483647): + self.num_buckets = num_buckets + self.prime = prime + self.coefs = [ rng_next_u32() for _ in range(degree + 1) ] + + def __call__(self, key): + acc = 0 + for c in self.coefs: + acc = (acc * key + c) % self.prime + return acc % self.num_buckets + +LinearHash = lambda num_buckets: PolynomialHash(num_buckets, 1) +QuadraticHash = lambda num_buckets: PolynomialHash(num_buckets, 2) + +class MultiplyShiftLowHash: + """Multiply-shift hash function taking top bits of 32-bit word""" + + def __init__(self, num_buckets): + self.mask = num_buckets - 1 + assert (num_buckets & self.mask == 0), \ + "MultiplyShiftLowHash: num_buckets must be power of 2" + + self.mult = rng_next_u32() | 0x1 + self.shift = 0; + tmp = num_buckets - 1 + while 0x80000000 & tmp == 0: + tmp <<= 1 + self.shift += 1 + + def __call__(self, key): + return ((key * self.mult) >> self.shift) & self.mask + +class MultiplyShiftHighHash: + """Multiply-shift hash function taking low bits of upper half of 64-bit word""" + + def __init__(self, num_buckets): + self.mask = num_buckets - 1 + assert (num_buckets & self.mask == 0), \ + "MultiplyShiftLowHash: num_buckets must be power of 2" + self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1 + + def __call__(self, key): + return ((key * self.mult) >> 32) & self.mask + +class HashTable: + """Hash table with linear probing""" + + def __init__(self, hash_fun_factory, num_buckets): + self._hash = hash_fun_factory(num_buckets) + 
self._num_buckets = num_buckets + self._table = [None] * num_buckets + self._size = 0 + self.reset_counter() + + def _next_bucket(self, b): + return (b + 1) % self._num_buckets + + def lookup(self, key): + """Check whether key is present in the table.""" + ret = False + steps = 1 + + b = self._hash(key) + while self._table[b] is not None: + if self._table[b] == key: + ret = True + break + steps += 1 + b = self._next_bucket(b) + + self._update_counter(steps) + return ret + + def insert(self, key): + """Add the key in the table.""" + assert self._size < self._num_buckets, "Cannot insert into a full table." + steps = 1 + + b = self._hash(key) + while self._table[b] is not None: + if self._table[b] == key: break + steps += 1 + b = self._next_bucket(b) + else: + self._table[b] = key + + self._update_counter(steps) + + def _update_counter(self, steps): + self._ops += 1 + self._steps += steps + self._max = max(self._max, steps) + + def reset_counter(self): + self._steps = 0 + self._ops = 0 + self._max = 0 + + def report_avg(self): return self._steps / max(1, self._ops) + def report_max(self): return self._max + +def permute_list(l): + N = len(l) + for i in range(N - 1): + dst = i + (rng_next_u32() % (N-i)) + l[i], l[dst] = l[dst], l[i] + +def usage_test(hash_fun_factory, max_usage = 90, retry = 40): + avg = [0.0] * max_usage + avg2 = [0.0] * max_usage + + N = 2**19 + step_size = N // 100 + elements = list(range(N)) + + for _ in range(retry): + H = HashTable(hash_fun_factory, N) + permute_list(elements) + + for s in range(max_usage): + H.reset_counter() + for i in range(step_size): + H.insert(s*step_size + i) + avg[s] += H.report_avg() + avg2[s] += H.report_avg() ** 2 + + for i in range(max_usage): + avg[i] /= retry; + avg2[i] /= retry; + std_dev = sqrt(avg2[i] - avg[i]**2) + + print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev)) + +def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 21): + for n in range(begin, end): + avg = 0.0 + avg2 = 0.0 + N = 2 
** n + elements = list(range(N)) + + for _ in range(retry): + H = HashTable(hash_fun_factory, N) + permute_list(elements) + + for x in elements[:N * usage // 100]: + H.insert(x) + + for i in range(N): + H.lookup(i) + + avg += H.report_avg() + avg2 += H.report_avg() ** 2 + + avg /= retry + avg2 /= retry + std_dev = sqrt(avg2 - avg**2) + + print("%i %.03f %.03f" % (N, avg, std_dev)) + +tests = { + "usage-ms-low": lambda: usage_test(MultiplyShiftLowHash), + "usage-ms-high": lambda: usage_test(MultiplyShiftHighHash), + "usage-poly-1": lambda: usage_test(LinearHash), + "usage-poly-2": lambda: usage_test(QuadraticHash), + "usage-tab": lambda: usage_test(TabulationHash), + + "grow-ms-low": lambda: grow_test(MultiplyShiftLowHash), + "grow-ms-high": lambda: grow_test(MultiplyShiftHighHash), + "grow-poly-1": lambda: grow_test(LinearHash), + "grow-poly-2": lambda: grow_test(QuadraticHash), + "grow-tab": lambda: grow_test(TabulationHash), +} + +if len(sys.argv) == 3: + test, student_id = sys.argv[1], sys.argv[2] + rng_init(int(student_id)) + if test in tests: + tests[test]() + else: + raise ValueError("Unknown test {}".format(test)) +else: + raise ValueError("Usage: {} <test> <student-id>".format(sys.argv[0])) + diff --git a/08-hash_experiment/task.md b/08-hash_experiment/task.md new file mode 100644 index 0000000000000000000000000000000000000000..8616811f8c71c7079cd37f4c044b406b6c5ec8d5 --- /dev/null +++ b/08-hash_experiment/task.md @@ -0,0 +1,74 @@ +## Goal + +The goal of this assignment is to experimentally evaluate Linear probing +hash table with different systems of hash functions. + +You are given a test program (`hash_experiment`) which implements everything +needed to perform the following experiments: + +- _Grow experiment:_ This experiment tries different sizes $N$ of the hash table and for each size + it inserts small keys in random order until 60% of the table is used + and then it performs lookup operation for keys $0,\ldots,N-1$. 
+- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions + to increase usage of the table by 1%, reports efficiency of the insert operation, + and repeats until usage of the table reaches 90%. + +Both experiments measure the number of probed buckets per operation, are repeated 40 times +and report the average and the standard deviation. Note that even with 40 repetitions +the reported numbers still depend quite a lot on the random seed used. + +You should perform these experiments for 5 different classes of hash functions – +tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`), +multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`), +and polynomial hash functions of degree 1 and 2 – and write a report, which contains two +plots of the measured data for each experiment. The first plot should contain the average +complexity of operations and the second one the standard deviation. + +Each plot should show the dependence of the plotted quantity (the average number of probed buckets or its standard deviation) +either on the size of the hash table (the grow experiment) or on the usage of the hash table +(the usage experiment). + +The report should discuss the experimental results and try to explain the observed +behavior using theory from the lectures. (If you want, you can carry out further +experiments to gain better understanding of the data structure and include these +in the report. This is strictly optional.) + +You should submit a PDF file with the report (and no source code). +You will get 1 temporary point upon submission if the file is syntactically correct; +proper points will be assigned later. + +## Test program + +The test program is given two arguments: +- The name of the test (`{grow,usage}-{ms-low,ms-high,poly-1,poly-2,tab}`). +- The random seed: you should use the last 2 digits of your student ID (you can find + it in the Study Information System – just click on the Personal data icon). Please + include the random seed in your report. 
+ +The output of the program contains one line per experiment, which consists of +the table size (grow experiment) or the table usage in percent (usage experiment), the average number of probed buckets per operation, and its standard deviation. + +## Hints + +The following tools can be useful for producing nice plots: +- [pandas](https://pandas.pydata.org/) +- [matplotlib](https://matplotlib.org/) +- [gnuplot](http://www.gnuplot.info/) + +A quick checklist for plots: +- Is there a caption explaining what is plotted? +- Are the axes clearly labelled? Do they have value ranges and units? +- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs + are more fitting in some cases, but you should tell.) +- Is it clear which curve means what? +- Is it clear what are the measured points and what is an interpolated + curve between them? +- Are there any overlaps? (E.g., the most interesting part of the curve + hidden underneath a label?) + +In your discussion, please distinguish the following kinds of claims. +It should always be clear which is which: +- Experimental results (i.e., the raw data you obtained from the experiments) +- Theoretical facts (i.e., claims we have proved mathematically) +- Your hypotheses (e.g., when you claim that the graph looks like something is true, + but you are not able to prove rigorously that it always holds)