#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <functional>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "random.h"
+ */ +class TabulationHash { + unsigned num_buckets; + vector<uint> tables; + + TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) { + for (uint& x : tables) x = rng.next_u32(); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(TabulationHash(num_buckets)); + } + + uint operator()(uint key) { + return ( + tables[key & 0xff] ^ + tables[((key >> 8) & 0xff) | 0x100] ^ + tables[((key >> 16) & 0xff) | 0x200] ^ + tables[((key >> 24) & 0xff) | 0x300] + ) % num_buckets; + } +}; + +// Hash function using polynomial modulo a prime. +template < int degree, uint prime = 2147483647 > +class PolynomialHash { + unsigned num_buckets; + vector<uint> coefs; + + PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) { + for (uint& x : coefs) x = rng.next_u32(); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(PolynomialHash(num_buckets)); + } + + uint operator()(uint key) { + uint64_t acc = 0; + for (uint c : coefs) acc = (acc * key + c) % prime; + return (uint)(acc % num_buckets); + } +}; + +typedef PolynomialHash<1> LinearHash; +typedef PolynomialHash<2> QuadraticHash; + +// Multiply-shift hash function taking top bits of 32-bit word +class MultiplyShiftLowHash { + uint mult; + uint mask; + int shift = 0; + + MultiplyShiftLowHash(unsigned num_buckets) { + mult = rng.next_u32() | 0x1; + mask = num_buckets - 1; + + if (mask & num_buckets) + throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2"); + + unsigned tmp = num_buckets - 1; + while ((0x80000000U & tmp) == 0) { + tmp <<= 1; + shift++; + } + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(MultiplyShiftLowHash(num_buckets)); + } + + uint operator()(uint key) { + return ((key * mult) >> shift) & mask; + } +}; + +// Multiply-shift hash function taking low bits of upper half of 64-bit word +class MultiplyShiftHighHash { + uint 
mask; + uint64_t mult; + + MultiplyShiftHighHash(unsigned num_buckets) { + mult = rng.next_u64() | 0x1; + mask = num_buckets - 1; + + if (mask & num_buckets) + throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2"); + } + + public: + static HashFunction factory(unsigned num_buckets) { + return HashFunction(MultiplyShiftHighHash(num_buckets)); + } + + uint operator()(uint key) { + return ((key * mult) >> 32) & mask; + } +}; + + +// Hash table with linear probing +class HashTable { + HashFunction hash; + vector<uint> table; + unsigned size = 0; + + unsigned ops; + unsigned max_; + uint64_t steps; + + public: + // We reserve one integer to mark unused buckets. This integer + // cannot be stored in the table. + static constexpr uint UNUSED = ~((uint)0); + + HashTable(const HashFunctionFactory& factory, unsigned num_buckets) : + hash(factory(num_buckets)), table(num_buckets, +UNUSED) { + reset_counter(); + } + + // Check whether key is present in the table. + bool lookup(uint key) { + if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED"); + + bool ret = false; + unsigned steps = 1; + + uint b = hash(key); + while (table[b] != UNUSED) { + if (table[b] == key) { + ret = true; + break; + } + steps++; + b = next_bucket(b); + } + + update_counter(steps); + return ret; + } + + // Add the key in the table. 
+ void insert(uint key) { + if (key == UNUSED) throw runtime_error("Cannot insert UNUSED"); + if (size >= table.size()) throw runtime_error("Insert: Table is full"); + + unsigned steps = 1; + uint b = hash(key); + + while (table[b] != UNUSED) { + if (table[b] == key) goto key_found; + steps++; + b = next_bucket(b); + } + + table[b] = key; + size++; + + key_found: + update_counter(steps); + } + + void reset_counter() { ops = steps = max_ = 0; } + double report_avg() { return ((double)steps) / max(1U, ops); } + double report_max() { return max_; } + + private: + void update_counter(unsigned steps) { + ops++; + this->steps += steps; + max_ = max(steps, max_); + } + + unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); } +}; + +void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) { + vector<double> avg(max_usage, 0.0); + vector<double> avg2(max_usage, 0.0); + + unsigned N = 1 << 20; + unsigned step_size = N / 100; + + vector<uint> elements(N); + for (unsigned i = 0; i < N; i++) elements[i] = i; + + for (int t = 0; t < retry; t++) { + HashTable H(factory, N); + for (unsigned i = 0; i < N-1; i++) + swap(elements[i], elements[i + (rng.next_u32() % (N-i))]); + + for (int s = 0; s < max_usage; s++) { + H.reset_counter(); + for (unsigned i = 0; i < step_size; i++) + H.insert(elements[s*step_size + i]); + + avg[s] += H.report_avg(); + avg2[s] += H.report_avg() * H.report_avg(); + } + } + + for (int i = 0; i < max_usage; i++) { + avg[i] /= retry; + avg2[i] /= retry; + double std_dev = sqrt(avg2[i] - avg[i]*avg[i]); + + printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev); + } +} + + +void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40, + int begin = 7, int end = 22) { + + for (int n = begin; n < end; n++) { + double avg = 0; + double avg2 = 0; + unsigned N = 1 << n; + + vector<uint> elements(N); + for (unsigned i = 0; i < N; i++) elements[i] = i; + + for (int t = 0; t < retry; t++) { + HashTable H(factory, N); + 
for (unsigned i = 0; i < N-1; i++) + swap(elements[i], elements[i + (rng.next_u32() % (N-i))]); + + for (unsigned i = 0; i < ((uint64_t)N) * usage / 100; i++) + H.insert(elements[i]); + + for (unsigned i = 0; i < N; i++) + H.lookup(i); + + avg += H.report_avg(); + avg2 += H.report_avg() * H.report_avg(); + } + + avg /= retry; + avg2 /= retry; + double std_dev = sqrt(avg2 - avg*avg); + + printf("%i %.03lf %.03lf\n", N, avg, std_dev); + } +} + +int main(int argc, char** argv) { + vector<pair<string, HashFunctionFactory>> grow_tests = { + {"grow-ms-low", MultiplyShiftLowHash::factory}, + {"grow-ms-high", MultiplyShiftHighHash::factory}, + {"grow-poly-1", LinearHash::factory}, + {"grow-poly-2", QuadraticHash::factory}, + {"grow-tab", TabulationHash::factory} + }; + vector<pair<string, HashFunctionFactory>> usage_tests = { + {"usage-ms-low", MultiplyShiftLowHash::factory}, + {"usage-ms-high", MultiplyShiftHighHash::factory}, + {"usage-poly-1", LinearHash::factory}, + {"usage-poly-2", QuadraticHash::factory}, + {"usage-tab", TabulationHash::factory} + }; + + if (argc != 3) goto fail; + + rng = RandomGen(atoi(argv[2])); + + for (auto t : grow_tests) { + if (t.first == argv[1]) { + grow_test(t.second); + return 0; + } + } + + for (auto t : usage_tests) { + if (t.first == argv[1]) { + usage_test(t.second); + return 0; + } + } + + fail: + printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]); + for (auto t : grow_tests) printf(" %s", t.first.c_str()); + for (auto t : usage_tests) printf(" %s", t.first.c_str()); + return 1; +} + diff --git a/08-hash_experiment/cpp/random.h b/08-hash_experiment/cpp/random.h new file mode 100644 index 0000000000000000000000000000000000000000..5ef10aeb1fe7e58a48277fb3565169ec267d43d9 --- /dev/null +++ b/08-hash_experiment/cpp/random.h @@ -0,0 +1,61 @@ +#ifndef DS1_RANDOM_H +#define DS1_RANDOM_H + +#include <cstdint> + +/* + * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman + * and Sebastiano Vigna, 
/*
 * Pseudo-random generator with 128 bits of state (xoroshiro128+ according to
 * the header comment of this file). The sequence is fully determined by the
 * seed passed to the constructor.
 */
class RandomGen {
    uint64_t state[2];

    // Rotate x left by k bits. Only used with 0 < k < 64, so both shifts
    // stay within the width of the type.
    static uint64_t rotl(uint64_t x, int k)
    {
        return (x << k) | (x >> (64 - k));
    }

  public:
    // Initialize the generator, set its seed and warm it up.
    RandomGen(unsigned int seed)
    {
        // Both expressions are evaluated in unsigned int arithmetic before
        // being widened. This is kept exactly as it was, so that existing
        // seeds keep producing the same sequences.
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;
        // Discard the first 100 outputs.
        for (int i=0; i<100; i++)
            next_u64();
    }

    // Generate a random 64-bit number.
    uint64_t next_u64(void)
    {
        uint64_t s0 = state[0], s1 = state[1];
        uint64_t result = s0 + s1;
        s1 ^= s0;
        state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
        state[1] = rotl(s1, 36);
        return result;
    }

    // Generate a random 32-bit number: bits 11..42 of the next 64-bit output.
    uint32_t next_u32(void)
    {
        return next_u64() >> 11;
    }

    // Generate a number between 0 and range-1.
    // An empty range (range == 0) yields 0 instead of dividing by zero;
    // no random number is consumed in that case.
    unsigned int next_range(unsigned int range)
    {
        if (range == 0)
            return 0;

        /*
         * This is not perfectly uniform, unless the range is a power of two.
         * However, for 64-bit random values and 32-bit ranges, the bias is
         * insignificant.
         */
        return next_u64() % range;
    }
};
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word"""

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        assert (num_buckets & self.mask == 0), \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Random odd 64-bit multiplier assembled from two 32-bit halves.
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        # Python integers do not wrap, but the bits above bit 63 of the
        # product never reach the result: the mask keeps only low bits of
        # the value shifted right by 32.
        return ((key * self.mult) >> 32) & self.mask

class HashTable:
    """Hash table with linear probing.

    Empty buckets hold None. The table also counts how many buckets each
    lookup/insert probed, so that the average and maximum can be reported.
    """

    def __init__(self, hash_fun_factory, num_buckets):
        """Create an empty table with num_buckets buckets.

        hash_fun_factory(num_buckets) must return a callable which maps
        a key to a bucket index.
        """
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket following b, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        ret = False
        steps = 1

        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                ret = True
                break
            # A completely full table has no empty bucket to stop at, so
            # give up once every bucket has been probed.
            if steps >= self._num_buckets:
                break
            steps += 1
            b = self._next_bucket(b)

        self._update_counter(steps)
        return ret

    def insert(self, key):
        """Add the key in the table.

        Inserting a key which is already present leaves the table unchanged.
        Fails with AssertionError when the table is full.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        steps = 1

        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                break
            steps += 1
            b = self._next_bucket(b)
        else:
            # The loop ended on an empty bucket: the key is new.
            self._table[b] = key
            # Keep the element count up to date; the full-table check above
            # depends on it.
            self._size += 1

        self._update_counter(steps)

    def _update_counter(self, steps):
        """Record one finished operation which probed `steps` buckets."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Forget all statistics collected so far."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Average number of probed buckets per operation since the last reset."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Largest number of buckets probed by one operation since the last reset."""
        return self._max

def permute_list(l):
    """Shuffle the list in place using the experiment's random generator."""
    N = len(l)
    for i in range(N - 1):
        dst = i + (rng_next_u32() % (N-i))
        l[i], l[dst] = l[dst], l[i]

def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
    """Measure the cost of insertions as the table fills up.

    In each of `retry` runs, a fresh table is filled in steps of 1% of its
    size up to `max_usage` percent. For every step, the average number of
    probed buckets per insert is recorded. One line is printed per step:
    the usage in percent, the mean over all runs and the standard deviation.
    """
    avg = [0.0] * max_usage
    avg2 = [0.0] * max_usage

    # NOTE(review): task.md and the C++ version use 2**20 here; presumably
    # reduced to keep the Python run time acceptable -- confirm.
    N = 2**19
    step_size = N // 100
    elements = list(range(N))

    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)

        for s in range(max_usage):
            H.reset_counter()
            for i in range(step_size):
                # Insert the shuffled keys, as the C++ version does, instead
                # of the consecutive integers 0, 1, 2, ...
                H.insert(elements[s*step_size + i])
            avg[s] += H.report_avg()
            avg2[s] += H.report_avg() ** 2

    for i in range(max_usage):
        avg[i] /= retry
        avg2[i] /= retry
        # Rounding can make the variance come out slightly negative, which
        # math.sqrt rejects with ValueError.
        std_dev = sqrt(max(0.0, avg2[i] - avg[i]**2))

        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
# Names of the experiments accepted on the command line, each mapped to
# a zero-argument callable which runs the experiment.
tests = {
    "usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
    "usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
    "usage-poly-1": lambda: usage_test(LinearHash),
    "usage-poly-2": lambda: usage_test(QuadraticHash),
    "usage-tab": lambda: usage_test(TabulationHash),

    "grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
    "grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
    "grow-poly-1": lambda: grow_test(LinearHash),
    "grow-poly-2": lambda: grow_test(QuadraticHash),
    "grow-tab": lambda: grow_test(TabulationHash),
}

def main(argv):
    """Run the experiment selected on the command line.

    argv has the shape of sys.argv: [program, test, student-id].
    Raises ValueError when the number of arguments is wrong, the test name
    is unknown, or the student ID is not an integer. The error text lists
    the available tests.
    """
    prog = argv[0] if argv else "hash_experiment.py"
    available = " ".join(tests)

    if len(argv) != 3:
        raise ValueError("Usage: {} <{}> <student-id>\nAvailable tests are: {}"
                         .format(prog, "test", available))

    test, student_id = argv[1], argv[2]
    if test not in tests:
        raise ValueError("Unknown test {}\nAvailable tests are: {}"
                         .format(test, available))

    rng_init(int(student_id))
    tests[test]()

# Run only when executed as a script, so that importing the module does not
# start an experiment.
if __name__ == "__main__":
    main(sys.argv)
- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions
  to increase the usage of the table by 1%, reports the efficiency of the insert operation,
  and repeats until the usage of the table reaches 90%.

Both experiments measure the number of probed buckets per operation, are repeated 40 times,
and report the average and the standard deviation. Note that even with 40 repetitions
the reported numbers still depend quite a lot on the random seed used.

You should perform these experiments for 5 different classes of hash functions –
tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`),
multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
and polynomial hash functions of degree 1 and 2 – and write a report, which contains two
plots of the measured data for each experiment. The first plot should contain the average
complexity of operations and the second one the standard deviation.

Each plot should show the dependence of the plotted quantity (the average number of
probed buckets, or its standard deviation) either on the size of the hash table
(the grow experiment) or on the usage of the hash table (the usage experiment).

The report should discuss the experimental results and try to explain the observed
behavior using theory from the lectures. (If you want, you can carry out further
experiments to gain better understanding of the data structure and include these
in the report. This is strictly optional.)

You should submit a PDF file with the report (and no source code).
You will get 1 temporary point upon submission if the file is syntactically correct;
proper points will be assigned later.

## Test program

The test program is given two arguments:
- The name of the test (`{grow,usage}-{ms-low,ms-high,poly-1,poly-2,tab}`).
- The random seed: you should use the last 2 digits of your student ID (you can find
  it in the Study Information System – just click on the Personal data icon). Please
  include the random seed in your report.

The output of the program contains one line per measurement. Each line consists of
three numbers: the size of the hash table (grow experiment) or the usage of the table
in percent (usage experiment), the average number of probed buckets per operation,
and its standard deviation.

## Hints

The following tools can be useful for producing nice plots:
- [pandas](https://pandas.pydata.org/)
- [matplotlib](https://matplotlib.org/)
- [gnuplot](http://www.gnuplot.info/)

A quick checklist for plots:
- Is there a caption explaining what is plotted?
- Are the axes clearly labelled? Do they have value ranges and units?
- If an axis has a logarithmic scale, have you mentioned it? (Logarithmic graphs
  are more fitting in some cases, but you should say so.)
- Is it clear which curve means what?
- Is it clear what are the measured points and what is an interpolated
  curve between them?
- Are there any overlaps? (E.g., the most interesting part of the curve
  hidden underneath a label?)

In your discussion, please distinguish the following kinds of claims.
It should be always clear which is which:
- Experimental results (i.e., the raw data you obtained from the experiments)
- Theoretical facts (i.e., claims we have proved mathematically)
- Your hypotheses (e.g., when you claim that the graph looks like something is true,
  but you are not able to prove rigorously that it always holds)