Publish hash_experiment

e80b4475 · Tung Anh Vu · 37755580 · e80b4475 · e80b4475 · e80b4475
Commit e80b4475 authored Nov 28, 2021 by Tung Anh Vu
--- a/09-hash_experiment/cpp/Makefile
+++ b/09-hash_experiment/cpp/Makefile
+# Build the C++ test program and run all hash-function experiments.
+#
+#   INCLUDE     directory containing random.h (default: current directory)
+#   STUDENT_ID  seed passed to the test program as its second argument
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+
+HASHFUNCS=ms-low ms-high poly-1 poly-2 tab
+
+# Delete a half-written out/t-* file when the experiment exits with an error.
+# Without this, the next `make test` would consider the broken file up to date.
+.DELETE_ON_ERROR:
+
+# Run the grow and the usage experiment for every hash function.
+.PHONY: test
+test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))
+
+# out/t-<test>: output of a single experiment; $* is the test name.
+out/t-%: hash_experiment
+	@mkdir -p out
+	./hash_experiment $* $(STUDENT_ID) >$@
+
+hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
+
+.PHONY: clean
+clean:
+	rm -f hash_experiment
+	rm -rf out
--- a/09-hash_experiment/cpp/hash_experiment.cpp
+++ b/09-hash_experiment/cpp/hash_experiment.cpp
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <functional>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "random.h"
+
+using namespace std;
+
+// Global random generator used by the hash-function constructors and by the
+// shuffles in the tests. main() replaces it by a generator seeded from the
+// command line.
+RandomGen rng(42);
+
+// Shorthand for the 32-bit unsigned type used for keys and hash values.
+typedef uint32_t uint;
+
+// A hash function maps a key to a bucket index.
+typedef function<uint(uint)> HashFunction;
+// A factory creates a hash function for a table with the given number of buckets.
+typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
+
+/*
+ * Tabulation hashing.
+ *
+ * A 32-bit key is viewed as four bytes. Each byte selects one entry in
+ * its own table of 256 random 32-bit words. The four selected words are
+ * XORed together and the result is reduced modulo the number of buckets.
+ */
+class TabulationHash {
+    unsigned num_buckets;
+    // The four tables stored back to back: table i occupies
+    // indices 256*i .. 256*i + 255.
+    vector<uint> tables;
+
+    // Private: instances are created only through factory().
+    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
+        for (size_t i = 0; i < tables.size(); i++)
+            tables[i] = rng.next_u32();
+    }
+
+  public:
+    // Create a randomly initialized hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        TabulationHash h(num_buckets);
+        return HashFunction(std::move(h));
+    }
+
+    // Hash the key to a bucket index in [0, num_buckets).
+    uint operator()(uint key) {
+        uint mixed = 0;
+        for (unsigned part = 0; part < 4; part++) {
+            unsigned byte = (key >> (8 * part)) & 0xff;
+            mixed ^= tables[256 * part + byte];
+        }
+        return mixed % num_buckets;
+    }
+};
+
+// Polynomial hashing: a polynomial with random coefficients is evaluated
+// at the key modulo a prime and the value is then reduced modulo the
+// number of buckets.
+template < int degree, uint prime = 2147483647 >
+class PolynomialHash {
+    unsigned num_buckets;
+    vector<uint> coefs;   // degree + 1 random coefficients, leading one first
+
+    // Private: instances are created only through factory().
+    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
+        for (size_t i = 0; i < coefs.size(); i++)
+            coefs[i] = rng.next_u32();
+    }
+
+  public:
+    // Create a randomly initialized hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        PolynomialHash h(num_buckets);
+        return HashFunction(std::move(h));
+    }
+
+    // Hash the key to a bucket index in [0, num_buckets).
+    uint operator()(uint key) {
+        // Horner's scheme with a 64-bit accumulator.
+        uint64_t value = 0;
+        for (size_t i = 0; i < coefs.size(); i++) {
+            value = value * key + coefs[i];
+            value %= prime;
+        }
+        return static_cast<uint>(value % num_buckets);
+    }
+};
+
+using LinearHash = PolynomialHash<1>;
+using QuadraticHash = PolynomialHash<2>;
+
+// Multiply-shift hash function taking top bits of 32-bit word:
+// the key is multiplied by a random odd 32-bit number (modulo 2^32) and
+// the highest log2(num_buckets) bits of the product give the bucket index.
+class MultiplyShiftLowHash {
+    uint mult;       // random odd multiplier
+    uint mask;       // num_buckets - 1
+    int shift = 0;   // number of leading zero bits of mask
+
+    // Private: instances are created only through factory().
+    // Throws runtime_error unless num_buckets is a non-zero power of two.
+    MultiplyShiftLowHash(unsigned num_buckets) {
+        mult = rng.next_u32() | 0x1;
+        mask = num_buckets - 1;
+
+        // Zero passes the bit test alone (0 & 0xffffffff == 0), so it has
+        // to be rejected explicitly.
+        if (num_buckets == 0 || (mask & num_buckets))
+            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
+
+        // Count the leading zero bits of the mask. For a single bucket the
+        // mask is 0 and would never reach the top bit, so stop right away;
+        // shift stays 0 and the zero mask forces every hash value to 0.
+        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
+            shift++;
+    }
+
+  public:
+    // Create a randomly initialized hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftLowHash(num_buckets));
+    }
+
+    // Hash the key to a bucket index in [0, num_buckets).
+    uint operator()(uint key) {
+        return ((key * mult) >> shift) & mask;
+    }
+};
+
+// Multiply-shift hash function taking low bits of upper half of 64-bit word:
+// the key is multiplied by a random odd 64-bit number (modulo 2^64) and
+// the lowest log2(num_buckets) bits of the upper 32-bit half of the product
+// give the bucket index.
+class MultiplyShiftHighHash {
+    uint mask;       // num_buckets - 1
+    uint64_t mult;   // random odd multiplier
+
+    // Private: instances are created only through factory().
+    // Throws runtime_error unless num_buckets is a non-zero power of two.
+    MultiplyShiftHighHash(unsigned num_buckets) {
+        mult = rng.next_u64() | 0x1;
+        mask = num_buckets - 1;
+
+        // Zero passes the bit test alone (0 & 0xffffffff == 0), so it has
+        // to be rejected explicitly.
+        if (num_buckets == 0 || (mask & num_buckets))
+            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
+    }
+
+  public:
+    // Create a randomly initialized hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftHighHash(num_buckets));
+    }
+
+    // Hash the key to a bucket index in [0, num_buckets).
+    uint operator()(uint key) {
+        return ((key * mult) >> 32) & mask;
+    }
+};
+
+
+// Hash table with linear probing
+//
+// The table has a fixed number of buckets and stores 32-bit keys. Every
+// lookup and insert records how many buckets it probed; the statistics
+// are available through report_avg() and report_max().
+class HashTable {
+    HashFunction hash;
+    vector<uint> table;
+    unsigned size = 0;   // number of keys stored in the table
+
+    unsigned ops;        // operations since the last reset_counter()
+    unsigned max_;       // largest number of probes of a single operation
+    uint64_t steps;      // total number of probes over all operations
+
+  public:
+    // We reserve one integer to mark unused buckets. This integer
+    // cannot be stored in the table.
+    static constexpr uint UNUSED = ~((uint)0);
+
+    // Create an empty table with num_buckets buckets and a hash function
+    // obtained from the factory. The unary plus passes UNUSED as a temporary,
+    // so the constant needs no out-of-class definition.
+    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
+        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
+        reset_counter();
+    }
+
+    // Check whether key is present in the table.
+    // Throws runtime_error when key is UNUSED.
+    bool lookup(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");
+
+        bool found = false;
+        unsigned probes = 1;
+
+        uint b = hash(key);
+        while (table[b] != UNUSED) {
+            if (table[b] == key) {
+                found = true;
+                break;
+            }
+            // Every bucket has been probed: the table is full and the key
+            // is absent. Without this check the loop would never end.
+            if (probes >= table.size()) break;
+            probes++;
+            b = next_bucket(b);
+        }
+
+        update_counter(probes);
+        return found;
+    }
+
+    // Add the key in the table. Inserting a key which is already present
+    // changes nothing, but still counts as an operation.
+    // Throws runtime_error when key is UNUSED or when the table is full.
+    void insert(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
+        if (size >= table.size()) throw runtime_error("Insert: Table is full");
+
+        unsigned probes = 1;
+        uint b = hash(key);
+
+        // The table is not full, so an unused bucket is always reached.
+        while (table[b] != UNUSED && table[b] != key) {
+            probes++;
+            b = next_bucket(b);
+        }
+
+        if (table[b] == UNUSED) {
+            table[b] = key;
+            size++;
+        }
+
+        update_counter(probes);
+    }
+
+    // Forget all statistics collected so far.
+    void reset_counter() { ops = steps = max_ = 0; }
+    // Average number of probed buckets per operation since the last reset.
+    double report_avg() const { return ((double)steps) / max(1U, ops); }
+    // Maximum number of probed buckets in one operation since the last reset.
+    double report_max() const { return max_; }
+
+  private:
+    // Record one operation which probed the given number of buckets.
+    void update_counter(unsigned probes) {
+        ops++;
+        steps += probes;
+        max_ = max(probes, max_);
+    }
+
+    // Bucket following b, wrapping around at the end of the table.
+    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
+};
+
+// Usage experiment: fill a table of 2^20 buckets in steps of 1% and measure
+// the average number of probed buckets per insert in every step.
+//
+//   factory    creates the hash function under test
+//   max_usage  final usage of the table in percent (0..100)
+//   retry      number of repetitions (positive)
+//
+// For every step, prints the usage in percent, the average over all
+// repetitions and the standard deviation. Throws invalid_argument when
+// max_usage or retry is out of range.
+void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
+    // More than 100% would read past the end of the element array.
+    if (max_usage < 0 || max_usage > 100)
+        throw invalid_argument("usage_test: max_usage must be between 0 and 100");
+    if (retry <= 0)
+        throw invalid_argument("usage_test: retry must be positive");
+
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+
+    vector<uint> elements(N);
+    for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+        // Randomly permute the keys.
+        for (unsigned i = 0; i < N-1; i++)
+            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert(elements[s*step_size + i]);
+
+            avg[s] += H.report_avg();
+            avg2[s] += H.report_avg() * H.report_avg();
+        }
+    }
+
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        // Rounding errors can make the variance slightly negative,
+        // which would turn the standard deviation into NaN.
+        double variance = max(0.0, avg2[i] - avg[i]*avg[i]);
+        double std_dev = sqrt(variance);
+
+        printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev);
+    }
+}
+
+
+// Grow experiment: for table sizes 2^begin, ..., 2^(end-1), fill the table
+// with random keys up to the given usage and then look up all keys
+// 0, ..., N-1. The measured quantity is the average number of probed
+// buckets per operation (inserts and lookups together).
+//
+//   factory  creates the hash function under test
+//   usage    usage of the table in percent (0..100)
+//   retry    number of repetitions (positive)
+//   begin    binary logarithm of the first table size
+//   end      binary logarithm of the last table size plus one (at most 32)
+//
+// For every size, prints the size, the average over all repetitions and the
+// standard deviation. Throws invalid_argument when an argument is out of
+// range.
+void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
+               int begin = 7, int end = 22) {
+    // More than 100% would read past the end of the element array.
+    if (usage < 0 || usage > 100)
+        throw invalid_argument("grow_test: usage must be between 0 and 100");
+    if (retry <= 0)
+        throw invalid_argument("grow_test: retry must be positive");
+    if (begin < 0 || end > 32)
+        throw invalid_argument("grow_test: table sizes must be between 2^0 and 2^31");
+
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        unsigned N = 1U << n;
+        unsigned fill = (unsigned)(((uint64_t)N) * usage / 100);
+
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+            // Randomly permute the keys.
+            for (unsigned i = 0; i < N-1; i++)
+                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+            for (unsigned i = 0; i < fill; i++)
+                H.insert(elements[i]);
+
+            for (unsigned i = 0; i < N; i++)
+                H.lookup(i);
+
+            avg += H.report_avg();
+            avg2 += H.report_avg() * H.report_avg();
+        }
+
+        avg /= retry;
+        avg2 /= retry;
+        // Rounding errors can make the variance slightly negative,
+        // which would turn the standard deviation into NaN.
+        double std_dev = sqrt(max(0.0, avg2 - avg*avg));
+
+        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
+    }
+}
+
+// Entry point: hash_experiment <test> <seed>
+//
+// Seeds the global random generator and runs the test of the given name.
+// Returns 0 on success. With wrong arguments or an unknown test name, prints
+// the list of tests to the standard error output and returns 1.
+int main(int argc, char** argv) {
+    const vector<pair<string, HashFunctionFactory>> grow_tests = {
+        {"grow-ms-low", MultiplyShiftLowHash::factory},
+        {"grow-ms-high", MultiplyShiftHighHash::factory},
+        {"grow-poly-1", LinearHash::factory},
+        {"grow-poly-2", QuadraticHash::factory},
+        {"grow-tab", TabulationHash::factory}
+    };
+    const vector<pair<string, HashFunctionFactory>> usage_tests = {
+        {"usage-ms-low", MultiplyShiftLowHash::factory},
+        {"usage-ms-high", MultiplyShiftHighHash::factory},
+        {"usage-poly-1", LinearHash::factory},
+        {"usage-poly-2", QuadraticHash::factory},
+        {"usage-tab", TabulationHash::factory}
+    };
+
+    if (argc == 3) {
+        rng = RandomGen(atoi(argv[2]));
+        const string name = argv[1];
+
+        for (const auto& t : grow_tests) {
+            if (t.first == name) {
+                grow_test(t.second);
+                return 0;
+            }
+        }
+
+        for (const auto& t : usage_tests) {
+            if (t.first == name) {
+                usage_test(t.second);
+                return 0;
+            }
+        }
+    }
+
+    // The standard output is usually redirected to the file with results,
+    // so the help has to go to the standard error output.
+    fprintf(stderr, "Usage: %s <test> <seed>\nAvailable tests are:",
+            argc > 0 ? argv[0] : "hash_experiment");
+    for (const auto& t : grow_tests) fprintf(stderr, " %s", t.first.c_str());
+    for (const auto& t : usage_tests) fprintf(stderr, " %s", t.first.c_str());
+    fputc('\n', stderr);
+    return 1;
+}
+
--- a/09-hash_experiment/cpp/random.h
+++ b/09-hash_experiment/cpp/random.h
+#ifndef DS1_RANDOM_H
+#define DS1_RANDOM_H
+
+#include <cstdint>
+
+/*
+ * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
+ * and Sebastiano Vigna, distributed under the CC-0 license. For more details,
+ * see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+
+class RandomGen {
+    uint64_t state[2];
+
+    // Rotate x to the left by k bits (0 < k < 64).
+    uint64_t rotl(uint64_t x, int k)
+    {
+        const uint64_t upper = x << k;
+        const uint64_t lower = x >> (64 - k);
+        return upper | lower;
+    }
+
+  public:
+    // Set up the state from the seed and discard the first 100 outputs.
+    RandomGen(unsigned int seed)
+    {
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        int warmup = 100;
+        while (warmup-- > 0)
+            next_u64();
+    }
+
+    // Advance the generator and return 64 random bits.
+    uint64_t next_u64(void)
+    {
+        const uint64_t s0 = state[0];
+        const uint64_t s1 = state[1];
+        const uint64_t mixed = s0 ^ s1;
+        state[0] = rotl(s0, 55) ^ mixed ^ (mixed << 14);
+        state[1] = rotl(mixed, 36);
+        return s0 + s1;
+    }
+
+    // Return 32 random bits: bits 11..42 of the next 64-bit output.
+    uint32_t next_u32(void)
+    {
+        return static_cast<uint32_t>(next_u64() >> 11);
+    }
+
+    // Return a number between 0 and range-1.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * A plain remainder is perfectly uniform only for ranges which are
+         * powers of two. With 64 random bits and a 32-bit range, the bias
+         * is insignificant.
+         */
+        const uint64_t raw = next_u64();
+        return static_cast<unsigned int>(raw % range);
+    }
+};
+
+#endif
--- a/09-hash_experiment/python/Makefile
+++ b/09-hash_experiment/python/Makefile
+# Run all hash-function experiments using the Python test program.
+#
+#   STUDENT_ID  seed passed to the test program as its second argument
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+
+HASHFUNCS=ms-low ms-high poly-1 poly-2 tab
+
+# Delete a half-written out/t-* file when the experiment exits with an error.
+# Without this, the next `make test` would consider the broken file up to date.
+.DELETE_ON_ERROR:
+
+# Run the grow and the usage experiment for every hash function.
+.PHONY: test
+test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))
+
+# out/t-<test>: output of a single experiment; $* is the test name.
+out/t-%: hash_experiment.py
+	@mkdir -p out
+	./hash_experiment.py $* $(STUDENT_ID) >$@
+
+.PHONY: clean
+clean:
+	rm -rf out
--- a/09-hash_experiment/python/hash_experiment.py
+++ b/09-hash_experiment/python/hash_experiment.py
+#!/usr/bin/env python3
+
+import random, sys
+from math import sqrt
+
+# Our wrapper of random so we can substitute it with another random generator
+rng_init = lambda x: random.seed(x)
+rng_next_u32 = lambda: random.randint(0, 2**32 - 1)
+
+class TabulationHash:
+    """Hash function for hashing by tabulation.
+
+    The 32-bit key is split to four 8-bit parts. Each part indexes
+    a separate table of 256 randomly generated values. Obtained values
+    are XORed together.
+    """
+
+    def __init__(self, num_buckets):
+        self.num_buckets = num_buckets
+        self.tables = [None] * 4
+        for i in range(4):
+            self.tables[i] = [ rng_next_u32() for _ in range(256) ]
+
+    def __call__(self, key):
+        h0 = key & 0xff;
+        h1 = (key >> 8) & 0xff;
+        h2 = (key >> 16) & 0xff;
+        h3 = (key >> 24) & 0xff;
+        t = self.tables
+        return (t[0][h0] ^ t[1][h1] ^ t[2][h2] ^ t[3][h3]) % self.num_buckets
+
+class PolynomialHash:
+    """Hash function using polynomial modulo a prime."""
+
+    def __init__(self, num_buckets, degree, prime = 2147483647):
+        self.num_buckets = num_buckets
+        self.prime = prime
+        self.coefs = [ rng_next_u32() for _ in range(degree + 1) ]
+
+    def __call__(self, key):
+        acc = 0
+        for c in self.coefs:
+            acc = (acc * key + c) % self.prime
+        return acc % self.num_buckets
+
+LinearHash = lambda num_buckets: PolynomialHash(num_buckets, 1)
+QuadraticHash = lambda num_buckets: PolynomialHash(num_buckets, 2)
+
+class MultiplyShiftLowHash:
+    """Multiply-shift hash function taking top bits of 32-bit word"""
+
+    def __init__(self, num_buckets):
+        self.mask = num_buckets - 1
+        assert (num_buckets & self.mask == 0), \
+            "MultiplyShiftLowHash: num_buckets must be power of 2"
+
+        self.mult = rng_next_u32() | 0x1
+        self.shift = 0;
+        tmp = num_buckets - 1
+        while 0x80000000 & tmp == 0:
+            tmp <<= 1
+            self.shift += 1
+
+    def __call__(self, key):
+        return ((key * self.mult) >> self.shift) & self.mask
+
+class MultiplyShiftHighHash:
+    """Multiply-shift hash function taking low bits of upper half of 64-bit word"""
+
+    def __init__(self, num_buckets):
+        self.mask = num_buckets - 1
+        assert (num_buckets & self.mask == 0), \
+            "MultiplyShiftLowHash: num_buckets must be power of 2"
+        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1
+
+    def __call__(self, key):
+        return ((key * self.mult) >> 32) & self.mask
+
+class HashTable:
+    """Hash table with linear probing"""
+
+    def __init__(self, hash_fun_factory, num_buckets):
+        self._hash = hash_fun_factory(num_buckets)
+        self._num_buckets = num_buckets
+        self._table = [None] * num_buckets
+        self._size = 0
+        self.reset_counter()
+
+    def _next_bucket(self, b):
+        return (b + 1) % self._num_buckets
+
+    def lookup(self, key):
+        """Check whether key is present in the table."""
+        ret = False
+        steps = 1
+
+        b = self._hash(key)
+        while self._table[b] is not None:
+            if self._table[b] == key:
+              ret = True
+              break
+            steps += 1
+            b = self._next_bucket(b)
+
+        self._update_counter(steps)
+        return ret
+
+    def insert(self, key):
+        """Add the key in the table."""
+        assert self._size < self._num_buckets, "Cannot insert into a full table."
+        steps = 1
+
+        b = self._hash(key)
+        while self._table[b] is not None:
+            if self._table[b] == key: break
+            steps += 1
+            b = self._next_bucket(b)
+        else:
+            self._table[b] = key
+
+        self._update_counter(steps)
+
+    def _update_counter(self, steps):
+        self._ops += 1
+        self._steps += steps
+        self._max = max(self._max, steps)
+
+    def reset_counter(self):
+        self._steps = 0
+        self._ops = 0
+        self._max = 0
+
+    def report_avg(self): return self._steps / max(1, self._ops)
+    def report_max(self): return self._max
+
+def permute_list(l):
+    N = len(l)
+    for i in range(N - 1):
+        dst = i + (rng_next_u32() % (N-i))
+        l[i], l[dst] = l[dst], l[i]
+
+def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
+    avg = [0.0] * max_usage
+    avg2 = [0.0] * max_usage
+
+    N = 2**19
+    step_size = N // 100
+    elements = list(range(N))
+
+    for _ in range(retry):
+        H = HashTable(hash_fun_factory, N)
+        permute_list(elements)
+
+        for s in range(max_usage):
+            H.reset_counter()
+            for i in range(step_size):
+                H.insert(s*step_size + i)
+            avg[s] += H.report_avg()
+            avg2[s] += H.report_avg() ** 2
+
+    for i in range(max_usage):
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        std_dev = sqrt(avg2[i] - avg[i]**2)
+
+        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
+
+def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 21):
+    for n in range(begin, end):
+        avg = 0.0
+        avg2 = 0.0
+        N = 2 ** n
+        elements = list(range(N))
+
+        for _ in range(retry):
+            H = HashTable(hash_fun_factory, N)
+            permute_list(elements)
+
+            for x in elements[:N * usage // 100]:
+                H.insert(x)
+
+            for i in range(N):
+                H.lookup(i)
+
+            avg += H.report_avg()
+            avg2 += H.report_avg() ** 2
+
+        avg /= retry
+        avg2 /= retry
+        std_dev = sqrt(avg2 - avg**2)
+
+        print("%i %.03f %.03f" % (N, avg, std_dev))
+
+tests = {
+    "usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
+    "usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
+    "usage-poly-1": lambda: usage_test(LinearHash),
+    "usage-poly-2": lambda: usage_test(QuadraticHash),
+    "usage-tab": lambda: usage_test(TabulationHash),
+
+    "grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
+    "grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
+    "grow-poly-1": lambda: grow_test(LinearHash),
+    "grow-poly-2": lambda: grow_test(QuadraticHash),
+    "grow-tab": lambda: grow_test(TabulationHash),
+}
+
+if len(sys.argv) == 3:
+    test, student_id = sys.argv[1], sys.argv[2]
+    rng_init(int(student_id))
+    if test in tests:
+        tests[test]()
+    else:
+        raise ValueError("Unknown test {}".format(test))
+else:
+    raise ValueError("Usage: {} <test> <student-id>".format(sys.argv[0]))
+
--- a/09-hash_experiment/task.md
+++ b/09-hash_experiment/task.md
+## Goal
+
+The goal of this assignment is to experimentally evaluate a hash table with
+linear probing combined with different systems of hash functions.
+
+You are given a test program (`hash_experiment`) which implements everything
+needed to perform the following experiments:
+
+- _Grow experiment:_ This experiment tries different sizes $N$ of the hash table and for each size
+  it inserts small keys in random order until 60% of the table is used
+  and then it performs lookup operation for keys $0,\ldots,N-1$.
+- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$
+  ($2^{19}$ in the Python template). It performs insertions
+  to increase the usage of the table by 1%, reports the efficiency of the insert operation,
+  and repeats until the usage of the table reaches 90%.
+
+Both experiments measure the number of probed buckets per operation, are repeated 40 times,
+and report the average and the standard deviation. Note that even with 40 repetitions
+the reported numbers still depend quite a lot on the random seed used.
+
+You should perform these experiments for 5 different classes of hash functions –
+tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`),
+multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
+and polynomial hash function of degree 1 and 2 – and write a report, which contains two
+plots of the measured data for each experiment. The first plot should contain average
+complexity of operations and the second one the standard deviation.
+
+Each plot should show the dependence of the average number of probed buckets
+either on size of the hash table (the grow experiment) or the usage of the hash table
+(the usage experiment).
+
+The report should discuss the experimental results and try to explain the observed
+behavior using theory from the lectures. (If you want, you can carry out further
+experiments to gain better understanding of the data structure and include these
+in the report. This is strictly optional.)
+
+You should submit a PDF file with the report (and no source code).
+You will get 1 temporary point upon submission if the file is syntactically correct;
+proper points will be assigned later.
+
+## Test program
+
+The test program is given two arguments:
+- The name of the test (`{grow,usage}-{ms-low,ms-high,poly-1,poly-2,tab}`).
+- The random seed: you should use the last 2 digits of your student ID (you can find
+  it in the Study Information System – just click on the Personal data icon). Please
+  include the random seed in your report.
+
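+The output of the program contains one line per measurement, which consists of
+the table size (grow experiment) or the table usage in percent (usage experiment),
+the average number of probed buckets per operation, and its standard deviation.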
+
+## Hints
+
+The following tools can be useful for producing nice plots:
+- [pandas](https://pandas.pydata.org/)
+- [matplotlib](https://matplotlib.org/)
+- [gnuplot](http://www.gnuplot.info/)
+
+A quick checklist for plots:
+- Is there a caption explaining what is plotted?
+- Are the axes clearly labelled? Do they have value ranges and units?
+- If an axis has a logarithmic scale, have you mentioned it? (Logarithmic graphs
+  are more fitting in some cases, but you should say so.)
+- Is it clear which curve means what?
+- Is it clear what are the measured points and what is an interpolated
+  curve between them?
+- Are there any overlaps? (E.g., the most interesting part of the curve
+  hidden underneath a label?)
+
+In your discussion, please distinguish the following kinds of claims.
+It should be always clear which is which:
+- Experimental results (i.e., the raw data you obtained from the experiments)
+- Theoretical facts (i.e., claims we have proved mathematically)
+- Your hypotheses (e.g., when you claim that the graph looks like something is true,
+  but you are not able to prove rigorously that it always holds)
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).