hash exp.

3abeaaf5 · Pavel Veselý · 3ef1aa96 · 3abeaaf5 · 3abeaaf5 · 3abeaaf5
Commit 3abeaaf5 authored Dec 5, 2023 by Pavel Veselý
--- a/10-hash_experiment/cpp/Makefile
+++ b/10-hash_experiment/cpp/Makefile
+# Directory containing random.h; override with `make INCLUDE=<dir>`.
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+# Passed to hash_experiment as its <seed> argument.
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+# Hash function families evaluated by the `test` target.
+HASHFUNCS=ms-high poly-1 poly-2 tab
+# If a recipe fails, delete its target: otherwise a truncated out/t-* file
+# written through the shell redirection would be considered up to date by
+# the next `make` run.
+.DELETE_ON_ERROR:
+.PHONY: test
+test: $(addprefix out/t-growSeq-, $(HASHFUNCS)) $(addprefix out/t-usageSeq-, $(HASHFUNCS))
+# out/t-<test name> holds the standard output of one experiment.
+out/t-%: hash_experiment
+	@mkdir -p out
+	./hash_experiment $* $(STUDENT_ID) >$@
+hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
+.PHONY: clean
+clean:
+	rm -f hash_experiment
+	rm -rf out
--- a/10-hash_experiment/cpp/hash_experiment.cpp
+++ b/10-hash_experiment/cpp/hash_experiment.cpp
+#include <vector>
+#include <functional>
+#include <algorithm>
+#include <utility>
+#include <stdexcept>
+#include <stdio.h>
+#include <stdint.h>
+#include <math.h>
+#include "random.h"
+using namespace std;
+// Global random generator used by the hash function constructors and by the
+// randomized tests. main() replaces it by a generator seeded from the command line.
+RandomGen rng(42);
+// 32-bit unsigned integer: the type of keys and of hash values.
+using uint = uint32_t;
+// A hash function maps a key to a bucket index.
+using HashFunction = function<uint(uint)>;
+// A factory produces a hash function for the given number of buckets.
+using HashFunctionFactory = function<HashFunction(unsigned num_buckets)>;
+/*
+ * Tabulation hashing.
+ *
+ * A 32-bit key is viewed as four 8-bit parts. Part number i (counting from
+ * the least significant one) selects one of 256 randomly generated values
+ * in the i-th table. The four selected values are XORed together and the
+ * result is reduced modulo the number of buckets.
+ */
+class TabulationHash {
+    unsigned num_buckets;
+    // The four 256-entry tables stored back to back: table i occupies
+    // indices 256*i ... 256*i + 255.
+    vector<uint> tables;
+    // Fills all tables with values drawn from the global generator rng.
+    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
+        for (size_t i = 0; i < tables.size(); i++)
+            tables[i] = rng.next_u32();
+    }
+  public:
+    // Creates a new random tabulation hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(TabulationHash(num_buckets));
+    }
+    // Returns the bucket index of the key, a value smaller than num_buckets.
+    uint operator()(uint key) {
+        uint mixed = 0;
+        for (unsigned part = 0; part < 4; part++) {
+            const uint chunk = (key >> (8 * part)) & 0xff;
+            mixed ^= tables[256 * part + chunk];
+        }
+        return mixed % num_buckets;
+    }
+};
+// Polynomial hashing: a polynomial of the given degree with random
+// coefficients is evaluated at the key modulo a prime (2^31 - 1 by default)
+// and the value is then reduced modulo the number of buckets.
+template < int degree, uint prime = 2147483647 >
+class PolynomialHash {
+    unsigned num_buckets;
+    // The degree + 1 coefficients, the leading coefficient first.
+    vector<uint> coefs;
+    // Draws all coefficients from the global generator rng.
+    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
+        for (size_t i = 0; i < coefs.size(); i++)
+            coefs[i] = rng.next_u32();
+    }
+  public:
+    // Creates a new random polynomial hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(PolynomialHash(num_buckets));
+    }
+    // Returns the bucket index of the key, a value smaller than num_buckets.
+    uint operator()(uint key) {
+        // Horner's scheme. The running value is below prime < 2^32 and both
+        // the key and the coefficient are below 2^32, so every intermediate
+        // result fits into 64 bits.
+        uint64_t value = 0;
+        for (size_t i = 0; i < coefs.size(); i++) {
+            value *= key;
+            value += coefs[i];
+            value %= prime;
+        }
+        return (uint)(value % num_buckets);
+    }
+};
+typedef PolynomialHash<1> LinearHash;
+typedef PolynomialHash<2> QuadraticHash;
+// Multiply-shift hash function taking top bits of 32-bit word: the key is
+// multiplied by a random odd 32-bit number, the product is truncated to
+// 32 bits and its top log2(num_buckets) bits form the bucket index.
+// Note: not evaluated in Makefile; its inclusion in experiments is voluntary
+class MultiplyShiftLowHash {
+    uint mult;
+    uint mask;
+    int shift = 0;
+    // Throws runtime_error if num_buckets has more than one bit set.
+    MultiplyShiftLowHash(unsigned num_buckets) {
+        mult = rng.next_u32() | 0x1;
+        mask = num_buckets - 1;
+        if (mask & num_buckets)
+            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
+        // Count how far the mask has to be moved to reach the top of the
+        // word; this is the number of low bits of the product to discard.
+        // With a single bucket the mask is zero and would never reach the
+        // top bit, so the loop must stop on tmp == 0; shift then stays 0,
+        // which is fine because the zero mask makes every hash value 0.
+        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
+            shift++;
+    }
+  public:
+    // Creates a new random hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftLowHash(num_buckets));
+    }
+    // Returns the bucket index of the key.
+    uint operator()(uint key) {
+        return ((key * mult) >> shift) & mask;
+    }
+};
+// Multiply-shift hash function taking low bits of upper half of 64-bit word:
+// the key is multiplied by a random odd 64-bit number and the bucket index
+// is read from the lowest bits of the upper 32 bits of the product.
+class MultiplyShiftHighHash {
+    uint mask;
+    uint64_t mult;
+    // Throws runtime_error if num_buckets has more than one bit set.
+    MultiplyShiftHighHash(unsigned num_buckets)
+        : mask(num_buckets - 1), mult(rng.next_u64() | 0x1) {
+        if ((num_buckets & mask) != 0)
+            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
+    }
+  public:
+    // Creates a new random hash function for num_buckets buckets.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftHighHash(num_buckets));
+    }
+    // Returns the bucket index of the key.
+    uint operator()(uint key) {
+        const uint64_t product = key * mult;
+        const uint64_t upper_half = product >> 32;
+        return upper_half & mask;
+    }
+};
+// Hash table with linear probing.
+// Besides storing keys, the table counts the buckets probed by lookup() and
+// insert(), so that the average and the maximum cost of an operation can be
+// reported.
+class HashTable {
+    HashFunction hash;
+    vector<uint> table;
+    unsigned size = 0;   // number of keys stored in the table
+    unsigned ops;        // number of operations since the last reset_counter()
+    unsigned max_;       // largest number of buckets probed by one operation
+    uint64_t steps;      // total number of buckets probed by all operations
+  public:
+    // We reserve one integer to mark unused buckets. This integer
+    // cannot be stored in the table.
+    static constexpr uint UNUSED = ~((uint)0);
+    // Creates an empty table with num_buckets buckets and a hash function
+    // obtained from the factory.
+    // The unary plus turns UNUSED into a temporary value, so that no
+    // out-of-class definition of the constant is needed.
+    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
+        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
+        reset_counter();
+    }
+    // Check whether key is present in the table.
+    // Throws runtime_error if key is the reserved value UNUSED.
+    bool lookup(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");
+        bool ret = false;
+        unsigned steps = 1;
+        uint b = hash(key);
+        while (table[b] != UNUSED) {
+            if (table[b] == key) {
+                ret = true;
+                break;
+            }
+            // Every bucket has been probed: the table is completely full
+            // and the key is absent. Without this check the search would
+            // cycle through the table forever.
+            if (steps >= table.size()) break;
+            steps++;
+            b = next_bucket(b);
+        }
+        update_counter(steps);
+        return ret;
+    }
+    // Add the key in the table. Inserting a key which is already present
+    // changes nothing, but it is still counted as an operation.
+    // Throws runtime_error if key is the reserved value UNUSED or if the
+    // table has no unused bucket left.
+    void insert(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
+        if (size >= table.size()) throw runtime_error("Insert: Table is full");
+        unsigned steps = 1;
+        uint b = hash(key);
+        // The check above guarantees an unused bucket, so the search stops.
+        while (table[b] != UNUSED && table[b] != key) {
+            steps++;
+            b = next_bucket(b);
+        }
+        if (table[b] == UNUSED) {
+            table[b] = key;
+            size++;
+        }
+        update_counter(steps);
+    }
+    /*
+     * Return expected number of steps for removing one random element.
+     * I.e. the average number of positions between an element's hash position
+     * and the first empty position. Returns 0 for an empty table.
+     * Note: not used in experiments
+     * NOTE(review): a completely full table has no empty position; the value
+     * returned in that case is probably not meaningful -- confirm before use.
+     */
+    double delete_avg() {
+        if (size == 0) return 0.0;
+        // hashed[p] = number of stored keys whose hash position is p.
+        vector<unsigned> hashed(table.size(), 0);
+        for (uint x : table)
+            if (x != UNUSED)
+                hashed[hash(x)]++;
+        const unsigned first_unused = distance(table.begin(), find(table.begin(), table.end(), +UNUSED));
+        // Walk once around the table starting right behind an unused bucket.
+        // `elements` counts the keys hashed into the already visited part
+        // of the current run of occupied buckets.
+        uint64_t total_steps = 0;
+        unsigned elements = 0;
+        for (unsigned i = first_unused + 1; i < first_unused + table.size(); i++) {
+            const unsigned pos = i % table.size();
+            if (table[pos] == UNUSED)
+                elements = 0;
+            else {
+                elements += hashed[pos];
+                total_steps += elements;
+            }
+        }
+        return (double)total_steps / size;
+    }
+    // Forget all statistics collected so far.
+    void reset_counter() { ops = steps = max_ = 0; }
+    // Average number of probed buckets per operation since the last reset
+    // (0 if there was no operation).
+    double report_avg() { return ((double)steps) / max(1U, ops); }
+    // Maximum number of buckets probed by a single operation since the last reset.
+    double report_max() { return max_; }
+  private:
+    // Record one finished operation which probed `steps` buckets.
+    void update_counter(unsigned steps) {
+        ops++;
+        this->steps += steps;
+        max_ = max(steps, max_);
+    }
+    // The bucket following b, wrapping around at the end of the table.
+    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
+};
+// Usage test with inserting the keys 0, 1, 2, ... in increasing order.
+//
+// A table with 2^20 buckets is filled in max_usage rounds, each round
+// inserting 1% of the table size, and the average number of probed buckets
+// per insert is measured separately for every round. The whole experiment is
+// repeated `retry` times with a fresh hash function. For every round one
+// line is printed: the usage in percent, the mean of the averages, their
+// standard deviation and their maximum.
+void usageSeq_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+    vector<double> maximum(max_usage, 0.0);
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert((s*step_size + i));
+            const double round_avg = H.report_avg();
+            avg[s] += round_avg;
+            avg2[s] += round_avg * round_avg;
+            maximum[s] = max(maximum[s], round_avg);
+        }
+    }
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        // Rounding errors can make E[x^2] - E[x]^2 slightly negative when
+        // the measured values are (almost) equal; clamp it so that sqrt()
+        // does not produce NaN.
+        double std_dev = sqrt(max(0.0, avg2[i] - avg[i]*avg[i]));
+        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
+    }
+}
+// Usage test with inserting a prefix of a random permutation of 0...N-1
+// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
+//
+// A table with N = 2^20 buckets is filled in max_usage rounds, each round
+// inserting 1% of the table size, and the average number of probed buckets
+// per insert is measured separately for every round. The whole experiment is
+// repeated `retry` times, each time with a fresh hash function and a newly
+// shuffled key order. For every round one line is printed: the usage in
+// percent, the mean of the averages, their standard deviation and their
+// maximum.
+void usageRnd_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+    vector<double> maximum(max_usage, 0.0);
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+    vector<uint> elements(N);
+    for (unsigned i = 0; i < N; i++) elements[i] = i;
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+        // Shuffle: position i receives a random element of elements[i..N-1].
+        for (unsigned i = 0; i < N-1; i++)
+            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert(elements[s*step_size + i]);
+            const double round_avg = H.report_avg();
+            avg[s] += round_avg;
+            avg2[s] += round_avg * round_avg;
+            maximum[s] = max(maximum[s], round_avg);
+        }
+    }
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        // Rounding errors can make E[x^2] - E[x]^2 slightly negative when
+        // the measured values are (almost) equal; clamp it so that sqrt()
+        // does not produce NaN.
+        double std_dev = sqrt(max(0.0, avg2[i] - avg[i]*avg[i]));
+        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
+    }
+}
+// Grow test with inserting the keys 0, 1, 2, ... in increasing order until
+// the table is `usage` percent full (60% by default).
+//
+// Table sizes N = 2^begin, ..., 2^(end-1) are tried. For every size the
+// experiment is repeated `retry` times with a fresh hash function and one
+// line is printed: N, the mean of the average numbers of probed buckets per
+// insert, their standard deviation and their maximum.
+void growSeq_test(HashFunctionFactory factory, int usage = 60, int retry = 100,
+               int begin = 7, int end = 22) {
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        double maximum = 0;
+        unsigned N = 1 << n;
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+        const uint64_t inserted = ((uint64_t)N) * usage / 100;
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+            for (unsigned i = 0; i < inserted; i++)
+                H.insert(elements[i]);
+            //for (unsigned i = 0; i < N; i++)
+            //    H.lookup(i);
+            const double run_avg = H.report_avg();
+            avg += run_avg;
+            avg2 += run_avg * run_avg;
+            maximum = max(maximum, run_avg);
+        }
+        avg /= retry;
+        avg2 /= retry;
+        // Rounding errors can make E[x^2] - E[x]^2 slightly negative when
+        // the measured values are (almost) equal; clamp it so that sqrt()
+        // does not produce NaN.
+        double std_dev = sqrt(max(0.0, avg2 - avg*avg));
+        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
+    }
+}
+// Grow test with inserting a prefix (first 60% by default) of a random permutation of 0...N-1
+// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
+//
+// Table sizes N = 2^begin, ..., 2^(end-1) are tried. In every repetition a
+// fresh hash function is used, the keys are shuffled, `usage` percent of
+// them are inserted and then all keys 0...N-1 are looked up. The measured
+// average therefore covers both the inserts and the lookups. For every size
+// one line is printed: N, the mean of the averages over `retry` repetitions,
+// their standard deviation and their maximum.
+void growRnd_test(HashFunctionFactory factory, int usage = 60, int retry = 100, 
+               int begin = 7, int end = 22) {
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        double maximum = 0;
+        unsigned N = 1 << n;
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+        const uint64_t inserted = ((uint64_t)N) * usage / 100;
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+            // Shuffle: position i receives a random element of elements[i..N-1].
+            for (unsigned i = 0; i < N-1; i++)
+                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+            for (unsigned i = 0; i < inserted; i++)
+                H.insert(elements[i]);
+            for (unsigned i = 0; i < N; i++)
+                H.lookup(i);
+            const double run_avg = H.report_avg();
+            avg += run_avg;
+            avg2 += run_avg * run_avg;
+            maximum = max(maximum, run_avg);
+        }
+        avg /= retry;
+        avg2 /= retry;
+        // Rounding errors can make E[x^2] - E[x]^2 slightly negative when
+        // the measured values are (almost) equal; clamp it so that sqrt()
+        // does not produce NaN.
+        double std_dev = sqrt(max(0.0, avg2 - avg*avg));
+        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
+    }
+}
+// Entry point: hash_experiment <test> <seed>
+//
+// <test> has the form <experiment>-<hash function>, e.g. growSeq-tab, and
+// <seed> is converted by atoi() and used to seed the global generator rng.
+// The results of the selected test are printed to the standard output and
+// 0 is returned. If the arguments are wrong, a usage message listing all
+// tests is printed to the standard error output -- so that it cannot end up
+// in a result file when the standard output is redirected -- and 1 is returned.
+int main(int argc, char** argv) {
+    typedef function<void(const HashFunctionFactory&)> TestDriver;
+    // Each experiment is run with its default parameters.
+    const vector<pair<string, TestDriver>> experiments = {
+        {"growRnd", [](const HashFunctionFactory& f) { growRnd_test(f); }},
+        {"growSeq", [](const HashFunctionFactory& f) { growSeq_test(f); }},
+        {"usageRnd", [](const HashFunctionFactory& f) { usageRnd_test(f); }},
+        {"usageSeq", [](const HashFunctionFactory& f) { usageSeq_test(f); }}
+    };
+    const vector<pair<string, HashFunctionFactory>> hash_functions = {
+        {"ms-low", MultiplyShiftLowHash::factory},
+        {"ms-high", MultiplyShiftHighHash::factory},
+        {"poly-1", LinearHash::factory},
+        {"poly-2", QuadraticHash::factory},
+        {"tab", TabulationHash::factory}
+    };
+    if (argc == 3) {
+        const string requested = argv[1];
+        for (const auto& experiment : experiments) {
+            for (const auto& hash_function : hash_functions) {
+                if (requested == experiment.first + "-" + hash_function.first) {
+                    rng = RandomGen(atoi(argv[2]));
+                    experiment.second(hash_function.second);
+                    return 0;
+                }
+            }
+        }
+    }
+    // Wrong number of arguments or an unknown test name.
+    fprintf(stderr, "Usage: %s <test> <seed>\nAvailable tests are:",
+            argc > 0 ? argv[0] : "hash_experiment");
+    for (const auto& experiment : experiments)
+        for (const auto& hash_function : hash_functions)
+            fprintf(stderr, " %s-%s", experiment.first.c_str(), hash_function.first.c_str());
+    fprintf(stderr, "\n");
+    return 1;
+}
--- a/10-hash_experiment/cpp/random.h
+++ b/10-hash_experiment/cpp/random.h
+#define DS1_RANDOM_H
+#include <cstdint>
+/*
+ * The xoroshiro128+ pseudo-random number generator, designed in 2016 by
+ * David Blackman and Sebastiano Vigna and distributed under the CC-0
+ * license. For more details, see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+class RandomGen {
+    // The 128 bits of generator state.
+    uint64_t state[2];
+    // Rotate x to the left by k bits; used here with k = 55 and k = 36.
+    uint64_t rotl(uint64_t x, int k)
+    {
+        const uint64_t upper = x << k;
+        const uint64_t lower = x >> (64 - k);
+        return upper | lower;
+    }
+  public:
+    // Set up the state from the seed and discard the first 100 outputs.
+    RandomGen(unsigned int seed)
+    {
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        int warmup = 100;
+        while (warmup-- > 0)
+            next_u64();
+    }
+    // Produce the next 64-bit number and advance the state.
+    uint64_t next_u64(void)
+    {
+        const uint64_t first = state[0];
+        const uint64_t second = state[1];
+        const uint64_t mixed = first ^ second;
+        state[0] = rotl(first, 55) ^ mixed ^ (mixed << 14);
+        state[1] = rotl(mixed, 36);
+        return first + second;
+    }
+    // Produce a 32-bit number: bits 11..42 of the next 64-bit output.
+    uint32_t next_u32(void)
+    {
+      const uint64_t wide = next_u64();
+      return (uint32_t)(wide >> 11);
+    }
+    // Produce a number between 0 and range-1.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * The result is perfectly uniform only if the range is a power of
+         * two. However, for 64-bit random values and 32-bit ranges, the
+         * bias is insignificant.
+         */
+        const uint64_t wide = next_u64();
+        return wide % range;
+    }
+};
--- a/10-hash_experiment/task.md
+++ b/10-hash_experiment/task.md
+## Goal
+The goal of this assignment is to experimentally evaluate a hash table with linear probing
+using different systems of hash functions.
+You are given a test C++ program (`hash_experiment`) which implements everything
+needed to perform the following experiments:
+- _Grow experiment:_ This experiment tries different sizes $m$ of the hash table and for each size
+  it inserts keys $1, 2, ..., 0.6\cdot m$ in this order (that is, the tables will be 60% full).
+- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions
+  to increase usage of the table by 1%, reports efficiency of the insert operation,
+  and repeats until usage of the table reaches 90%.
+Both experiments measure the average number of probed buckets per operation, are repeated 100 times
+and report the mean, standard deviation, and maximum of these averages over all repetitions.
+Note that even with 100 (or more) repetitions the reported numbers still depend quite a lot on the random seed used.
+You should perform these experiments for 4 different classes of hash functions –
+tabulation, multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
+and polynomial hash function of degree 1 and 2 – and write a report, which contains three
+plots of the measured data for each experiment, each plot with four curves. The first plot should contain average
+complexity of operations over all repetitions, the second one the standard deviation, and the third one the maximum.
+Each plot should show the dependence of the average number of probed buckets
+either on size of the hash table (the grow experiment) or the usage of the hash table
+(the usage experiment).
+The report should discuss the experimental results and if possible, try to explain the observed
+behavior using theory mentioned during the lecture. (If you want, you can carry out further
+experiments to gain better understanding of the data structure and include these
+in the report. This is strictly optional.)
+You should submit a PDF file with the report (and no source code).
+You will get 1 temporary point upon submission if the file is syntactically correct;
+proper points will be assigned later.
+## Test program
+The test program is given two arguments:
+- The name of the test (`{growSeq,usageSeq}-{ms-high,poly-1,poly-2,tab}`).
+- The random seed: you should use the last 2 digits of your student ID (you can find
+  it in the Study Information System – just click on the Personal data icon). Please
+  include the random seed in your report.
+The output of the program contains one line per experiment, which consists of
+the table size (for growSeq) or usage of the table in percents (for usageSeq),
+the mean of the averages, the standard deviation of the averages, and the maximum average
+number of probes per insert.
+Note that as Python tends to be substantially slower, the test program is provided in C++ only.
+Nevertheless, to generate all the data needed for the plots,
+it is sufficient to run `make` on a Linux machine with the `g++` compiler
+(on Windows, one can use WSL, Cygwin, etc.).
+Before running `make`, you only need to set the student ID inside `Makefile`.
+## Hints
+The following tools can be useful for producing nice plots:
+- [pandas](https://pandas.pydata.org/)
+- [matplotlib](https://matplotlib.org/)
+- [gnuplot](http://www.gnuplot.info/)
+A quick checklist for plots:
+- Is there a caption explaining what is plotted?
+- Are the axes clearly labelled? Do they have value ranges and units?
+- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs
+  are more fitting in some cases, but you should tell.)
+- Is it clear which curve means what?
+- Is it clear what are the measured points and what is an interpolated
+  curve between them?
+- Are there any overlaps? (E.g., the most interesting part of the curve
+  hidden underneath a label?)
+In your discussion, please distinguish the following kinds of claims.
+It should be always clear which is which:
+- Experimental results (i.e., the raw data you obtained from the experiments)
+- Theoretical facts (i.e., claims that have been proved mathematically)
+- Your hypotheses (e.g., when you claim that the graph looks like something is true,
+  but you are not able to prove rigorously that it always holds)
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).