From 3abeaaf59d2def23d95724662ece8010aefb5999 Mon Sep 17 00:00:00 2001
From: Pavel Vesely <vesely@iuuk.mff.cuni.cz>
Date: Tue, 5 Dec 2023 21:57:10 +0100
Subject: [PATCH] hash exp.

---
 10-hash_experiment/cpp/Makefile            |  20 +
 10-hash_experiment/cpp/hash_experiment.cpp | 515 +++++++++++++++++++++
 10-hash_experiment/cpp/random.h            |  64 +++
 10-hash_experiment/task.md                 |  82 ++++
 4 files changed, 681 insertions(+)
 create mode 100644 10-hash_experiment/cpp/Makefile
 create mode 100644 10-hash_experiment/cpp/hash_experiment.cpp
 create mode 100644 10-hash_experiment/cpp/random.h
 create mode 100644 10-hash_experiment/task.md

diff --git a/10-hash_experiment/cpp/Makefile b/10-hash_experiment/cpp/Makefile
new file mode 100644
index 0000000..bd3e297
--- /dev/null
+++ b/10-hash_experiment/cpp/Makefile
@@ -0,0 +1,20 @@
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+STUDENT_ID ?= PLEASE_SET_STUDENT_ID
+
+HASHFUNCS=ms-high poly-1 poly-2 tab
+
+.PHONY: test
+test: $(addprefix out/t-growSeq-, $(HASHFUNCS)) $(addprefix out/t-usageSeq-, $(HASHFUNCS))
+
+out/t-%: hash_experiment
+	@mkdir -p out
+	./hash_experiment $* $(STUDENT_ID) >$@
+
+hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
+	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
+
+.PHONY: clean
+clean:
+	rm -f hash_experiment
+	rm -rf out
diff --git a/10-hash_experiment/cpp/hash_experiment.cpp b/10-hash_experiment/cpp/hash_experiment.cpp
new file mode 100644
index 0000000..05457e9
--- /dev/null
+++ b/10-hash_experiment/cpp/hash_experiment.cpp
@@ -0,0 +1,515 @@
+#include <vector>
+#include <string>
+#include <functional>
+#include <algorithm>
+#include <utility>
+#include <stdexcept>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <math.h>
+#include "random.h"
+
+using namespace std;
+
+// Random generator shared by all hash function families.
+// main() re-seeds it from the command line before running a test.
+RandomGen rng(42);
+
+typedef uint32_t uint;
+
+// A hash function maps a 32-bit key to a bucket index.
+typedef function<uint(uint)> HashFunction;
+// A factory draws a random hash function for a table with num_buckets buckets.
+typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
+
+/*
+ * Hash function for hashing by tabulation.
+ *
+ * The 32-bit key is split to four 8-bit parts. Each part indexes
+ * a separate table of 256 randomly generated values. Obtained values
+ * are XORed together and reduced modulo the number of buckets.
+ */
+class TabulationHash {
+    unsigned num_buckets;
+    vector<uint> tables;    // The four 256-entry tables, stored back to back.
+
+    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
+        // Without buckets, operator() would divide by zero.
+        if (num_buckets == 0)
+            throw runtime_error("TabulationHash: num_buckets must be positive");
+
+        for (uint& x : tables) x = rng.next_u32();
+    }
+
+  public:
+    // Draw a random function; throws runtime_error if num_buckets is zero.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(TabulationHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        // The OR-ed constants select the table which belongs to each byte.
+        return (
+            tables[key & 0xff] ^
+            tables[((key >> 8) & 0xff) | 0x100] ^
+            tables[((key >> 16) & 0xff) | 0x200] ^
+            tables[((key >> 24) & 0xff) | 0x300]
+        ) % num_buckets;
+    }
+};
+
+// Hash function using polynomial modulo a prime.
+//
+// The polynomial has degree + 1 random coefficients. It is evaluated at the key
+// modulo the prime and the result is reduced modulo the number of buckets.
+template < int degree, uint prime = 2147483647 >
+class PolynomialHash {
+    unsigned num_buckets;
+    vector<uint> coefs;
+
+    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
+        // Without buckets, operator() would divide by zero.
+        if (num_buckets == 0)
+            throw runtime_error("PolynomialHash: num_buckets must be positive");
+
+        for (uint& x : coefs) x = rng.next_u32();
+    }
+
+  public:
+    // Draw a random function; throws runtime_error if num_buckets is zero.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(PolynomialHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        uint64_t acc = 0;
+        // acc, key and c are all below 2^32, so acc * key + c fits in 64 bits.
+        for (uint c : coefs) acc = (acc * key + c) % prime;
+        return (uint)(acc % num_buckets);
+    }
+};
+
+typedef PolynomialHash<1> LinearHash;
+typedef PolynomialHash<2> QuadraticHash;
+
+// Multiply-shift hash function taking top bits of 32-bit word
+// Note: not evaluated in Makefile; its inclusion in experiments is voluntary
+class MultiplyShiftLowHash {
+    uint mult;
+    uint mask;
+    int shift = 0;
+
+    MultiplyShiftLowHash(unsigned num_buckets) {
+        mult = rng.next_u32() | 0x1;    // The multiplier is always odd.
+        mask = num_buckets - 1;
+
+        // Zero is checked separately because it passes the mask test.
+        if (num_buckets == 0 || (mask & num_buckets))
+            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
+
+        // Find the shift which moves the top bits of the product into the mask.
+        // A single bucket has an empty mask, which must not be shifted forever.
+        unsigned tmp = mask;
+        while (tmp != 0 && (0x80000000U & tmp) == 0) {
+            tmp <<= 1;
+            shift++;
+        }
+    }
+
+  public:
+    // Draw a random function; throws runtime_error unless num_buckets is a power of 2.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftLowHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        return ((key * mult) >> shift) & mask;
+    }
+};
+
+// Multiply-shift hash function taking low bits of upper half of 64-bit word
+class MultiplyShiftHighHash {
+    uint mask;
+    uint64_t mult;
+
+    MultiplyShiftHighHash(unsigned num_buckets) {
+        mult = rng.next_u64() | 0x1;    // The multiplier is always odd.
+        mask = num_buckets - 1;
+
+        // Zero is checked separately because it passes the mask test.
+        if (num_buckets == 0 || (mask & num_buckets))
+            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
+    }
+
+  public:
+    // Draw a random function; throws runtime_error unless num_buckets is a power of 2.
+    static HashFunction factory(unsigned num_buckets) {
+        return HashFunction(MultiplyShiftHighHash(num_buckets));
+    }
+
+    uint operator()(uint key) {
+        return ((key * mult) >> 32) & mask;
+    }
+};
+
+// Hash table with linear probing
+//
+// The table also counts operations and probed buckets, so that the tests
+// can report the average and maximum number of probes per operation.
+class HashTable {
+    HashFunction hash;
+    vector<uint> table;
+    unsigned size = 0;      // Number of keys stored in the table.
+
+    unsigned ops;           // Operations since the last reset_counter().
+    unsigned max_;          // Maximum number of probes of a single operation.
+    uint64_t steps;         // Total number of probes of all operations.
+
+  public:
+    // We reserve one integer to mark unused buckets. This integer
+    // cannot be stored in the table.
+    static constexpr uint UNUSED = ~((uint)0);
+
+    // Create an empty table with a hash function drawn from the factory.
+    // The unary plus passes UNUSED as a temporary; presumably this is to avoid
+    // odr-using the constant, which has no out-of-class definition.
+    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
+        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
+        reset_counter();
+    }
+
+    // Check whether key is present in the table.
+    // Throws runtime_error if key is UNUSED.
+    bool lookup(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");
+
+        bool ret = false;
+        unsigned steps = 1;
+
+        uint b = hash(key);
+        while (table[b] != UNUSED) {
+            if (table[b] == key) {
+                ret = true;
+                break;
+            }
+            // A full table has no unused bucket to stop at,
+            // so give up once all buckets have been probed.
+            if (steps >= table.size()) break;
+            steps++;
+            b = next_bucket(b);
+        }
+
+        update_counter(steps);
+        return ret;
+    }
+
+    // Add the key in the table. A key which is already present is kept once.
+    // Throws runtime_error if key is UNUSED or if the table is full.
+    void insert(uint key) {
+        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
+        if (size >= table.size()) throw runtime_error("Insert: Table is full");
+
+        unsigned steps = 1;
+        uint b = hash(key);
+
+        // The table is not full, so the search always ends at an unused bucket
+        // unless it finds the key first.
+        while (table[b] != UNUSED && table[b] != key) {
+            steps++;
+            b = next_bucket(b);
+        }
+
+        if (table[b] == UNUSED) {
+            table[b] = key;
+            size++;
+        }
+
+        update_counter(steps);
+    }
+
+    /*
+     * Return expected number of steps for removing one random element.
+     * I.e. the average number of positions between an element's hash position and the first empty position.
+     * An empty table yields 0.
+     * Note: not used in experiments
+     */
+    double delete_avg() {
+        if (size == 0) return 0;
+
+        vector<unsigned> hashed(table.size(), 0);
+        for (uint x : table)
+            if (x != UNUSED)
+                hashed[hash(x)]++;
+
+        const unsigned first_unused = distance(table.begin(), find_if(table.begin(), table.end(), [](uint x){ return x == UNUSED; }));
+        // The sum grows quadratically with the length of a run of used buckets,
+        // so it is accumulated in 64 bits.
+        uint64_t total_steps = 0, elements = 0;
+        for (unsigned i = first_unused+1; i < first_unused+table.size(); i++)
+            if (table[i % table.size()] == UNUSED)
+                elements = 0;
+            else {
+                elements += hashed[i % table.size()];
+                total_steps += elements;
+            }
+
+        return (double)total_steps / size;
+    }
+
+    // Forget all statistics collected so far.
+    void reset_counter() { ops = steps = max_ = 0; }
+    // Average number of probes per operation since the last reset.
+    double report_avg() { return ((double)steps) / max(1U, ops); }
+    // Maximum number of probes of a single operation since the last reset.
+    double report_max() { return max_; }
+
+  private:
+    void update_counter(unsigned steps) {
+        ops++;
+        this->steps += steps;
+        max_ = max(steps, max_);
+    }
+
+    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
+};
+
+// Standard deviation computed from the mean and the mean of squares.
+// Rounding errors can make the difference slightly negative when the samples
+// are almost equal; it is clamped to zero so that sqrt() does not return NaN.
+static double std_deviation(double mean, double mean_of_squares) {
+    return sqrt(max(0.0, mean_of_squares - mean*mean));
+}
+
+// Usage test with inserting a prefix of 0...N-1
+//
+// The table of size N = 2^20 is filled in max_usage steps of N/100 keys and
+// the average number of probes per insert is measured in every step. One line
+// is printed per step: usage in percent, mean, standard deviation and maximum
+// of the averages over all retries.
+void usageSeq_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+    vector<double> maximum(max_usage, 0.0);
+
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert((s*step_size + i));
+
+            avg[s] += H.report_avg();
+            avg2[s] += H.report_avg() * H.report_avg();
+            maximum[s] = max(maximum[s], H.report_avg());
+        }
+    }
+
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        double std_dev = std_deviation(avg[i], avg2[i]);
+
+        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
+    }
+}
+
+// Usage test with inserting a prefix of a random permutation of 0...N-1
+// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
+//
+// Same measurement and output as usageSeq_test. The keys are shuffled again
+// before every retry, after the hash function has been drawn.
+void usageRnd_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
+    vector<double> avg(max_usage, 0.0);
+    vector<double> avg2(max_usage, 0.0);
+    vector<double> maximum(max_usage, 0.0);
+
+    unsigned N = 1 << 20;
+    unsigned step_size = N / 100;
+
+    vector<uint> elements(N);
+    for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+    for (int t = 0; t < retry; t++) {
+        HashTable H(factory, N);
+        for (unsigned i = 0; i < N-1; i++)
+            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+        for (int s = 0; s < max_usage; s++) {
+            H.reset_counter();
+            for (unsigned i = 0; i < step_size; i++)
+                H.insert(elements[s*step_size + i]);
+
+            avg[s] += H.report_avg();
+            avg2[s] += H.report_avg() * H.report_avg();
+            maximum[s] = max(maximum[s], H.report_avg());
+        }
+    }
+
+    for (int i = 0; i < max_usage; i++) {
+        avg[i] /= retry;
+        avg2[i] /= retry;
+        double std_dev = std_deviation(avg[i], avg2[i]);
+
+        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
+    }
+}
+
+// Grow test with inserting 0...0.6*N-1
+//
+// For every table size N = 2^begin ... 2^(end-1), the table is filled up to
+// the given usage (in percent) and the average number of probes per insert is
+// measured. One line is printed per size: N, mean, standard deviation and
+// maximum of the averages over all retries.
+void growSeq_test(HashFunctionFactory factory, int usage = 60, int retry = 100,
+                  int begin = 7, int end = 22) {
+
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        double maximum = 0;
+        unsigned N = 1 << n;
+
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+
+            for (unsigned i = 0; i < ((uint64_t)N) * usage / 100; i++)
+                H.insert(elements[i]);
+
+            //for (unsigned i = 0; i < N; i++)
+            //    H.lookup(i);
+
+            avg += H.report_avg();
+            avg2 += H.report_avg() * H.report_avg();
+            maximum = max(maximum, H.report_avg());
+        }
+
+        avg /= retry;
+        avg2 /= retry;
+        double std_dev = std_deviation(avg, avg2);
+
+        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
+    }
+}
+
+// Grow test with inserting a prefix (first 60%) of a random permutation of 0...N-1
+// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
+//
+// Unlike growSeq_test, every key 0...N-1 is also looked up after the inserts,
+// so the reported averages cover both kinds of operations.
+void growRnd_test(HashFunctionFactory factory, int usage = 60, int retry = 100,
+                  int begin = 7, int end = 22) {
+
+    for (int n = begin; n < end; n++) {
+        double avg = 0;
+        double avg2 = 0;
+        double maximum = 0;
+        unsigned N = 1 << n;
+
+        vector<uint> elements(N);
+        for (unsigned i = 0; i < N; i++) elements[i] = i;
+
+        for (int t = 0; t < retry; t++) {
+            HashTable H(factory, N);
+            for (unsigned i = 0; i < N-1; i++)
+                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
+
+            for (unsigned i = 0; i < ((uint64_t)N) * usage / 100; i++)
+                H.insert(elements[i]);
+
+            for (unsigned i = 0; i < N; i++)
+                H.lookup(i);
+
+            avg += H.report_avg();
+            avg2 += H.report_avg() * H.report_avg();
+            maximum = max(maximum, H.report_avg());
+        }
+
+        avg /= retry;
+        avg2 /= retry;
+        double std_dev = std_deviation(avg, avg2);
+
+        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
+    }
+}
+
+// Run the test named by the first argument with the seed given by the second one.
+// Returns 0 on success and 1 if the arguments are wrong.
+int main(int argc, char** argv) {
+    vector<pair<string, HashFunctionFactory>> growRnd_tests = {
+        {"growRnd-ms-low", MultiplyShiftLowHash::factory},
+        {"growRnd-ms-high", MultiplyShiftHighHash::factory},
+        {"growRnd-poly-1", LinearHash::factory},
+        {"growRnd-poly-2", QuadraticHash::factory},
+        {"growRnd-tab", TabulationHash::factory}
+    };
+    vector<pair<string, HashFunctionFactory>> growSeq_tests = {
+        {"growSeq-ms-low", MultiplyShiftLowHash::factory},
+        {"growSeq-ms-high", MultiplyShiftHighHash::factory},
+        {"growSeq-poly-1", LinearHash::factory},
+        {"growSeq-poly-2", QuadraticHash::factory},
+        {"growSeq-tab", TabulationHash::factory}
+    };
+    vector<pair<string, HashFunctionFactory>> usageRnd_tests = {
+        {"usageRnd-ms-low", MultiplyShiftLowHash::factory},
+        {"usageRnd-ms-high", MultiplyShiftHighHash::factory},
+        {"usageRnd-poly-1", LinearHash::factory},
+        {"usageRnd-poly-2", QuadraticHash::factory},
+        {"usageRnd-tab", TabulationHash::factory}
+    };
+    vector<pair<string, HashFunctionFactory>> usageSeq_tests = {
+        {"usageSeq-ms-low", MultiplyShiftLowHash::factory},
+        {"usageSeq-ms-high", MultiplyShiftHighHash::factory},
+        {"usageSeq-poly-1", LinearHash::factory},
+        {"usageSeq-poly-2", QuadraticHash::factory},
+        {"usageSeq-tab", TabulationHash::factory}
+    };
+
+    if (argc == 3) {
+        // NOTE: atoi() silently turns a non-numeric seed into 0.
+        rng = RandomGen(atoi(argv[2]));
+
+        for (const auto& t : growRnd_tests) {
+            if (t.first == argv[1]) {
+                growRnd_test(t.second);
+                return 0;
+            }
+        }
+        for (const auto& t : growSeq_tests) {
+            if (t.first == argv[1]) {
+                growSeq_test(t.second);
+                return 0;
+            }
+        }
+        for (const auto& t : usageRnd_tests) {
+            if (t.first == argv[1]) {
+                usageRnd_test(t.second);
+                return 0;
+            }
+        }
+        for (const auto& t : usageSeq_tests) {
+            if (t.first == argv[1]) {
+                usageSeq_test(t.second);
+                return 0;
+            }
+        }
+    }
+
+    // Wrong number of arguments or unknown test. The message goes to stderr,
+    // because the Makefile redirects stdout to the file with measured data.
+    fprintf(stderr, "Usage: %s <test> <seed>\nAvailable tests are:", argc > 0 ? argv[0] : "hash_experiment");
+    for (const auto& t : growRnd_tests) fprintf(stderr, " %s", t.first.c_str());
+    for (const auto& t : growSeq_tests) fprintf(stderr, " %s", t.first.c_str());
+    for (const auto& t : usageRnd_tests) fprintf(stderr, " %s", t.first.c_str());
+    for (const auto& t : usageSeq_tests) fprintf(stderr, " %s", t.first.c_str());
+    fprintf(stderr, "\n");
+    return 1;
+}
+
diff --git a/10-hash_experiment/cpp/random.h b/10-hash_experiment/cpp/random.h
new file mode 100644
index 0000000..7d18ab6
--- /dev/null
+++ b/10-hash_experiment/cpp/random.h
@@ -0,0 +1,64 @@
+#ifndef DS1_RANDOM_H
+#define DS1_RANDOM_H
+
+#include <cstdint>
+
+/*
+ * This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
+ * and Sebastiano Vigna, distributed under the CC-0 license. For more details,
+ * see http://vigna.di.unimi.it/xorshift/.
+ *
+ * Rewritten to C++ by Martin Mares, also placed under CC-0.
+ */
+
+class RandomGen {
+    uint64_t state[2];
+
+    // Rotate x to the left by k bits. Requires 0 < k < 64.
+    uint64_t rotl(uint64_t x, int k)
+    {
+        return (x << k) | (x >> (64 - k));
+    }
+
+  public:
+    // Initialize the generator, set its seed and warm it up.
+    RandomGen(unsigned int seed)
+    {
+        // Both expressions are evaluated in unsigned int before being widened.
+        // Do not "fix" this: it would change all generated sequences.
+        state[0] = seed * 0xdeadbeef;
+        state[1] = seed ^ 0xc0de1234;
+        for (int i=0; i<100; i++)
+            next_u64();
+    }
+
+    // Generate a random 64-bit number.
+    uint64_t next_u64(void)
+    {
+        uint64_t s0 = state[0], s1 = state[1];
+        uint64_t result = s0 + s1;
+        s1 ^= s0;
+        state[0] = rotl(s0, 55) ^ s1 ^ (s1 << 14);
+        state[1] = rotl(s1, 36);
+        return result;
+    }
+
+    // Generate a random 32-bit number.
+    uint32_t next_u32(void)
+    {
+        return next_u64() >> 11;
+    }
+
+    // Generate a number between 0 and range-1. The range must not be zero.
+    unsigned int next_range(unsigned int range)
+    {
+        /*
+         * This is not perfectly uniform, unless the range is a power of two.
+         * However, for 64-bit random values and 32-bit ranges, the bias is
+         * insignificant.
+         */
+        return next_u64() % range;
+    }
+};
+
+#endif
diff --git a/10-hash_experiment/task.md b/10-hash_experiment/task.md
new file mode 100644
index 0000000..da71d3a
--- /dev/null
+++ b/10-hash_experiment/task.md
@@ -0,0 +1,82 @@
+## Goal
+
+The goal of this assignment is to experimentally evaluate a linear probing
+hash table with different systems of hash functions.
+
+You are given a test C++ program (`hash_experiment`) which implements everything
+needed to perform the following experiments:
+
+- _Grow experiment:_ This experiment tries different sizes $m$ of the hash table and for each size
+  it inserts keys $0, 1, \ldots, 0.6\cdot m - 1$ in this order (that is, the tables will be 60% full).
+- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions
+  to increase usage of the table by 1%, reports efficiency of the insert operation,
+  and repeats until usage of the table reaches 90%.
+
+Both experiments measure the average number of probed buckets per operation, are repeated 100 times
+and report the mean, standard deviation, and maximum of these averages over all repetitions.
+Note that even with 100 (or more) repetitions the reported numbers still depend quite a lot on the random seed used.
+
+You should perform these experiments for 4 different classes of hash functions –
+tabulation, multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
+and polynomial hash function of degree 1 and 2 – and write a report, which contains three
+plots of the measured data for each experiment, each plot with four curves. The first plot should contain average
+complexity of operations over all repetitions, the second one the standard deviation, and the third one the maximum.
+
+Each plot should show the dependence of the average number of probed buckets
+either on size of the hash table (the grow experiment) or the usage of the hash table
+(the usage experiment).
+
+The report should discuss the experimental results and if possible, try to explain the observed
+behavior using theory mentioned during the lecture. (If you want, you can carry out further
+experiments to gain better understanding of the data structure and include these
+in the report. This is strictly optional.)
+
+You should submit a PDF file with the report (and no source code).
+You will get 1 temporary point upon submission if the file is syntactically correct;
+proper points will be assigned later.
+
+## Test program
+
+The test program is given two arguments:
+- The name of the test (`{growSeq,usageSeq}-{ms-high,poly-1,poly-2,tab}`).
+- The random seed: you should use the last 2 digits of your student ID (you can find
+  it in the Study Information System – just click on the Personal data icon). Please
+  include the random seed in your report.
+
+The output of the program contains one line per experiment, which consists of
+the table size (for growSeq) or usage of the table in percent (for usageSeq),
+the mean of the averages, the standard deviation of the averages, and the maximum average
+number of probes per insert.
+
+Note that as Python tends to be substantially slower, the test program is provided in C++ only.
+Nevertheless, to generate all the data needed for the plots,
+it is sufficient to run `make` on a Linux machine with the `g++` compiler
+(on Windows, one can use WSL, Cygwin, etc.).
+Before running `make`, you only need to set the student ID inside `Makefile`.
+
+## Hints
+
+The following tools can be useful for producing nice plots:
+- [pandas](https://pandas.pydata.org/)
+- [matplotlib](https://matplotlib.org/)
+- [gnuplot](http://www.gnuplot.info/)
+
+A quick checklist for plots:
+- Is there a caption explaining what is plotted?
+- Are the axes clearly labelled? Do they have value ranges and units?
+- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs
+  are more fitting in some cases, but you should say so.)
+- Is it clear which curve means what?
+- Is it clear which points are measured and what is an interpolated
+  curve between them?
+- Are there any overlaps? (E.g., the most interesting part of the curve
+  hidden underneath a label?)
+
+In your discussion, please distinguish the following kinds of claims.
+It should always be clear which is which:
+- Experimental results (i.e., the raw data you obtained from the experiments)
+- Theoretical facts (i.e., claims that have been proved mathematically)
+- Your hypotheses (e.g., when you claim that the graph looks like something is true,
+  but you are not able to prove rigorously that it always holds)
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
-- 
GitLab