Skip to content
Snippets Groups Projects
Commit 3abeaaf5 authored by Pavel Veselý's avatar Pavel Veselý
Browse files

hash exp.

parent 3ef1aa96
No related branches found
No related tags found
No related merge requests found
# Build and run the hashing experiments.
#
# NOTE(review): in this view the recipe lines below have no leading tab;
# make requires one — confirm the tabs are present in the real Makefile.

# Directory added to the compiler's include path; random.h is expected there.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# Passed to hash_experiment as its second argument (the random seed).
STUDENT_ID ?= PLEASE_SET_STUDENT_ID
# Hash-function name suffixes for which output files are generated.
HASHFUNCS=ms-high poly-1 poly-2 tab
.PHONY: test
# First (default) target: one output file per growSeq/usageSeq test and hash function.
test: $(addprefix out/t-growSeq-, $(HASHFUNCS)) $(addprefix out/t-usageSeq-, $(HASHFUNCS))
# Run one experiment: the pattern stem ($*) is the test name, stdout goes to the target file.
out/t-%: hash_experiment
@mkdir -p out
./hash_experiment $* $(STUDENT_ID) >$@
# Compile the experiment binary from its single source file.
hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
.PHONY: clean
# Remove the binary and all generated outputs.
clean:
rm -f hash_experiment
rm -rf out
#include <vector>
#include <functional>
#include <algorithm>
#include <utility>
#include <stdexcept>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include "random.h"
using namespace std;
// Global pseudo-random generator used by the hash-function constructors and the
// shuffling tests. Seeded with 42 here; main() replaces it with a generator
// seeded from the command line.
RandomGen rng(42);
// 32-bit unsigned type used for keys and bucket indices.
typedef uint32_t uint;
// A hash function: maps a key to a value returned as uint.
typedef function<uint(uint)> HashFunction;
// A factory: given the number of buckets, produces a HashFunction.
typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is cut into four bytes. Every byte selects one entry from
 * its own 256-entry table of random 32-bit words; the four selected words
 * are XORed and the result is reduced modulo the number of buckets.
 *
 * Instances are only created through factory().
 */
class TabulationHash {
    unsigned num_buckets;
    // Four 256-entry tables stored back to back: table j occupies
    // indices [256*j, 256*j + 255].
    vector<uint> tables;

    // Fill all 1024 entries with words drawn from the global generator.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
        for (size_t slot = 0; slot < tables.size(); slot++)
            tables[slot] = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(TabulationHash(num_buckets));
    }

    // Hash the key into the range [0, num_buckets).
    uint operator()(uint key) {
        const uint byte0 = key & 0xff;
        const uint byte1 = (key >> 8) & 0xff;
        const uint byte2 = (key >> 16) & 0xff;
        const uint byte3 = (key >> 24) & 0xff;

        uint mixed = tables[byte0];
        mixed ^= tables[0x100 + byte1];
        mixed ^= tables[0x200 + byte2];
        mixed ^= tables[0x300 + byte3];
        return mixed % num_buckets;
    }
};
// Polynomial hashing: a polynomial with (degree + 1) random coefficients is
// evaluated at the key modulo `prime`, and the value is then reduced modulo
// the number of buckets. Instances are only created through factory().
template < int degree, uint prime = 2147483647 >
class PolynomialHash {
    unsigned num_buckets;
    vector<uint> coefs;

    // Draw the coefficients from the global generator, first to last.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
        for (auto it = coefs.begin(); it != coefs.end(); ++it)
            *it = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(PolynomialHash(num_buckets));
    }

    // Hash the key into the range [0, num_buckets).
    uint operator()(uint key) {
        // Horner's scheme; the running value is kept in 64 bits and reduced
        // modulo the prime after every step.
        uint64_t value = 0;
        for (size_t i = 0; i < coefs.size(); i++) {
            value = value * key + coefs[i];
            value %= prime;
        }
        return (uint)(value % num_buckets);
    }
};
typedef PolynomialHash<1> LinearHash;
typedef PolynomialHash<2> QuadraticHash;
// Multiply-shift hash function taking the top bits of the 32-bit product
// key * mult (computed modulo 2^32).
// Note: not evaluated in Makefile; its inclusion in experiments is voluntary
//
// num_buckets must be a power of two (1 is allowed; 0 is rejected).
// Instances are only created through factory().
class MultiplyShiftLowHash {
    uint mult;      // random odd multiplier
    uint mask;      // num_buckets - 1
    int shift = 0;  // number of leading zero bits of mask (0 when mask == 0)

    // Throws runtime_error if num_buckets is zero or not a power of two.
    MultiplyShiftLowHash(unsigned num_buckets) {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
        // Count the leading zeros of the mask. The `tmp != 0` guard covers
        // num_buckets == 1 (mask == 0), where the search for a set top bit
        // would otherwise never terminate; shift then stays 0 and the zero
        // mask maps every key to bucket 0.
        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
            shift++;
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Hash the key into the range [0, num_buckets).
    uint operator()(uint key) {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hashing on a 64-bit product: the key is multiplied by a
// random odd 64-bit number and the low bits of the upper 32-bit half of the
// product select the bucket. Instances are only created through factory().
class MultiplyShiftHighHash {
    uint mask;
    uint64_t mult;

    // Throws runtime_error when num_buckets shares a bit with num_buckets - 1,
    // i.e. when it is not a power of two.
    MultiplyShiftHighHash(unsigned num_buckets)
        : mask(num_buckets - 1), mult(rng.next_u64() | 0x1) {
        if ((num_buckets & mask) != 0)
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Hash the key: upper half of the 64-bit product, masked to the bucket range.
    uint operator()(uint key) {
        const uint64_t product = key * mult;
        const uint64_t upper_half = product >> 32;
        return upper_half & mask;
    }
};
// Hash table with linear probing.
//
// Stores 32-bit keys in a fixed-size array of buckets. Every lookup/insert
// records how many buckets it probed, so that the average and maximum
// number of probes per operation can be reported.
class HashTable {
    HashFunction hash;    // maps a key to its home bucket
    vector<uint> table;   // bucket array; UNUSED marks an empty bucket
    unsigned size = 0;    // number of keys currently stored
    unsigned ops;         // operations since the last reset_counter()
    unsigned max_;        // largest probe count of a single operation
    uint64_t steps;       // total probes since the last reset_counter()

public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint)0);

    // Build an empty table with num_buckets buckets and a hash function
    // obtained from the factory for that bucket count.
    // (`+UNUSED` passes a temporary copy of the constant — presumably to
    // avoid needing an out-of-class definition of UNUSED in C++11.)
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error for the reserved key UNUSED.
    bool lookup(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // Every bucket has been inspected: the table is completely full
            // and the key is absent. Stop instead of cycling forever.
            if (probes >= table.size()) break;
            probes++;
            b = next_bucket(b);
        }
        update_counter(probes);
        return found;
    }

    // Add the key in the table (no effect on the contents if already present).
    // Throws runtime_error for the reserved key UNUSED or when the table is full.
    void insert(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        // At least one bucket is unused here, so the probe loop terminates.
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }
        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }
        update_counter(probes);
    }

    /*
     Return expected number of steps for removing one random element.
     I.e. the average number of positions between an element's hash position and the first empty position.
     Returns 0 for an empty table.
     Note: not used in experiments
     NOTE(review): the scan is anchored at the first unused bucket; for a
     completely full table there is no such anchor and the result is not
     meaningful — confirm whether that case needs support.
    */
    double delete_avg() {
        if (size == 0) return 0.0;  // avoid 0/0

        // hashed[b] = number of stored keys whose home bucket is b.
        vector<unsigned> hashed(table.size(), 0);
        for (uint x : table)
            if (x != UNUSED)
                hashed[hash(x)]++;

        const unsigned first_unused = distance(table.begin(),
            find_if(table.begin(), table.end(), [](uint x){ return x == UNUSED; }));

        // Walk the table cyclically, starting right after an unused bucket.
        // Within a run of occupied buckets, `elements` is the number of keys
        // whose home bucket lies in the run so far. The total is kept in
        // 64 bits because it grows quadratically with the run length.
        uint64_t total_steps = 0;
        unsigned elements = 0;
        for (unsigned i = first_unused+1; i < first_unused+table.size(); i++) {
            const size_t b = i % table.size();
            if (table[b] == UNUSED)
                elements = 0;
            else {
                elements += hashed[b];
                total_steps += elements;
            }
        }
        return (double)total_steps / size;
    }

    // Clear the probe statistics (the table contents are kept).
    void reset_counter() { ops = steps = max_ = 0; }
    // Average number of probes per operation since the last reset.
    double report_avg() { return ((double)steps) / max(1U, ops); }
    // Largest number of probes of a single operation since the last reset.
    double report_max() { return max_; }

private:
    // Account for one finished operation that probed `steps` buckets.
    void update_counter(unsigned steps) {
        ops++;
        this->steps += steps;
        max_ = max(steps, max_);
    }

    // Bucket following b, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
};
// Usage test with inserting the sequential keys 0, 1, 2, ...
//
// A table with 2^20 buckets is filled in max_usage steps, each step inserting
// N/100 further keys (about 1% of the table). For every step the average
// number of probes per insert is measured. The whole experiment is repeated
// `retry` times with a fresh hash function; for each step one line is printed:
//   <step number> <mean of averages> <std. deviation of averages> <max average>
void usageSeq_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
    vector<double> avg(max_usage, 0.0);      // sum of per-step averages
    vector<double> avg2(max_usage, 0.0);     // sum of their squares
    vector<double> maximum(max_usage, 0.0);  // largest per-step average
    unsigned N = 1 << 20;
    unsigned step_size = N / 100;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);
        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert((s*step_size + i));
            const double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
            maximum[s] = max(maximum[s], step_avg);
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative due to rounding when
        // the samples are (nearly) identical; clamp so sqrt never yields NaN.
        double std_dev = sqrt(max(0.0, avg2[i] - avg[i]*avg[i]));
        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
    }
}
// Usage test with inserting a prefix of a random permutation of 0...N-1
// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
//
// Same procedure and output format as the sequential usage test, except that
// the keys are taken from a permutation that is reshuffled for every
// repetition using the global generator.
void usageRnd_test(HashFunctionFactory factory, int max_usage = 90, int retry = 100) {
    vector<double> avg(max_usage, 0.0);      // sum of per-step averages
    vector<double> avg2(max_usage, 0.0);     // sum of their squares
    vector<double> maximum(max_usage, 0.0);  // largest per-step average
    unsigned N = 1 << 20;
    unsigned step_size = N / 100;
    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++) elements[i] = i;

    for (int t = 0; t < retry; t++) {
        // The table (and thus its hash function) is created before shuffling;
        // this order fixes how values are drawn from the shared generator.
        HashTable H(factory, N);
        for (unsigned i = 0; i < N-1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s*step_size + i]);
            const double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
            maximum[s] = max(maximum[s], step_avg);
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative due to rounding when
        // the samples are (nearly) identical; clamp so sqrt never yields NaN.
        double std_dev = sqrt(max(0.0, avg2[i] - avg[i]*avg[i]));
        printf("%i %.03lf %.03lf %.03lf\n", i+1, avg[i], std_dev, maximum[i]);
    }
}
// Grow test with inserting the sequential keys 0 ... usage% of N (exclusive)
//
// For every table size N = 2^n with begin <= n < end, a fresh table is filled
// to `usage` percent and the average number of probes per insert is measured.
// This is repeated `retry` times; one line is printed per table size:
//   <N> <mean of averages> <std. deviation of averages> <max average>
void growSeq_test(HashFunctionFactory factory, int usage = 60, int retry = 100,
                  int begin = 7, int end = 22) {
    for (int n = begin; n < end; n++) {
        double avg = 0;
        double avg2 = 0;
        double maximum = 0;
        unsigned N = 1 << n;
        // Number of keys to insert; computed in 64 bits so N * usage cannot wrap.
        const uint64_t num_keys = ((uint64_t)N) * usage / 100;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);
            // Only insertions are measured here; no lookups are performed.
            for (unsigned i = 0; i < num_keys; i++)
                H.insert(i);
            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
            maximum = max(maximum, run_avg);
        }

        avg /= retry;
        avg2 /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative due to rounding when
        // the samples are (nearly) identical; clamp so sqrt never yields NaN.
        double std_dev = sqrt(max(0.0, avg2 - avg*avg));
        // N is unsigned, so it is printed with %u.
        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
    }
}
// Grow test with inserting a prefix (first `usage` percent, 60% by default)
// of a random permutation of 0...N-1
// Note: this test is not evaluated in Makefile; its inclusion in experiments is voluntary
//
// For every table size N = 2^n with begin <= n < end, the prefix is inserted
// and then every key 0...N-1 is looked up; the reported average therefore
// covers inserts and lookups together. One line is printed per table size:
//   <N> <mean of averages> <std. deviation of averages> <max average>
void growRnd_test(HashFunctionFactory factory, int usage = 60, int retry = 100,
                  int begin = 7, int end = 22) {
    for (int n = begin; n < end; n++) {
        double avg = 0;
        double avg2 = 0;
        double maximum = 0;
        unsigned N = 1 << n;
        // Number of keys to insert; computed in 64 bits so N * usage cannot wrap.
        const uint64_t num_keys = ((uint64_t)N) * usage / 100;
        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++) elements[i] = i;

        for (int t = 0; t < retry; t++) {
            // The table (and thus its hash function) is created before
            // shuffling; this order fixes how the shared generator is consumed.
            HashTable H(factory, N);
            for (unsigned i = 0; i < N-1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);
            for (unsigned i = 0; i < num_keys; i++)
                H.insert(elements[i]);
            for (unsigned i = 0; i < N; i++)
                H.lookup(i);
            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
            maximum = max(maximum, run_avg);
        }

        avg /= retry;
        avg2 /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative due to rounding when
        // the samples are (nearly) identical; clamp so sqrt never yields NaN.
        double std_dev = sqrt(max(0.0, avg2 - avg*avg));
        // N is unsigned, so it is printed with %u.
        printf("%u %.03lf %.03lf %.03lf\n", N, avg, std_dev, maximum);
    }
}
// Entry point: hash_experiment <test> <seed>
//
// <test> is "<experiment>-<hash function>", e.g. "growSeq-tab"; <seed> is
// parsed with atoi and used to re-seed the global generator. Runs the selected
// experiment and returns 0. On a wrong argument count or an unknown test name
// it prints the usage text with all available test names and returns 1.
int main(int argc, char** argv) {
    vector<pair<string, HashFunctionFactory>> growRnd_tests = {
        {"growRnd-ms-low", MultiplyShiftLowHash::factory},
        {"growRnd-ms-high", MultiplyShiftHighHash::factory},
        {"growRnd-poly-1", LinearHash::factory},
        {"growRnd-poly-2", QuadraticHash::factory},
        {"growRnd-tab", TabulationHash::factory}
    };
    vector<pair<string, HashFunctionFactory>> growSeq_tests = {
        {"growSeq-ms-low", MultiplyShiftLowHash::factory},
        {"growSeq-ms-high", MultiplyShiftHighHash::factory},
        {"growSeq-poly-1", LinearHash::factory},
        {"growSeq-poly-2", QuadraticHash::factory},
        {"growSeq-tab", TabulationHash::factory}
    };
    vector<pair<string, HashFunctionFactory>> usageRnd_tests = {
        {"usageRnd-ms-low", MultiplyShiftLowHash::factory},
        {"usageRnd-ms-high", MultiplyShiftHighHash::factory},
        {"usageRnd-poly-1", LinearHash::factory},
        {"usageRnd-poly-2", QuadraticHash::factory},
        {"usageRnd-tab", TabulationHash::factory}
    };
    vector<pair<string, HashFunctionFactory>> usageSeq_tests = {
        {"usageSeq-ms-low", MultiplyShiftLowHash::factory},
        {"usageSeq-ms-high", MultiplyShiftHighHash::factory},
        {"usageSeq-poly-1", LinearHash::factory},
        {"usageSeq-poly-2", QuadraticHash::factory},
        {"usageSeq-tab", TabulationHash::factory}
    };

    if (argc == 3) {
        rng = RandomGen(atoi(argv[2]));

        // Iterate by const reference: each entry holds a string and a
        // std::function, which are needlessly expensive to copy.
        for (const auto& t : growRnd_tests) {
            if (t.first == argv[1]) {
                growRnd_test(t.second);
                return 0;
            }
        }
        for (const auto& t : growSeq_tests) {
            if (t.first == argv[1]) {
                growSeq_test(t.second);
                return 0;
            }
        }
        for (const auto& t : usageRnd_tests) {
            if (t.first == argv[1]) {
                usageRnd_test(t.second);
                return 0;
            }
        }
        for (const auto& t : usageSeq_tests) {
            if (t.first == argv[1]) {
                usageSeq_test(t.second);
                return 0;
            }
        }
    }

    // Wrong number of arguments or unknown test name.
    // argv[0] may be absent when the program is started with argc == 0.
    printf("Usage: %s <test> <seed>\nAvailable tests are:",
           argc > 0 ? argv[0] : "hash_experiment");
    for (const auto& t : growRnd_tests) printf(" %s", t.first.c_str());
    for (const auto& t : growSeq_tests) printf(" %s", t.first.c_str());
    for (const auto& t : usageRnd_tests) printf(" %s", t.first.c_str());
    for (const auto& t : usageSeq_tests) printf(" %s", t.first.c_str());
    printf("\n");  // terminate the list so the shell prompt starts on a new line
    return 1;
}
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
// Pseudo-random generator with 128 bits of state (two 64-bit words).
class RandomGen {
    uint64_t state[2];

    // Rotate x left by k bits; only called with 0 < k < 64.
    uint64_t rotl(uint64_t x, int k)
    {
        const uint64_t shifted_up = x << k;
        const uint64_t wrapped = x >> (64 - k);
        return shifted_up | wrapped;
    }

public:
    // Derive the initial state from the seed, then discard the first
    // 100 outputs to warm the generator up.
    RandomGen(unsigned int seed)
    {
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;
        int remaining = 100;
        while (remaining > 0) {
            next_u64();
            remaining--;
        }
    }

    // Produce the next 64-bit value and advance the state.
    uint64_t next_u64(void)
    {
        const uint64_t first = state[0];
        uint64_t second = state[1];
        const uint64_t output = first + second;

        second ^= first;
        state[0] = rotl(first, 55) ^ second ^ (second << 14);
        state[1] = rotl(second, 36);
        return output;
    }

    // Produce a 32-bit value: a 64-bit output shifted right by 11 bits and
    // truncated to 32 bits.
    uint32_t next_u32(void)
    {
        const uint64_t wide = next_u64();
        return wide >> 11;
    }

    // Produce a number between 0 and range-1.
    unsigned int next_range(unsigned int range)
    {
        /*
         * The result is not perfectly uniform unless the range is a power
         * of two, but with 64-bit random values and 32-bit ranges the bias
         * is insignificant.
         */
        const uint64_t wide = next_u64();
        return wide % range;
    }
};
## Goal
The goal of this assignment is to experimentally evaluate a hash table with linear
probing using different systems of hash functions.
You are given a test C++ program (`hash_experiment`) which implements everything
needed to perform the following experiments:
- _Grow experiment:_ This experiment tries different sizes $m$ of the hash table and for each size
it inserts keys $1, 2, ..., 0.6\cdot m$ in this order (that is, the tables will be 60% full).
- _Usage experiment:_ This experiment uses a hash table of size $2^{20}$. It performs insertions
to increase the usage of the table by 1%, reports the efficiency of the insert operation,
and repeats until the usage of the table reaches 90%.
Both experiments measure the average number of probed buckets per operation, are repeated 100 times
and report the mean, standard deviation, and maximum of these averages over all repetitions.
Note that even with 100 (or more) repetitions the reported numbers still depend quite a lot on the random seed used.
You should perform these experiments for 4 different classes of hash functions –
tabulation, multiply-shift which uses the low bits of the upper half of a 64-bit word (`ms-high`),
and polynomial hash functions of degrees 1 and 2 – and write a report which contains three
plots of the measured data for each experiment, each plot with four curves. The first plot should show the average
complexity of operations over all repetitions, the second one the standard deviation, and the third one the maximum.
Each plot should show the dependence of the average number of probed buckets
either on size of the hash table (the grow experiment) or the usage of the hash table
(the usage experiment).
The report should discuss the experimental results and if possible, try to explain the observed
behavior using theory mentioned during the lecture. (If you want, you can carry out further
experiments to gain better understanding of the data structure and include these
in the report. This is strictly optional.)
You should submit a PDF file with the report (and no source code).
You will get 1 temporary point upon submission if the file is syntactically correct;
proper points will be assigned later.
## Test program
The test program is given two arguments:
- The name of the test (`{growSeq,usageSeq}-{ms-high,poly-1,poly-2,tab}`).
- The random seed: you should use the last 2 digits of your student ID (you can find
it in the Study Information System – just click on the Personal data icon). Please
include the random seed in your report.
The output of the program contains one line per experiment, which consists of
the table size (for growSeq) or usage of the table in percents (for usageSeq),
the mean of the averages, the standard deviation of the averages, and the maximum average
number of probes per insert.
Note that as Python tends to be substantially slower, the test program is provided in C++ only.
Nevertheless, to generate all the data needed for the plots,
it is sufficient to run `make` on a Linux machine with the `g++` compiler
(on Windows, one can use WSL, Cygwin, etc.).
Before running `make`, you only need to set the student ID inside `Makefile`.
## Hints
The following tools can be useful for producing nice plots:
- [pandas](https://pandas.pydata.org/)
- [matplotlib](https://matplotlib.org/)
- [gnuplot](http://www.gnuplot.info/)
A quick checklist for plots:
- Is there a caption explaining what is plotted?
- Are the axes clearly labelled? Do they have value ranges and units?
- Have you mentioned when an axis has a logarithmic scale? (Logarithmic graphs
are more fitting in some cases, but you should say so.)
- Is it clear which curve means what?
- Is it clear what are the measured points and what is an interpolated
curve between them?
- Are there any overlaps? (E.g., the most interesting part of the curve
hidden underneath a label?)
In your discussion, please distinguish the following kinds of claims.
It should be always clear which is which:
- Experimental results (i.e., the raw data you obtained from the experiments)
- Theoretical facts (i.e., claims that have been proved mathematically)
- Your hypotheses (e.g., when you claim that the graph looks like something is true,
but you are not able to prove rigorously that it always holds)
Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment