Skip to content
Snippets Groups Projects
Commit ad5c9d05 authored by Radek Hušek's avatar Radek Hušek
Browse files

Hash experiment

parent efe15b97
No related branches found
No related tags found
No related merge requests found
# Directory that contains random.h; may be overridden from the command line
# or the environment (?= only assigns when INCLUDE is not already set).
INCLUDE ?= .
# Compiler flags: C++11, optimization, debug info, warnings enabled except
# for signed/unsigned comparison, and the header search path for random.h.
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# Build the experiment binary; it is rebuilt whenever the source file or
# random.h changes.
hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
# "clean" names a command rather than a file to be produced.
.PHONY: clean
# Remove the built binary.
clean:
rm -f hash_experiment
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <functional>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "random.h"
using namespace std;
// Random generator shared by the hash-function constructors and the tests.
// It starts with a fixed seed; main() replaces it with one seeded from the
// command line.
RandomGen rng(42);

// 32-bit unsigned type used for keys and hash values.
using uint = uint32_t;
// A hash function: takes a key, returns a bucket index.
using HashFunction = function<uint(uint)>;
// Produces a hash function for a table with the given number of buckets.
using HashFunctionFactory = function<HashFunction(unsigned num_buckets)>;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is viewed as four bytes. Every byte selects one entry from
 * its own block of 256 random 32-bit words; the four selected words are
 * XORed together and the result is reduced modulo the number of buckets.
 */
class TabulationHash {
    unsigned num_buckets;
    // The four 256-entry blocks stored back to back: the block for byte
    // number p (0 = least significant) occupies indices [256*p, 256*p + 255].
    vector<uint> tables;

    // Private: instances are created through factory(). Fills all
    // 4 * 256 entries with words drawn from the global rng, in index order.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
        for (size_t i = 0; i < tables.size(); i++)
            tables[i] = rng.next_u32();
    }

public:
    // Create a freshly randomized tabulation hash for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        HashFunction wrapped = TabulationHash(num_buckets);
        return wrapped;
    }

    // Hash key into the range [0, num_buckets).
    uint operator()(uint key) {
        uint mixed = 0;
        for (unsigned part = 0; part < 4; part++) {
            const unsigned byte = (key >> (8 * part)) & 0xff;
            mixed ^= tables[(part << 8) | byte];
        }
        return mixed % num_buckets;
    }
};
// Polynomial hash: evaluates a random polynomial of the given degree at the
// key, modulo `prime`, and reduces the result modulo the number of buckets.
template < int degree, uint prime = 2147483647 >
class PolynomialHash {
    unsigned num_buckets;
    // degree + 1 coefficients; coefs[0] multiplies the highest power of the key.
    vector<uint> coefs;

    // Private: instances are created through factory(). Draws the
    // coefficients from the global rng, in index order.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
        for (auto it = coefs.begin(); it != coefs.end(); ++it)
            *it = rng.next_u32();
    }

public:
    // Create a freshly randomized polynomial hash for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        HashFunction wrapped = PolynomialHash(num_buckets);
        return wrapped;
    }

    // Hash key into the range [0, num_buckets) using Horner's scheme with
    // 64-bit intermediate values.
    uint operator()(uint key) {
        uint64_t value = 0;
        for (size_t i = 0; i < coefs.size(); i++) {
            value *= key;
            value += coefs[i];
            value %= prime;
        }
        return (uint)(value % num_buckets);
    }
};

// Degree-1 and degree-2 instances used by the experiments.
typedef PolynomialHash<1> LinearHash;
typedef PolynomialHash<2> QuadraticHash;
// Multiply-shift hash function taking top bits of 32-bit word.
//
// The key is multiplied by a random odd 32-bit constant (with wrap-around)
// and the top log2(num_buckets) bits of the product are used as the bucket.
class MultiplyShiftLowHash {
    uint mult;      // random odd multiplier
    uint mask;      // num_buckets - 1
    int shift = 0;  // 32 - log2(num_buckets); left at 0 for a single bucket

    // Private: instances are created through factory().
    // Throws runtime_error unless num_buckets is a non-zero power of two.
    MultiplyShiftLowHash(unsigned num_buckets) {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;
        // Zero must be rejected explicitly: 0 & 0xffffffff == 0 would
        // otherwise pass the power-of-two test.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");
        // Count how far the mask must move left for its highest bit to reach
        // bit 31. With one bucket the mask is zero and would never get
        // there, so the loop must also stop on tmp == 0; the mask then
        // forces every hash to 0 regardless of the shift.
        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
            shift++;
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Hash key into the range [0, num_buckets).
    uint operator()(uint key) {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hash function taking low bits of upper half of 64-bit word.
//
// The key is multiplied by a random odd 64-bit constant (with wrap-around)
// and the lowest log2(num_buckets) bits of the upper 32 bits of the product
// are used as the bucket.
class MultiplyShiftHighHash {
    uint mask;      // num_buckets - 1
    uint64_t mult;  // random odd multiplier

    // Private: instances are created through factory().
    // Throws runtime_error unless num_buckets is a non-zero power of two.
    MultiplyShiftHighHash(unsigned num_buckets) {
        mult = rng.next_u64() | 0x1;
        mask = num_buckets - 1;
        // Zero must be rejected explicitly: 0 & 0xffffffff == 0 would
        // otherwise pass the power-of-two test and yield an all-ones mask.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Hash key into the range [0, num_buckets).
    uint operator()(uint key) {
        return ((key * mult) >> 32) & mask;
    }
};
// Hash table with linear probing.
//
// Besides storing keys, the table counts how many buckets each operation
// probes, so that the experiments can report the average and the maximum.
class HashTable {
    HashFunction hash;
    vector<uint> table;   // one key per bucket, UNUSED marks a free bucket
    unsigned size = 0;    // number of keys currently stored
    unsigned ops;         // operations since the last reset_counter()
    unsigned max_;        // most buckets probed by one operation since reset
    uint64_t steps;       // total buckets probed since reset

public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint)0);

    // Create an empty table with num_buckets buckets whose hash function is
    // obtained from the factory.
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        // Unary plus turns UNUSED into a temporary value, so the vector
        // constructor does not bind a reference to the static member itself.
        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when key is the reserved UNUSED value.
    bool lookup(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // Every bucket has been inspected: the table is completely full
            // and the key is absent. Without this check the loop would
            // never terminate.
            if (probes >= table.size()) break;
            probes++;
            b = next_bucket(b);
        }

        update_counter(probes);
        return found;
    }

    // Add the key in the table; inserting a key that is already present
    // leaves the table unchanged.
    // Throws runtime_error when key is UNUSED or when the table is full.
    void insert(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        // At least one bucket is free here, so the probe sequence ends.
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }
        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }

        update_counter(probes);
    }

    // Forget all statistics gathered so far.
    void reset_counter() { ops = steps = max_ = 0; }

    // Average number of probed buckets per operation since the last reset
    // (0 when no operation has been performed).
    double report_avg() { return ((double)steps) / max(1U, ops); }

    // Largest number of buckets probed by a single operation since the
    // last reset.
    double report_max() { return max_; }

private:
    // Record one finished operation that probed the given number of buckets.
    void update_counter(unsigned probes) {
        ops++;
        steps += probes;
        max_ = max(probes, max_);
    }

    // Bucket that follows b in the probe sequence, wrapping around at the end.
    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
};
// Usage test: measure the cost of insertions as a table with 2^20 buckets
// fills up.
//
// Keys 0..N-1 are shuffled and inserted in batches of N/100 keys; for every
// batch the average number of probed buckets per insert is recorded. The
// whole experiment is repeated `retry` times with a fresh table and hash
// function. For each of the `max_usage` batches one line is printed:
// the batch number, the mean of the per-run averages and their standard
// deviation.
//
// Throws runtime_error when max_usage is outside [0, 100] (more batches
// would read past the end of the key array) or when retry is not positive.
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
    if (max_usage < 0 || max_usage > 100)
        throw runtime_error("usage_test: max_usage must be between 0 and 100");
    if (retry <= 0)
        throw runtime_error("usage_test: retry must be positive");

    vector<double> avg(max_usage, 0.0);   // sum of per-run averages
    vector<double> avg2(max_usage, 0.0);  // sum of their squares

    const unsigned N = 1u << 20;
    const unsigned step_size = N / 100;

    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++) elements[i] = i;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);

        // Random permutation of the keys.
        for (unsigned i = 0; i < N-1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s*step_size + i]);

            const double run_avg = H.report_avg();
            avg[s] += run_avg;
            avg2[s] += run_avg * run_avg;
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative because of
        // rounding; clamp it so that sqrt never produces NaN.
        const double variance = max(0.0, avg2[i] - avg[i]*avg[i]);
        const double std_dev = sqrt(variance);

        printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev);
    }
}
// Grow test: measure the cost of operations for tables of growing size.
//
// For every n in [begin, end) a table with N = 2^n buckets is created, keys
// 0..N-1 are shuffled, the first `usage` percent of them are inserted and
// then all keys 0..N-1 are looked up. The average number of probed buckets
// per operation (inserts and lookups together) is recorded. Each size is
// repeated `retry` times with a fresh table and hash function; one line is
// printed per size: N, the mean of the per-run averages and their standard
// deviation.
//
// Throws runtime_error when usage is outside [0, 100], when retry is not
// positive, or when the size exponents do not fit a 32-bit table size.
void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
               int begin = 7, int end = 22) {
    if (usage < 0 || usage > 100)
        throw runtime_error("grow_test: usage must be between 0 and 100");
    if (retry <= 0)
        throw runtime_error("grow_test: retry must be positive");
    if (begin < 0 || end > 32)
        throw runtime_error("grow_test: size exponents must lie in [0, 32)");

    for (int n = begin; n < end; n++) {
        double avg = 0;   // sum of per-run averages
        double avg2 = 0;  // sum of their squares
        const unsigned N = 1u << n;
        // Number of keys to insert; computed in 64 bits so that N * usage
        // cannot overflow.
        const uint64_t fill = ((uint64_t)N) * usage / 100;

        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++) elements[i] = i;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);

            // Random permutation of the keys.
            for (unsigned i = 0; i < N-1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

            for (unsigned i = 0; i < fill; i++)
                H.insert(elements[i]);

            for (unsigned i = 0; i < N; i++)
                H.lookup(i);

            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
        }

        avg /= retry;
        avg2 /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative because of
        // rounding; clamp it so that sqrt never produces NaN.
        const double std_dev = sqrt(max(0.0, avg2 - avg*avg));

        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
    }
}
int main(int argc, char** argv) {
vector<pair<string, HashFunctionFactory>> grow_tests = {
{"grow-ms-low", MultiplyShiftLowHash::factory},
{"grow-ms-high", MultiplyShiftHighHash::factory},
{"grow-poly-1", LinearHash::factory},
{"grow-poly-2", QuadraticHash::factory},
{"grow-tab", TabulationHash::factory}
};
vector<pair<string, HashFunctionFactory>> usage_tests = {
{"usage-ms-low", MultiplyShiftLowHash::factory},
{"usage-ms-high", MultiplyShiftHighHash::factory},
{"usage-poly-1", LinearHash::factory},
{"usage-poly-2", QuadraticHash::factory},
{"usage-tab", TabulationHash::factory}
};
if (argc != 3) goto fail;
rng = RandomGen(atoi(argv[2]));
for (auto t : grow_tests) {
if (t.first == argv[1]) {
grow_test(t.second);
return 0;
}
}
for (auto t : usage_tests) {
if (t.first == argv[1]) {
usage_test(t.second);
return 0;
}
}
fail:
printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]);
for (auto t : grow_tests) printf(" %s", t.first.c_str());
for (auto t : usage_tests) printf(" %s", t.first.c_str());
return 1;
}
#ifndef DS1_RANDOM_H
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
    // The 128 bits of generator state.
    uint64_t state[2];

    // Rotate the 64-bit word x left by k bits.
    uint64_t rotl(uint64_t x, int k)
    {
        const uint64_t shifted_out = x >> (64 - k);
        const uint64_t shifted_up = x << k;
        return shifted_up | shifted_out;
    }

public:
    // Set up the state from the seed and discard the first 100 outputs
    // to warm the generator up.
    RandomGen(unsigned int seed)
    {
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;

        int remaining = 100;
        while (remaining-- > 0)
            next_u64();
    }

    // Produce the next 64-bit random number and advance the state.
    uint64_t next_u64(void)
    {
        const uint64_t first = state[0];
        uint64_t second = state[1];
        const uint64_t output = first + second;

        second ^= first;
        state[0] = rotl(first, 55) ^ second ^ (second << 14);
        state[1] = rotl(second, 36);

        return output;
    }

    // Produce a 32-bit random number: a 64-bit output shifted right by
    // 11 bits and truncated to 32 bits.
    uint32_t next_u32(void)
    {
        return static_cast<uint32_t>(next_u64() >> 11);
    }

    // Produce a number between 0 and range-1.
    unsigned int next_range(unsigned int range)
    {
        /*
         * Reducing with a modulo is exactly uniform only when the range is
         * a power of two. With 64-bit random values and 32-bit ranges the
         * bias is insignificant.
         */
        return static_cast<unsigned int>(next_u64() % range);
    }
};
#endif
#!/usr/bin/env python3
import random, sys
from math import sqrt
# Thin wrappers around the random source: substituting another generator
# only requires changing these two functions.
def rng_init(x):
    """Seed the random generator with `x`."""
    random.seed(x)


def rng_next_u32():
    """Return a random integer from the closed range [0, 2**32 - 1]."""
    return random.randint(0, 2**32 - 1)
class TabulationHash:
    """Tabulation hashing.

    A 32-bit key is viewed as four bytes. Every byte selects one entry from
    its own table of 256 random 32-bit values; the four selected values are
    XORed together and reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        self.num_buckets = num_buckets
        # One table per byte position; tables[0] serves the least
        # significant byte. Values are drawn table by table, entry by entry.
        tables = []
        for _ in range(4):
            row = []
            for _ in range(256):
                row.append(rng_next_u32())
            tables.append(row)
        self.tables = tables

    def __call__(self, key):
        """Hash `key` to a bucket index in range(num_buckets)."""
        mixed = 0
        for position, table in enumerate(self.tables):
            mixed ^= table[(key >> (8 * position)) & 0xff]
        return mixed % self.num_buckets
class PolynomialHash:
    """Hash function evaluating a random polynomial modulo a prime.

    The polynomial has `degree + 1` random coefficients; its value at the
    key, taken modulo `prime`, is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets, degree, prime = 2147483647):
        self.prime = prime
        self.num_buckets = num_buckets
        # coefs[0] multiplies the highest power of the key.
        self.coefs = [rng_next_u32() for _ in range(degree + 1)]

    def __call__(self, key):
        """Hash `key` to a bucket index in range(num_buckets)."""
        value = 0
        # Horner's scheme, reducing modulo the prime after every step.
        for coef in self.coefs:
            value = (value * key + coef) % self.prime
        return value % self.num_buckets


def LinearHash(num_buckets):
    """Create a polynomial hash function of degree 1."""
    return PolynomialHash(num_buckets, 1)


def QuadraticHash(num_buckets):
    """Create a polynomial hash function of degree 2."""
    return PolynomialHash(num_buckets, 2)
class MultiplyShiftLowHash:
    """Multiply-shift hash function taking top bits of 32-bit word.

    The key is multiplied by a random odd 32-bit constant and the top
    log2(num_buckets) bits of the low 32 bits of the product are used as
    the bucket index.

    Raises AssertionError unless num_buckets is a positive power of two.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero must be rejected explicitly: 0 & -1 == 0 would otherwise
        # pass the power-of-two test.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftLowHash: num_buckets must be power of 2"
        self.mult = rng_next_u32() | 0x1
        # shift = 32 - log2(num_buckets), never negative. Computed from the
        # bit length of the mask rather than by shifting the mask until its
        # top bit is set, which never terminates for a zero mask
        # (num_buckets == 1).
        self.shift = max(0, 32 - self.mask.bit_length())

    def __call__(self, key):
        """Hash `key` to a bucket index in range(num_buckets)."""
        return ((key * self.mult) >> self.shift) & self.mask
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word.

    The key is multiplied by a random odd 64-bit constant; the product is
    shifted right by 32 bits and its lowest log2(num_buckets) bits are used
    as the bucket index.

    Raises AssertionError unless num_buckets is a positive power of two.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero must be rejected explicitly: 0 & -1 == 0 would otherwise
        # pass the power-of-two test.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Two 32-bit draws form the 64-bit multiplier (high half first).
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        """Hash `key` to a bucket index in range(num_buckets)."""
        return ((key * self.mult) >> 32) & self.mask
class HashTable:
    """Hash table with linear probing.

    Besides storing keys, the table counts how many buckets each operation
    probes, so that the experiments can report the average and the maximum.
    Free buckets hold None.
    """

    def __init__(self, hash_fun_factory, num_buckets):
        """Create an empty table with `num_buckets` buckets.

        `hash_fun_factory(num_buckets)` must return a callable mapping a key
        to a bucket index.
        """
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket following `b`, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        ret = False
        steps = 1
        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                ret = True
                break
            # Every bucket has been inspected: the table is completely full
            # and the key is absent. Without this check the loop would
            # never terminate.
            if steps >= self._num_buckets:
                break
            steps += 1
            b = self._next_bucket(b)
        self._update_counter(steps)
        return ret

    def insert(self, key):
        """Add the key in the table.

        Inserting a key that is already present leaves the table unchanged.
        Raises AssertionError when the table is full.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        # At least one bucket is free here, so the probe sequence ends.
        steps = 1
        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                break
            steps += 1
            b = self._next_bucket(b)
        else:
            # Reached a free bucket without meeting the key: store it and
            # count it, so that the full-table check above stays meaningful.
            self._table[b] = key
            self._size += 1
        self._update_counter(steps)

    def _update_counter(self, steps):
        """Record one finished operation that probed `steps` buckets."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Forget all statistics gathered so far."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Average number of probed buckets per operation since the last reset."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Largest number of buckets probed by one operation since the last reset."""
        return self._max
def permute_list(l):
    """Randomly permute the list `l` in place.

    Every position except the last is swapped with a randomly chosen
    position at or after it.
    """
    count = len(l)
    for src in range(count - 1):
        remaining = count - src
        dst = src + rng_next_u32() % remaining
        l[src], l[dst] = l[dst], l[src]
def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
    """Measure the cost of insertions as a table with 2**20 buckets fills up.

    Keys 0..N-1 are shuffled and inserted in batches of N // 100 keys; for
    every batch the average number of probed buckets per insert is recorded.
    The experiment is repeated `retry` times with a fresh table and hash
    function. For each of the `max_usage` batches one line is printed: the
    batch number, the mean of the per-run averages and their standard
    deviation.
    """
    avg = [0.0] * max_usage     # sum of per-run averages
    avg2 = [0.0] * max_usage    # sum of their squares
    N = 2**20
    step_size = N // 100
    elements = list(range(N))
    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)
        for s in range(max_usage):
            H.reset_counter()
            # Insert the next batch of the shuffled keys (not the batch
            # indices themselves, which would ignore the permutation).
            for key in elements[s * step_size : (s + 1) * step_size]:
                H.insert(key)
            run_avg = H.report_avg()
            avg[s] += run_avg
            avg2[s] += run_avg ** 2
    for i in range(max_usage):
        mean = avg[i] / retry
        # E[x^2] - E[x]^2 can come out slightly negative because of
        # rounding; sqrt would then raise ValueError, so clamp at zero.
        variance = max(0.0, avg2[i] / retry - mean ** 2)
        std_dev = sqrt(variance)
        print("%i %.03f %.03f" % ((i + 1), mean, std_dev))
def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 22):
    """Measure the cost of operations for tables of growing size.

    For every n in range(begin, end) a table with N = 2**n buckets is
    created, keys 0..N-1 are shuffled, the first `usage` percent of them are
    inserted and then all keys 0..N-1 are looked up. The average number of
    probed buckets per operation (inserts and lookups together) is recorded.
    Each size is repeated `retry` times with a fresh table and hash
    function; one line is printed per size: N, the mean of the per-run
    averages and their standard deviation.
    """
    for n in range(begin, end):
        avg = 0.0     # sum of per-run averages
        avg2 = 0.0    # sum of their squares
        N = 2 ** n
        fill = N * usage // 100
        elements = list(range(N))
        for _ in range(retry):
            H = HashTable(hash_fun_factory, N)
            permute_list(elements)
            for x in elements[:fill]:
                H.insert(x)
            for i in range(N):
                H.lookup(i)
            run_avg = H.report_avg()
            avg += run_avg
            avg2 += run_avg ** 2
        avg /= retry
        avg2 /= retry
        # E[x^2] - E[x]^2 can come out slightly negative because of
        # rounding; sqrt would then raise ValueError, so clamp at zero.
        std_dev = sqrt(max(0.0, avg2 - avg**2))
        print("%i %.03f %.03f" % (N, avg, std_dev))
tests = {
"usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
"usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
"usage-poly-1": lambda: usage_test(LinearHash),
"usage-poly-2": lambda: usage_test(QuadraticHash),
"usage-tab": lambda: usage_test(TabulationHash),
"grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
"grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
"grow-poly-1": lambda: grow_test(LinearHash),
"grow-poly-2": lambda: grow_test(QuadraticHash),
"grow-tab": lambda: grow_test(TabulationHash),
}
if len(sys.argv) == 3:
test, student_id = sys.argv[1], sys.argv[2]
rng_init(int(student_id))
if test in tests:
tests[test]()
else:
raise ValueError("Unknown test {}".format(test))
else:
raise ValueError("Usage: {} <test> <student-id>".format(sys.argv[0]))
## Goal
The goal of this assignment is to experimentally evaluate a linear probing
hash table with different systems of hash functions.
You are given a test program (`hash_experiment`) which implements everything
needed to perform the following experiments:
- _Grow test:_ This test tries different sizes $N$ of the hash table and for each size
it inserts small keys in random order until 60% of the table is used
and then it performs lookup operation for keys $0,\ldots,N-1$.
- _Usage test:_ This test uses hash table of size $2^{20}$. It performs insertions
to increase usage of the table by 1%, reports efficiency of the insert operation,
and repeats until usage of the table reaches 90%.
Both tests measure the number of probed buckets per operation, are repeated 40 times
and report average and standard deviation. Note that even with 40 repetitions
the reported numbers still depend quite a lot on the random seed used.
You should perform these experiments for 5 different classes of hash functions –
tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`),
multiply-shift which uses low bits of upper half of 64-bit word (`ms-high`),
and polynomial hash function of degree 1 and 2 – and write a report, which contains two
plots of the measured data for each test. The first plot should contain average
complexity of operations and the second one the standard deviation.
Each plot should show the dependence of the average number of probed buckets
either on size of the hash table (the grow test) or the usage of the hash table
(the usage test).
The report should discuss the experimental results and try to explain the observed
behavior using theory from the lectures. (If you want, you can carry out further
experiments to gain better understanding of the data structure and include these
in the report. This is strictly optional.)
You should submit a PDF file with the report (and no source code).
You will get 1 temporary point upon submission if the file is syntactically correct;
proper points will be assigned later.
## Test program
The test program is given two arguments:
- The name of the test (`{grow,usage}-{ms-low,ms-high,poly-1,poly-2,tab}`).
- The random seed: you should use the last 2 digits of your student ID (you can find
it in the Study Information System – just click on the Personal data icon). Please
include the random seed in your report.
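The output of the program contains one line per experiment, which consists of
the table size (grow test) or the table usage in percent (usage test), followed by
the average number of probed buckets per operation and its standard deviation.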
## Hints
The following tools can be useful for producing nice plots:
- [pandas](https://pandas.pydata.org/)
- [matplotlib](https://matplotlib.org/)
- [gnuplot](http://www.gnuplot.info/)
A quick checklist for plots:
- Is there a caption explaining what is plotted?
- Are the axes clearly labelled? Do they have value ranges and units?
- Have you mentioned that this axis has logarithmic scale? (Logarithmic graphs
are more fitting in some cases, but you should tell.)
- Is it clear which curve means what?
- Is it clear what are the measured points and what is an interpolated
curve between them?
- Are there any overlaps? (E.g., the most interesting part of the curve
hidden underneath a label?)
In your discussion, please distinguish the following kinds of claims.
It should be always clear which is which:
- Experimental results (i.e., the raw data you obtained from the experiments)
- Theoretical facts (i.e., claims we have proved mathematically)
- Your hypotheses (e.g., when you claim that the graph looks like something is true,
but you are not able to prove rigorously that it always holds)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment