Commit ad5c9d05 authored by Radek Hušek's avatar Radek Hušek

Hash experiment

parent efe15b97
# Directory that contains random.h; override with `make INCLUDE=<dir>`.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)

# Build the experiment binary. It is rebuilt whenever the source file or
# random.h changes. Recipe lines must start with a TAB character.
hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@

# `clean` does not produce a file of that name, hence .PHONY.
.PHONY: clean
clean:
	rm -f hash_experiment
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <functional>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "random.h"
using namespace std;

// Random generator shared by every hash family and by the tests.
// It starts with seed 42; main() replaces it with a generator seeded
// from the command line.
RandomGen rng(42);

// 32-bit unsigned type used for keys and bucket indices.
using uint = uint32_t;

// A hash function maps a key to a bucket index.
using HashFunction = function<uint(uint)>;

// A factory builds a hash function for a table with `num_buckets` buckets.
using HashFunctionFactory = function<HashFunction(unsigned num_buckets)>;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is viewed as four bytes. Each byte selects one entry from
 * its own block of 256 random 32-bit words; the four blocks are stored
 * back to back in a single vector of 1024 words. The four selected words
 * are XORed and the result is reduced modulo the number of buckets.
 */
class TabulationHash {
    unsigned num_buckets;
    vector<uint> tables;   // block i occupies indices [256*i, 256*i + 255]

    // Fills all table entries, in index order, from the global generator `rng`.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
        for (size_t i = 0; i < tables.size(); i++)
            tables[i] = rng.next_u32();
    }

public:
    // Creates a freshly randomised hash function for `num_buckets` buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(TabulationHash(num_buckets));
    }

    // Returns the bucket index assigned to `key`.
    uint operator()(uint key) {
        const uint byte0 = key & 0xff;
        const uint byte1 = (key >> 8) & 0xff;
        const uint byte2 = (key >> 16) & 0xff;
        const uint byte3 = (key >> 24) & 0xff;

        // Each byte is below 256, so adding the block offset is the same
        // as OR-ing it in.
        const uint mixed = tables[byte0]
                         ^ tables[0x100 + byte1]
                         ^ tables[0x200 + byte2]
                         ^ tables[0x300 + byte3];
        return mixed % num_buckets;
    }
};
// Polynomial hashing modulo a prime.
//
// A polynomial with `degree + 1` random coefficients is evaluated at the key
// modulo `prime` using Horner's scheme (the first stored coefficient is the
// leading one); the value is then reduced modulo the number of buckets.
template < int degree, uint prime = 2147483647 >
class PolynomialHash {
    unsigned num_buckets;
    vector<uint> coefs;

    // Draws the coefficients, in storage order, from the global generator `rng`.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
        for (size_t i = 0; i < coefs.size(); i++)
            coefs[i] = rng.next_u32();
    }

public:
    // Creates a freshly randomised hash function for `num_buckets` buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(PolynomialHash(num_buckets));
    }

    // Returns the bucket index assigned to `key`.
    uint operator()(uint key) {
        // 64-bit accumulator: the intermediate product of two 32-bit
        // quantities must not be truncated before the modulo.
        uint64_t value = 0;
        for (size_t i = 0; i < coefs.size(); i++) {
            value = value * key + coefs[i];
            value %= prime;
        }
        return static_cast<uint>(value % num_buckets);
    }
};

using LinearHash = PolynomialHash<1>;
using QuadraticHash = PolynomialHash<2>;
// Multiply-shift hash function taking top bits of 32-bit word.
//
// The key is multiplied by a random odd 32-bit multiplier (the product wraps
// modulo 2^32) and the top log2(num_buckets) bits of the product are used as
// the bucket index. The number of buckets must be a power of two.
class MultiplyShiftLowHash {
    uint mult;
    uint mask;
    int shift = 0;

    // Throws runtime_error unless num_buckets is a power of two (1, 2, 4, ...).
    MultiplyShiftLowHash(unsigned num_buckets) {
        mult = rng.next_u32() | 0x1;   // force the multiplier to be odd
        mask = num_buckets - 1;
        // Zero has to be rejected explicitly: 0 & (0 - 1) is also zero.
        if (num_buckets == 0 || (mask & num_buckets) != 0)
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");

        // shift = 32 - log2(num_buckets): count how far the mask has to move
        // left until its highest bit reaches bit 31. For a single bucket the
        // mask is 0; shift then stays 0 (a shift by 32 would be undefined and
        // the original loop never terminated) and the mask forces result 0.
        for (unsigned tmp = mask; tmp != 0 && (tmp & 0x80000000U) == 0; tmp <<= 1)
            shift++;
    }

public:
    // Creates a freshly randomised hash function for `num_buckets` buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Returns the bucket index assigned to `key`.
    uint operator()(uint key) {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hash function taking low bits of upper half of 64-bit word.
//
// The 32-bit key is multiplied by a random odd 64-bit multiplier (the product
// wraps modulo 2^64); the upper 32 bits of the product are masked down to
// log2(num_buckets) bits. The number of buckets must be a power of two.
class MultiplyShiftHighHash {
    uint mask;
    uint64_t mult;

    // Throws runtime_error unless num_buckets is a power of two (1, 2, 4, ...).
    MultiplyShiftHighHash(unsigned num_buckets) {
        mult = rng.next_u64() | 0x1;   // force the multiplier to be odd
        mask = num_buckets - 1;
        // Zero has to be rejected explicitly: 0 & (0 - 1) is also zero.
        if (num_buckets == 0 || (mask & num_buckets) != 0)
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

public:
    // Creates a freshly randomised hash function for `num_buckets` buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Returns the bucket index assigned to `key`.
    uint operator()(uint key) {
        // `key` is promoted to 64 bits, so this is a 64-bit multiplication.
        return ((key * mult) >> 32) & mask;
    }
};
// Hash table with linear probing.
//
// Stores 32-bit keys in a fixed-size array of buckets. Besides the set
// operations it counts how many buckets each operation inspected, so that
// the experiments can report the average and maximum cost per operation.
class HashTable {
    HashFunction hash;
    vector<uint> table;
    unsigned size = 0;   // number of keys stored
    unsigned ops;        // operations since the last reset_counter()
    unsigned max_;       // largest probe count of a single operation
    uint64_t steps;      // total probe count over all counted operations

public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~static_cast<uint>(0);

    // Builds an empty table with `num_buckets` buckets and obtains its hash
    // function from `factory`. The unary plus passes UNUSED by value, so no
    // out-of-class definition of the constant is needed under C++11.
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when key is the reserved UNUSED value.
    bool lookup(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // Every bucket has been inspected: the table is completely full
            // and the key is absent. Without this check the scan would loop
            // forever. In a table with a free bucket this is never reached.
            if (probes >= table.size()) break;
            probes++;
            b = next_bucket(b);
        }
        update_counter(probes);
        return found;
    }

    // Add the key in the table (no-op if it is already present).
    // Throws runtime_error when key is UNUSED or when the table is full.
    void insert(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        // At least one bucket is free here, so the scan terminates.
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }
        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }
        update_counter(probes);
    }

    // Forget all statistics collected so far.
    void reset_counter() { ops = steps = max_ = 0; }

    // Average number of probed buckets per operation since the last reset
    // (0 when no operation was performed).
    double report_avg() { return ((double)steps) / max(1U, ops); }

    // Largest number of probed buckets of a single operation since the last reset.
    double report_max() { return max_; }

private:
    // Records one finished operation that inspected `probes` buckets.
    void update_counter(unsigned probes) {
        ops++;
        steps += probes;
        max_ = max(probes, max_);
    }

    // Bucket following `b`, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
};
// Usage test: measures the cost of insertions as the table fills up.
//
// A table with 2^20 buckets is filled in steps of 1% of its capacity with
// the keys 0..N-1 in random order. For each step the average number of
// probed buckets per insert is recorded. The whole run is repeated `retry`
// times with a fresh hash function; for every step one line
// "<percent> <mean> <standard deviation>" is printed.
//
// Throws runtime_error when max_usage is outside 0..100 (the key array would
// be read out of bounds) or when retry is not positive (division by zero).
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
    if (max_usage < 0 || max_usage > 100)
        throw runtime_error("usage_test: max_usage must be between 0 and 100");
    if (retry < 1)
        throw runtime_error("usage_test: retry must be positive");

    vector<double> avg(max_usage, 0.0);    // sum of per-step averages
    vector<double> avg2(max_usage, 0.0);   // sum of their squares
    const unsigned N = 1 << 20;
    const unsigned step_size = N / 100;

    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++) elements[i] = i;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);

        // Fisher-Yates shuffle driven by the global generator.
        for (unsigned i = 0; i < N-1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s*step_size + i]);
            const double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // Rounding can push E[x^2] - E[x]^2 slightly below zero, which would
        // make sqrt() return NaN; clamp the variance at zero.
        const double variance = std::max(0.0, avg2[i] - avg[i]*avg[i]);
        const double std_dev = sqrt(variance);
        printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev);
    }
}
// Grow test: measures the cost of lookups for growing table sizes.
//
// For every n in [begin, end) a table with N = 2^n buckets is filled to
// `usage` percent with keys from 0..N-1 in random order, then every key
// 0..N-1 is looked up. The counters are not reset between the two phases,
// so the reported value averages over the inserts and the lookups. This is
// repeated `retry` times with a fresh hash function; one line
// "<N> <mean> <standard deviation>" is printed per size.
//
// Throws runtime_error when usage is outside 0..100 (the key array would be
// read out of bounds), when retry is not positive (division by zero) or when
// the exponent range does not fit a 32-bit table size.
void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
               int begin = 7, int end = 22) {
    if (usage < 0 || usage > 100)
        throw runtime_error("grow_test: usage must be between 0 and 100");
    if (retry < 1)
        throw runtime_error("grow_test: retry must be positive");
    if (begin < 0 || end > 32)
        throw runtime_error("grow_test: table size exponents must be within 0..31");

    for (int n = begin; n < end; n++) {
        double avg = 0;    // sum of per-run averages
        double avg2 = 0;   // sum of their squares
        const unsigned N = 1U << n;
        const uint64_t to_insert = static_cast<uint64_t>(N) * usage / 100;

        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++) elements[i] = i;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);

            // Fisher-Yates shuffle driven by the global generator.
            for (unsigned i = 0; i < N-1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

            for (unsigned i = 0; i < to_insert; i++)
                H.insert(elements[i]);
            for (unsigned i = 0; i < N; i++)
                H.lookup(i);

            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
        }

        avg /= retry;
        avg2 /= retry;
        // Rounding can push E[x^2] - E[x]^2 slightly below zero, which would
        // make sqrt() return NaN; clamp the variance at zero.
        const double std_dev = sqrt(std::max(0.0, avg2 - avg*avg));
        // N is unsigned, so it is printed with %u.
        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
    }
}
int main(int argc, char** argv) {
vector<pair<string, HashFunctionFactory>> grow_tests = {
{"grow-ms-low", MultiplyShiftLowHash::factory},
{"grow-ms-high", MultiplyShiftHighHash::factory},
{"grow-poly-1", LinearHash::factory},
{"grow-poly-2", QuadraticHash::factory},
{"grow-tab", TabulationHash::factory}
};
vector<pair<string, HashFunctionFactory>> usage_tests = {
{"usage-ms-low", MultiplyShiftLowHash::factory},
{"usage-ms-high", MultiplyShiftHighHash::factory},
{"usage-poly-1", LinearHash::factory},
{"usage-poly-2", QuadraticHash::factory},
{"usage-tab", TabulationHash::factory}
};
if (argc != 3) goto fail;
rng = RandomGen(atoi(argv[2]));
for (auto t : grow_tests) {
if (t.first == argv[1]) {
grow_test(t.second);
return 0;
}
}
for (auto t : usage_tests) {
if (t.first == argv[1]) {
usage_test(t.second);
return 0;
}
}
fail:
printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]);
for (auto t : grow_tests) printf(" %s", t.first.c_str());
for (auto t : usage_tests) printf(" %s", t.first.c_str());
return 1;
}
#ifndef DS1_RANDOM_H
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
	uint64_t state[2];

	// Rotate x left by k bits. Only called with k = 55 and k = 36.
	uint64_t rotl(uint64_t x, int k)
	{
		const uint64_t upper = x << k;
		const uint64_t lower = x >> (64 - k);
		return upper | lower;
	}

    public:
	// Derive the two state words from the seed and discard the first
	// 100 outputs to warm the generator up.
	RandomGen(unsigned int seed)
	{
		// Both expressions are evaluated in unsigned int arithmetic
		// and only then widened to 64 bits.
		const unsigned int word0 = seed * 0xdeadbeef;
		const unsigned int word1 = seed ^ 0xc0de1234;
		state[0] = word0;
		state[1] = word1;

		int remaining = 100;
		while (remaining-- > 0)
			next_u64();
	}

	// Produce the next 64-bit value and advance the state.
	uint64_t next_u64(void)
	{
		const uint64_t a = state[0];
		const uint64_t b = state[1];
		const uint64_t mixed = a ^ b;

		state[0] = rotl(a, 55) ^ mixed ^ (mixed << 14);
		state[1] = rotl(mixed, 36);
		return a + b;
	}

	// Produce a 32-bit value: bits 11..42 of the next 64-bit output.
	uint32_t next_u32(void)
	{
		const uint64_t wide = next_u64();
		return static_cast<uint32_t>(wide >> 11);
	}

	// Produce a number between 0 and range-1.
	unsigned int next_range(unsigned int range)
	{
		/*
		 * The modulo makes the result slightly non-uniform unless
		 * range is a power of two; with a 64-bit source and a 32-bit
		 * range the bias is insignificant.
		 */
		const uint64_t wide = next_u64();
		return static_cast<unsigned int>(wide % range);
	}
};
#endif
#!/usr/bin/env python3
import random, sys
from math import sqrt
# Thin wrappers around the `random` module. All randomness in this file goes
# through these two functions, so another generator can be substituted by
# changing only them.
def rng_init(x):
    """Seed the generator with `x`."""
    return random.seed(x)


def rng_next_u32():
    """Return a random integer in the range 0 .. 2**32 - 1 (inclusive)."""
    return random.randint(0, 2**32 - 1)
class TabulationHash:
    """Tabulation hashing.

    A 32-bit key is viewed as four bytes. Each byte selects one entry from
    its own table of 256 random 32-bit values; the four selected values are
    XORed and the result is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        self.num_buckets = num_buckets
        # Four tables, filled one after another from the shared generator.
        self.tables = [[rng_next_u32() for _ in range(256)] for _ in range(4)]

    def __call__(self, key):
        """Return the bucket index assigned to `key`."""
        mixed = 0
        for position, table in enumerate(self.tables):
            byte = (key >> (8 * position)) & 0xff
            mixed ^= table[byte]
        return mixed % self.num_buckets
class PolynomialHash:
    """Polynomial hashing modulo a prime.

    A polynomial with `degree + 1` random coefficients is evaluated at the
    key modulo `prime` by Horner's scheme (the first stored coefficient is
    the leading one); the value is then reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets, degree, prime = 2147483647):
        self.num_buckets = num_buckets
        self.prime = prime
        self.coefs = [rng_next_u32() for _ in range(degree + 1)]

    def __call__(self, key):
        """Return the bucket index assigned to `key`."""
        value = 0
        for coefficient in self.coefs:
            value = value * key + coefficient
            value %= self.prime
        return value % self.num_buckets


def LinearHash(num_buckets):
    """Create a random degree-1 polynomial hash function."""
    return PolynomialHash(num_buckets, 1)


def QuadraticHash(num_buckets):
    """Create a random degree-2 polynomial hash function."""
    return PolynomialHash(num_buckets, 2)
class MultiplyShiftLowHash:
    """Multiply-shift hash function taking top bits of 32-bit word.

    The key is multiplied by a random odd 32-bit multiplier and the bits of
    the product just below bit 32 (as many as needed to index the buckets)
    are used as the bucket index. The number of buckets must be a power of
    two; an AssertionError is raised otherwise.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero has to be rejected explicitly: 0 & -1 is also zero.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftLowHash: num_buckets must be power of 2"
        self.mult = rng_next_u32() | 0x1   # force the multiplier to be odd
        self.shift = self._shift_for(num_buckets)

    @staticmethod
    def _shift_for(num_buckets):
        """Return 32 - log2(num_buckets), or 0 for a single bucket."""
        shift = 0
        tmp = num_buckets - 1
        # Count how far the mask must move left until its highest bit reaches
        # bit 31. With a single bucket the mask is 0 and can never reach it;
        # the `tmp != 0` guard stops the loop that used to run forever.
        while tmp != 0 and (0x80000000 & tmp) == 0:
            tmp <<= 1
            shift += 1
        return shift

    def __call__(self, key):
        """Return the bucket index assigned to `key`."""
        return ((key * self.mult) >> self.shift) & self.mask
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word.

    The key is multiplied by a random odd 64-bit multiplier; the product is
    shifted right by 32 bits and masked down to the bucket index. The number
    of buckets must be a power of two; an AssertionError is raised otherwise.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero has to be rejected explicitly: 0 & -1 is also zero.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Two 32-bit draws form the 64-bit multiplier; the low bit is forced on.
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        """Return the bucket index assigned to `key`."""
        return ((key * self.mult) >> 32) & self.mask
class HashTable:
    """Hash table with linear probing.

    Stores keys in a fixed-size list of buckets (None marks an empty bucket,
    so None itself cannot be stored). Every lookup and insert also records
    how many buckets it inspected; the statistics are available through
    report_avg() and report_max().
    """

    def __init__(self, hash_fun_factory, num_buckets):
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket after `b`, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        found = False
        steps = 1
        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                found = True
                break
            # Every bucket has been inspected: the table is completely full
            # and the key is absent. Without this check the scan would never
            # end. In a table with a free bucket this is never reached.
            if steps >= self._num_buckets:
                break
            steps += 1
            b = self._next_bucket(b)
        self._update_counter(steps)
        return found

    def insert(self, key):
        """Add the key in the table (no-op if it is already present).

        Raises AssertionError when the table has no free bucket left.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        # At least one bucket is free here, so the scan terminates.
        steps = 1
        b = self._hash(key)
        while self._table[b] is not None and self._table[b] != key:
            steps += 1
            b = self._next_bucket(b)
        if self._table[b] is None:
            self._table[b] = key
            # Keep the element count current; the full-table check above
            # depends on it.
            self._size += 1
        self._update_counter(steps)

    def _update_counter(self, steps):
        """Record one finished operation that inspected `steps` buckets."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Forget all statistics collected so far."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Average number of inspected buckets per operation since the last reset."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Largest number of buckets inspected by one operation since the last reset."""
        return self._max
def permute_list(l):
    """Shuffle the list `l` in place (Fisher-Yates) using the shared generator."""
    count = len(l)
    for pos in range(count - 1):
        # Pick a partner among the positions pos .. count-1.
        pick = pos + rng_next_u32() % (count - pos)
        l[pos], l[pick] = l[pick], l[pos]
def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
    """Measure the cost of insertions as the table fills up.

    A table with 2**20 buckets is filled in steps of 1% of its capacity with
    the keys 0..N-1 in random order. For each step the average number of
    probed buckets per insert is recorded. The run is repeated `retry` times
    with a fresh hash function; for every step one line
    "<percent> <mean> <standard deviation>" is printed.
    """
    avg = [0.0] * max_usage    # sum of per-step averages
    avg2 = [0.0] * max_usage   # sum of their squares
    N = 2**20
    step_size = N // 100
    elements = list(range(N))
    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)
        for s in range(max_usage):
            H.reset_counter()
            # Insert the shuffled keys, not the raw indices; otherwise the
            # permutation above would have no effect on the experiment.
            for x in elements[s*step_size : (s + 1)*step_size]:
                H.insert(x)
            step_avg = H.report_avg()
            avg[s] += step_avg
            avg2[s] += step_avg ** 2
    for i in range(max_usage):
        avg[i] /= retry
        avg2[i] /= retry
        # Rounding can push E[x^2] - E[x]^2 slightly below zero, which makes
        # sqrt() raise ValueError; clamp the variance at zero.
        std_dev = sqrt(max(0.0, avg2[i] - avg[i]**2))
        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 22):
    """Measure the cost of lookups for growing table sizes.

    For every n in range(begin, end) a table with N = 2**n buckets is filled
    to `usage` percent with keys from 0..N-1 in random order, then every key
    0..N-1 is looked up. The counters are not reset between the two phases,
    so the reported value averages over the inserts and the lookups. This is
    repeated `retry` times with a fresh hash function; one line
    "<N> <mean> <standard deviation>" is printed per size.
    """
    for n in range(begin, end):
        avg = 0.0    # sum of per-run averages
        avg2 = 0.0   # sum of their squares
        N = 2 ** n
        elements = list(range(N))
        for _ in range(retry):
            H = HashTable(hash_fun_factory, N)
            permute_list(elements)
            for x in elements[:N * usage // 100]:
                H.insert(x)
            for i in range(N):
                H.lookup(i)
            run_avg = H.report_avg()
            avg += run_avg
            avg2 += run_avg ** 2
        avg /= retry
        avg2 /= retry
        # Rounding can push E[x^2] - E[x]^2 slightly below zero, which makes
        # sqrt() raise ValueError; clamp the variance at zero.
        std_dev = sqrt(max(0.0, avg2 - avg**2))
        print("%i %.03f %.03f" % (N, avg, std_dev))
tests = {
"usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
"usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
"usage-poly-1": lambda: usage_test(LinearHash),
"usage-poly-2": lambda: usage_test(QuadraticHash),
"usage-tab": lambda: usage_test(TabulationHash),
"grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
"grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
"grow-poly-1": lambda: grow_test(LinearHash),
"grow-poly-2": lambda: grow_test(QuadraticHash),
"grow-tab": lambda: grow_test(TabulationHash),
}
if len(sys.argv) == 3:
test, student_id = sys.argv[1], sys.argv[2]
rng_init(int(student_id))
if test in tests:
tests[test]()
else:
raise ValueError("Unknown test {}".format(test))
else:
raise ValueError("Usage: {} <test> <student-id>".format(sys.argv[0]))
## Goal
The goal of this assignment is to experimentally evaluate a linear probing
hash table with different systems of hash functions.
You are given a test program (`hash_experiment`) which implements everything
needed to perform the following experiments:
- _Grow test:_ This test tries different sizes $N$ of the hash table and for each size
it inserts small keys in random order until 60% of the table is used
and then it performs lookup operation for keys $0,\ldots,N-1$.
- _Usage test:_ This test uses a hash table of size $2^{20}$. It performs insertions
to increase the usage of the table by 1%, reports the efficiency of the insert operation,
and repeats until the usage of the table reaches 90%.
Both tests measure the number of probed buckets per operation, are repeated 40 times,
and report the average and the standard deviation. Note that even with 40 repetitions
the reported numbers still depend quite a lot on the random seed used.
You should perform these experiments for 5 different classes of hash functions –
tabulation, multiply-shift which uses top bits of 32-bit word (`ms-low`),
multiply-shift which uses low bits of u