Commit 82e31d7f authored by Martin Mareš
Browse files

Hash experiment

parent 88492598
# Directory that contains random.h; override with `make INCLUDE=path`.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)

# Build the experiment binary; it is rebuilt whenever the source file or
# random.h changes. Recipe lines must start with a tab character.
hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@

# Remove the built binary.
.PHONY: clean
clean:
	rm -f hash_experiment
#include <vector>
#include <functional>
#include <algorithm>
#include <utility>
#include <stdexcept>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include "random.h"
using namespace std;
// Global random generator used by the hash functions and the tests.
// It starts with seed 42; main() replaces it with a generator seeded
// from the command line.
RandomGen rng(42);
// 32-bit unsigned integer used for keys and hash values.
typedef uint32_t uint;
// A hash function maps a key to a bucket index.
typedef function<uint(uint)> HashFunction;
// A factory creates a hash function for a table with num_buckets buckets.
typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is viewed as four bytes. Every byte position owns its own
 * table of 256 random 32-bit words. The four words selected by the bytes
 * of the key are XORed together and the result is reduced modulo the
 * number of buckets.
 */
class TabulationHash {
    enum { NUM_PARTS = 4, PART_SIZE = 256 };

    unsigned num_buckets;
    vector<uint> tables;    // NUM_PARTS tables stored back to back

    // Fill all tables with values drawn from the global generator.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(NUM_PARTS * PART_SIZE) {
        for (size_t i = 0; i < tables.size(); i++)
            tables[i] = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(TabulationHash(num_buckets));
    }

    // Hash the key to a bucket index in the range 0 .. num_buckets-1.
    uint operator()(uint key) {
        uint mixed = 0;
        for (unsigned part = 0; part < NUM_PARTS; part++) {
            unsigned byte = (key >> (8 * part)) & 0xff;
            mixed ^= tables[part * PART_SIZE + byte];
        }
        return mixed % num_buckets;
    }
};
// Polynomial hash: a polynomial with random coefficients is evaluated at the
// key modulo a prime and the result is reduced to a bucket index.
template < int degree, uint prime = 2147483647 >
class PolynomialHash {
    unsigned num_buckets;
    vector<uint> coefs;     // degree + 1 coefficients, highest power first

    // Draw all coefficients from the global generator.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
        for (size_t i = 0; i < coefs.size(); i++)
            coefs[i] = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(PolynomialHash(num_buckets));
    }

    // Hash the key to a bucket index in the range 0 .. num_buckets-1.
    uint operator()(uint key) {
        // Horner's scheme. The running value is below the prime and both the
        // key and the coefficient fit in 32 bits, so 64 bits never overflow.
        uint64_t value = 0;
        for (size_t i = 0; i < coefs.size(); i++) {
            value = value * key + coefs[i];
            value %= prime;
        }
        return (uint)(value % num_buckets);
    }
};

typedef PolynomialHash<1> LinearHash;
typedef PolynomialHash<2> QuadraticHash;
// Multiply-shift hash function taking top bits of 32-bit word.
//
// The key is multiplied by a random odd 32-bit number (the product wraps
// modulo 2^32) and the top log2(num_buckets) bits of the product form the
// bucket index. The number of buckets must be a power of two.
class MultiplyShiftLowHash {
    uint mult;      // random odd multiplier
    uint mask;      // num_buckets - 1
    int shift = 0;  // 32 - log2(num_buckets); 0 for a single bucket

    // Throws runtime_error when num_buckets is zero or not a power of two.
    MultiplyShiftLowHash(unsigned num_buckets) {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;
        // Zero would slip through the mask test, so it is rejected explicitly.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");

        // Count how far the mask has to move to reach the top bit.
        // With a single bucket the mask is zero: the loop must not run (it
        // would never terminate) and shift stays 0. The zero mask alone then
        // forces every hash to 0, and a shift by 32 bits is avoided.
        for (unsigned tmp = mask; tmp != 0 && (0x80000000U & tmp) == 0; tmp <<= 1)
            shift++;
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Hash the key to a bucket index in the range 0 .. num_buckets-1.
    uint operator()(uint key) {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hashing in 64 bits: the key is multiplied by a random odd
// 64-bit number and the bucket index is read from the lowest bits of the
// upper 32-bit half of the product.
class MultiplyShiftHighHash {
    uint mask;          // num_buckets - 1
    uint64_t mult;      // random odd multiplier

    // Throws runtime_error when num_buckets fails the power-of-two test.
    MultiplyShiftHighHash(unsigned num_buckets)
        : mask(num_buckets - 1), mult(rng.next_u64() | 0x1) {
        if ((num_buckets & mask) != 0)
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Hash the key to a bucket index.
    uint operator()(uint key) {
        uint64_t product = key * mult;      // wraps modulo 2^64
        uint64_t upper_half = product >> 32;
        return upper_half & mask;
    }
};
// Hash table with linear probing.
//
// Keys are stored directly in an array of buckets. Every lookup and insert
// counts the buckets it probes; the counters can be read with report_avg()
// and report_max() and cleared with reset_counter().
class HashTable {
    HashFunction hash;
    vector<uint> table;
    unsigned size = 0;      // number of keys stored in the table

    unsigned ops;           // operations since the last reset_counter()
    unsigned max_;          // most buckets probed by a single operation
    uint64_t steps;         // buckets probed by all operations together

public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint)0);

    // Create an empty table of num_buckets buckets with a hash function
    // obtained from the factory.
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        // The unary plus turns UNUSED into a temporary. The vector constructor
        // takes its fill value by reference, which would otherwise require an
        // out-of-class definition of the constant.
        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when key is UNUSED.
    bool lookup(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // Every bucket has been inspected: the table is completely full
            // and the key is absent. Without this check the loop would
            // never terminate.
            if (probes >= table.size()) break;
            probes++;
            b = next_bucket(b);
        }
        update_counter(probes);
        return found;
    }

    // Add the key in the table. Inserting a key that is already present
    // changes nothing but is still counted as an operation.
    // Throws runtime_error when key is UNUSED or no bucket is free.
    void insert(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        unsigned probes = 1;
        uint b = hash(key);
        // At least one bucket is free, so the scan always terminates.
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }
        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }
        update_counter(probes);
    }

    // Clear the probe statistics.
    void reset_counter() { ops = steps = max_ = 0; }
    // Average number of probed buckets per operation since the last reset.
    double report_avg() { return ((double)steps) / max(1U, ops); }
    // Largest number of buckets probed by one operation since the last reset.
    double report_max() { return max_; }

private:
    // Record one finished operation that probed the given number of buckets.
    void update_counter(unsigned probes) {
        ops++;
        steps += probes;
        max_ = max(probes, max_);
    }

    // Bucket following b, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
};
// Measure the cost of insertion as the table fills up.
//
// A table of 2^20 buckets is filled in steps of 1 % of its capacity with
// keys taken from a random permutation of 0 .. 2^20-1. For every step the
// average number of probed buckets per insert is recorded. The experiment is
// repeated `retry` times, each time with a new hash function from `factory`.
// For every step one line "<percent> <mean> <standard deviation>" is printed.
//
// max_usage is the number of 1 % steps; it must lie between 0 and 100,
// otherwise runtime_error is thrown (more steps would read past the keys).
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
    if (max_usage < 0 || max_usage > 100)
        throw runtime_error("usage_test: max_usage must be between 0 and 100");

    vector<double> avg(max_usage, 0.0);     // sum of per-step averages
    vector<double> avg2(max_usage, 0.0);    // sum of their squares

    unsigned N = 1 << 20;
    unsigned step_size = N / 100;

    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++) elements[i] = i;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);

        // Random permutation of the keys (Fisher-Yates shuffle).
        for (unsigned i = 0; i < N-1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s*step_size + i]);

            const double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // Rounding can push the variance slightly below zero when all runs
        // agree; clamp it so that sqrt() does not produce NaN.
        double std_dev = sqrt(max(0.0, avg2[i] - avg[i]*avg[i]));
        printf("%i %.03lf %.03lf\n", i+1, avg[i], std_dev);
    }
}
// Measure the cost of lookups for growing table sizes.
//
// For every n from `begin` to `end - 1`, a table of N = 2^n buckets is filled
// to `usage` percent with keys from a random permutation of 0 .. N-1 and then
// every key 0 .. N-1 is looked up. The average number of probed buckets per
// operation (inserts and lookups together) is recorded. The experiment is
// repeated `retry` times, each time with a new hash function from `factory`.
// For every size one line "<N> <mean> <standard deviation>" is printed.
//
// usage must lie between 0 and 100, otherwise runtime_error is thrown
// (a higher value would read past the keys).
void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
               int begin = 7, int end = 22) {
    if (usage < 0 || usage > 100)
        throw runtime_error("grow_test: usage must be between 0 and 100");

    for (int n = begin; n < end; n++) {
        double avg = 0;     // sum of per-run averages
        double avg2 = 0;    // sum of their squares
        unsigned N = 1 << n;
        // 64-bit arithmetic keeps N * usage from overflowing.
        const unsigned to_insert = (unsigned)(((uint64_t)N) * usage / 100);

        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++) elements[i] = i;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);

            // Random permutation of the keys (Fisher-Yates shuffle).
            for (unsigned i = 0; i < N-1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N-i))]);

            for (unsigned i = 0; i < to_insert; i++)
                H.insert(elements[i]);

            for (unsigned i = 0; i < N; i++)
                H.lookup(i);

            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
        }

        avg /= retry;
        avg2 /= retry;
        // Rounding can push the variance slightly below zero when all runs
        // agree; clamp it so that sqrt() does not produce NaN.
        double std_dev = sqrt(max(0.0, avg2 - avg*avg));

        // N is unsigned, so it is printed with %u.
        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
    }
}
int main(int argc, char** argv) {
vector<pair<string, HashFunctionFactory>> grow_tests = {
{"grow-ms-low", MultiplyShiftLowHash::factory},
{"grow-ms-high", MultiplyShiftHighHash::factory},
{"grow-poly-1", LinearHash::factory},
{"grow-poly-2", QuadraticHash::factory},
{"grow-tab", TabulationHash::factory}
};
vector<pair<string, HashFunctionFactory>> usage_tests = {
{"usage-ms-low", MultiplyShiftLowHash::factory},
{"usage-ms-high", MultiplyShiftHighHash::factory},
{"usage-poly-1", LinearHash::factory},
{"usage-poly-2", QuadraticHash::factory},
{"usage-tab", TabulationHash::factory}
};
if (argc != 3) goto fail;
rng = RandomGen(atoi(argv[2]));
for (auto t : grow_tests) {
if (t.first == argv[1]) {
grow_test(t.second);
return 0;
}
}
for (auto t : usage_tests) {
if (t.first == argv[1]) {
usage_test(t.second);
return 0;
}
}
fail:
printf("Usage: %s <test> <seed>\nAvailable tests are:", argv[0]);
for (auto t : grow_tests) printf(" %s", t.first.c_str());
for (auto t : usage_tests) printf(" %s", t.first.c_str());
return 1;
}
#ifndef DS1_RANDOM_H
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
    uint64_t state[2];

    // Rotate x to the left by k bits; k must be between 1 and 63.
    uint64_t rotl(uint64_t x, int k)
    {
        const uint64_t upper = x << k;
        const uint64_t lower = x >> (64 - k);
        return upper | lower;
    }

public:
    // Derive the initial state from the seed and discard the first
    // 100 outputs to warm the generator up.
    RandomGen(unsigned int seed)
    {
        state[1] = seed ^ 0xc0de1234;
        state[0] = seed * 0xdeadbeef;

        int remaining = 100;
        while (remaining-- > 0)
            next_u64();
    }

    // Return the next 64-bit output and advance the state.
    uint64_t next_u64(void)
    {
        const uint64_t a = state[0];
        const uint64_t b = state[1];
        const uint64_t mixed = a ^ b;

        state[0] = rotl(a, 55) ^ mixed ^ (mixed << 14);
        state[1] = rotl(mixed, 36);

        return a + b;
    }

    // Return a 32-bit number: bits 11 to 42 of the next 64-bit output.
    uint32_t next_u32(void)
    {
        const uint64_t wide = next_u64();
        return (uint32_t)(wide >> 11);
    }

    // Return a number between 0 and range-1.
    unsigned int next_range(unsigned int range)
    {
        /*
         * The remainder is perfectly uniform only for ranges that are
         * powers of two. With 64-bit random values and 32-bit ranges the
         * bias is insignificant.
         */
        const uint64_t wide = next_u64();
        return (unsigned int)(wide % range);
    }
};
#endif
#!/usr/bin/env python3
import random, sys
from math import sqrt
# Our wrapper of random so we can substitute it with another random generator
# rng_init(x): seed the generator with x.
rng_init = lambda x: random.seed(x)
# rng_next_u32(): return a random integer between 0 and 2**32 - 1 (inclusive).
rng_next_u32 = lambda: random.randint(0, 2**32 - 1)
class TabulationHash:
    """Tabulation hashing of 32-bit keys.

    The key is viewed as four bytes. Every byte position owns its own table
    of 256 random values. The four values selected by the bytes of the key
    are XORed together and reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        self.num_buckets = num_buckets
        self.tables = []
        for _ in range(4):
            self.tables.append([rng_next_u32() for _ in range(256)])

    def __call__(self, key):
        """Return the bucket index of the key."""
        mixed = 0
        for position, table in enumerate(self.tables):
            byte = (key >> (8 * position)) & 0xff
            mixed ^= table[byte]
        return mixed % self.num_buckets
class PolynomialHash:
    """Polynomial hash function.

    A polynomial with degree + 1 random coefficients is evaluated at the key
    modulo a prime; the result is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets, degree, prime = 2147483647):
        self.num_buckets = num_buckets
        self.prime = prime
        self.coefs = []
        for _ in range(degree + 1):
            self.coefs.append(rng_next_u32())

    def __call__(self, key):
        """Return the bucket index of the key."""
        # Horner's scheme, the first coefficient belongs to the highest power.
        value = 0
        modulus = self.prime
        for coefficient in self.coefs:
            value *= key
            value += coefficient
            value %= modulus
        return value % self.num_buckets

# Factories for polynomials of degree 1 and 2.
LinearHash = lambda num_buckets: PolynomialHash(num_buckets, 1)
QuadraticHash = lambda num_buckets: PolynomialHash(num_buckets, 2)
class MultiplyShiftLowHash:
    """Multiply-shift hash function taking top bits of 32-bit word

    The key is multiplied by a random odd 32-bit number and the bucket index
    is read from the top log2(num_buckets) bits of the low 32-bit word of the
    product. The number of buckets must be a power of two.

    Raises AssertionError when num_buckets is not a positive power of two.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero would slip through the mask test, so it is rejected explicitly.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftLowHash: num_buckets must be power of 2"
        self.mult = rng_next_u32() | 0x1

        # shift = 32 - log2(num_buckets): count how far the mask has to move
        # to reach bit 31. With a single bucket the mask is zero, the loop
        # must not run (it would never terminate) and the shift stays 0;
        # the zero mask alone then forces every hash to 0.
        self.shift = 0
        tmp = self.mask
        while tmp != 0 and (0x80000000 & tmp) == 0:
            tmp <<= 1
            self.shift += 1

    def __call__(self, key):
        """Return the bucket index of the key."""
        return ((key * self.mult) >> self.shift) & self.mask
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word

    The key is multiplied by a random odd 64-bit number and the bucket index
    is read from the bits of the product starting at bit 32. The number of
    buckets must be a power of two.

    Raises AssertionError when num_buckets is not a positive power of two.
    """

    def __init__(self, num_buckets):
        self.mask = num_buckets - 1
        # Zero would slip through the mask test, so it is rejected explicitly.
        assert num_buckets > 0 and (num_buckets & self.mask) == 0, \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Two 32-bit draws form the 64-bit multiplier; the lowest bit is set
        # to make it odd.
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        """Return the bucket index of the key."""
        return ((key * self.mult) >> 32) & self.mask
class HashTable:
    """Hash table with linear probing

    Keys are stored directly in a list of num_buckets slots; None marks an
    empty slot. Every lookup and insert counts the slots it probes. The
    counters are read with report_avg() and report_max() and cleared with
    reset_counter().
    """

    def __init__(self, hash_fun_factory, num_buckets):
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0      # number of keys stored in the table
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket following b, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        ret = False
        steps = 1

        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                ret = True
                break
            # Every slot has been inspected: the table is completely full
            # and the key is absent. Without this check the loop would
            # never terminate.
            if steps >= self._num_buckets:
                break
            steps += 1
            b = self._next_bucket(b)

        self._update_counter(steps)
        return ret

    def insert(self, key):
        """Add the key in the table.

        Inserting a key that is already present changes nothing but is still
        counted as an operation. Raises AssertionError when no slot is free.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        steps = 1

        b = self._hash(key)
        # At least one slot is free, so the scan always terminates.
        while self._table[b] is not None:
            if self._table[b] == key:
                break
            steps += 1
            b = self._next_bucket(b)
        else:
            # The scan stopped at an empty slot: the key is new.
            self._table[b] = key
            # Keep the size up to date, the fullness check above relies on it.
            self._size += 1

        self._update_counter(steps)

    def _update_counter(self, steps):
        """Record one finished operation that probed `steps` slots."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Clear the probe statistics."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Average number of probed slots per operation since the last reset."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Largest number of slots probed by one operation since the last reset."""
        return self._max
def permute_list(l):
    """Shuffle the list l in place (Fisher-Yates) using rng_next_u32()."""
    length = len(l)
    for src in range(length - 1):
        # Pick the element for position src among positions src .. length-1.
        remaining = length - src
        dst = src + rng_next_u32() % remaining
        l[src], l[dst] = l[dst], l[src]
def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
    """Measure the cost of insertion as the table fills up.

    A table of 2**19 buckets is filled in steps of 1 % of its capacity with
    keys taken from a random permutation of range(2**19). For every step the
    average number of probed buckets per insert is recorded. The experiment
    is repeated `retry` times, each time with a new hash function from
    `hash_fun_factory`. For every step one line
    "<percent> <mean> <standard deviation>" is printed.
    """
    avg = [0.0] * max_usage     # sum of per-step averages
    avg2 = [0.0] * max_usage    # sum of their squares

    N = 2**19
    step_size = N // 100
    elements = list(range(N))

    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)

        for s in range(max_usage):
            H.reset_counter()
            first = s * step_size
            # Insert the permuted keys, not the step indices themselves.
            for key in elements[first : first + step_size]:
                H.insert(key)
            step_avg = H.report_avg()
            avg[s] += step_avg
            avg2[s] += step_avg ** 2

    for i in range(max_usage):
        avg[i] /= retry
        avg2[i] /= retry
        # Rounding can push the variance slightly below zero when all runs
        # agree; sqrt() would then raise ValueError.
        std_dev = sqrt(max(0.0, avg2[i] - avg[i]**2))

        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
def grow_test(hash_fun_factory, usage = 60, retry = 40, begin = 7, end = 21):
    """Measure the cost of lookups for growing table sizes.

    For every n in range(begin, end), a table of N = 2**n buckets is filled
    to `usage` percent with keys from a random permutation of range(N) and
    then every key in range(N) is looked up. The average number of probed
    buckets per operation (inserts and lookups together) is recorded. The
    experiment is repeated `retry` times, each time with a new hash function
    from `hash_fun_factory`. For every size one line
    "<N> <mean> <standard deviation>" is printed.
    """
    for n in range(begin, end):
        avg = 0.0       # sum of per-run averages
        avg2 = 0.0      # sum of their squares
        N = 2 ** n
        elements = list(range(N))

        for _ in range(retry):
            H = HashTable(hash_fun_factory, N)
            permute_list(elements)

            for x in elements[:N * usage // 100]:
                H.insert(x)

            for i in range(N):
                H.lookup(i)

            run_avg = H.report_avg()
            avg += run_avg
            avg2 += run_avg ** 2

        avg /= retry
        avg2 /= retry
        # Rounding can push the variance slightly below zero when all runs
        # agree; sqrt() would then raise ValueError.
        std_dev = sqrt(max(0.0, avg2 - avg**2))

        print("%i %.03f %.03f" % (N, avg, std_dev))
tests = {
"usage-ms-low": lambda: usage_test(MultiplyShiftLowHash),
"usage-ms-high": lambda: usage_test(MultiplyShiftHighHash),
"usage-poly-1": lambda: usage_test(LinearHash),
"usage-poly-2": lambda: usage_test(QuadraticHash),
"usage-tab": lambda: usage_test(TabulationHash),
"grow-ms-low": lambda: grow_test(MultiplyShiftLowHash),
"grow-ms-high": lambda: grow_test(MultiplyShiftHighHash),
"grow-poly-1": lambda: grow_test(LinearHash),
"grow-poly-2": lambda: grow_test(QuadraticHash),