Commit 43914683 authored by Martin Mareš's avatar Martin Mareš
Browse files

Cuckoo hash and Hash experiment

parent 490de451
test: cuckoo_hash_test
./$<
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# Build the test binary. The headers stay in the prerequisite list so that
# editing them triggers a rebuild, but only the .cpp files are handed to the
# compiler: passing a header together with -o makes the compiler driver try
# to precompile it and refuse the command line.
cuckoo_hash_test: cuckoo_hash_test.cpp cuckoo_hash.h test_main.cpp $(INCLUDE)/random.h
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
clean:
rm -f cuckoo_hash_test
.PHONY: clean test
#include <string>
#include <vector>
#include <cstdint>
#include <iostream>
#include "random.h"
using namespace std;
// If the condition is not true, report an error and halt.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
void expect_failed(const string& message);
class TabulationHash {
    /*
     * Tabulation hashing of 32-bit keys.
     *
     * A key is cut into its four bytes. Byte number i selects one of the
     * 256 random words stored in tables[i]; the four selected words are
     * XORed together and the result is reduced modulo the bucket count.
     */
    unsigned num_buckets;
    uint32_t tables[4][256];

public:
    // Fill all four lookup tables, row by row, with values from random_gen.
    TabulationHash(unsigned num_buckets, RandomGen *random_gen)
        : num_buckets(num_buckets)
    {
        for (auto &row : tables)
            for (uint32_t &cell : row)
                cell = random_gen->next_u32();
    }

    // Map the key to a bucket index in the range [0, num_buckets).
    uint32_t hash(uint32_t key)
    {
        uint32_t mixed = 0;
        for (int part = 0; part < 4; part++) {
            // Extract byte number `part`, counting from the least significant.
            unsigned byte = (key >> (8 * part)) & 0xff;
            mixed ^= tables[part][byte];
        }
        return mixed % num_buckets;
    }
};
class CuckooTable {
/*
* Hash table with Cuckoo hashing.
*
* We have two hash functions, which map 32-bit keys to buckets of a common
* hash table. Unused buckets contain 0xffffffff.
*/
const uint32_t UNUSED = 0xffffffff;
// The array of buckets
vector<uint32_t> table;
unsigned num_buckets;
// Hash functions and the random generator used to create them
TabulationHash *hashes[2];
RandomGen *random_gen;
public:
CuckooTable(unsigned num_buckets)
{
// Initialize the table with the given number of buckets.
// The number of buckets is expected to stay constant.
this->num_buckets = num_buckets;
table.resize(num_buckets, UNUSED);
// Obtain two fresh hash functions.
random_gen = new RandomGen(42);
for (int i=0; i<2; i++)
hashes[i] = new TabulationHash(num_buckets, random_gen);
}
~CuckooTable()
{
for (int i=0; i<2; i++)
delete hashes[i];
delete random_gen;
}
bool lookup(uint32_t key)
{
// Check if the table contains the given key. Returns True or False.
unsigned h0 = hashes[0]->hash(key);
unsigned h1 = hashes[1]->hash(key);
return (table[h0] == key || table[h1] == key);
}
void insert(uint32_t key)
{
// Insert a new key to the table. Assumes that the key is not present yet.
EXPECT(key != UNUSED, "Keys must differ from UNUSED.");
// TODO: Implement
}
};
#include <functional>
#include <cstdlib>
#include <vector>
#include "cuckoo_hash.h"
// Insert the n keys 0, 37, 74, ... into a table whose bucket count is
// table_size_percentage percent of n, then verify that each inserted key is
// found and that its successor (key + 1) is not.
void simple_test(unsigned n, unsigned table_size_percentage)
{
    const unsigned buckets = n * table_size_percentage / 100;
    CuckooTable table(buckets);

    unsigned key = 0;
    for (unsigned done = 0; done < n; done++) {
        table.insert(key);
        key += 37;
    }

    key = 0;
    for (unsigned remaining = n; remaining > 0; remaining--) {
        EXPECT(table.lookup(key), "Item not present in table, but it should be.");
        EXPECT(!table.lookup(key + 1), "Item present in table, even though it should not be.");
        key += 37;
    }
}
// Run simple_test for n = min_n, min_n + step_n, ... while n stays below
// max_n, printing each n before its run.
void multiple_test(unsigned min_n, unsigned max_n, unsigned step_n, unsigned table_size_percentage)
{
    unsigned n = min_n;
    while (n < max_n) {
        printf("\tn=%u\n", n);
        simple_test(n, table_size_percentage);
        n += step_n;
    }
}
/*** A list of all tests ***/
// Each entry pairs a test name with a callable that runs it.
// The first number passed is the count of inserted keys, the last one is the
// table size as a percentage of that count (400 = four buckets per key).
// "tight" sweeps n from 20000 up to (but excluding) 40000 in steps of 500.
vector<pair<string, function<void()>>> tests = {
{ "small", [] { simple_test(100, 400); } },
{ "middle", [] { simple_test(31415, 300); } },
{ "big", [] { simple_test(1000000, 300); } },
{ "tight", [] { multiple_test(20000, 40000, 500, 205); } },
};
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
    uint64_t state[2];

    // Rotate x left by k bits. Only called with 0 < k < 64.
    uint64_t rotl(uint64_t x, int k)
    {
        const uint64_t shifted_up = x << k;
        const uint64_t wrapped = x >> (64 - k);
        return shifted_up | wrapped;
    }

public:
    // Seed both state words from the given seed and discard the first
    // 100 outputs to warm the generator up.
    RandomGen(unsigned int seed)
    {
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;
        int warmup = 100;
        while (warmup-- > 0)
            next_u64();
    }

    // Produce the next 64-bit value and advance the state.
    uint64_t next_u64(void)
    {
        const uint64_t first = state[0];
        const uint64_t sum = first + state[1];
        const uint64_t mixed = state[1] ^ first;
        state[0] = rotl(first, 55) ^ mixed ^ (mixed << 14);
        state[1] = rotl(mixed, 36);
        return sum;
    }

    // Produce a 32-bit value: bits 11..42 of the next 64-bit output.
    uint32_t next_u32(void)
    {
        return static_cast<uint32_t>(next_u64() >> 11);
    }

    // Produce a value in [0, range). The range must be non-zero.
    unsigned int next_range(unsigned int range)
    {
        /*
         * Taking a remainder is exactly uniform only for power-of-two
         * ranges; with a 64-bit source and a 32-bit range the bias is
         * too small to matter.
         */
        const uint64_t raw = next_u64();
        return static_cast<unsigned int>(raw % range);
    }
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
import random
import math
class TabulationHash:
    """Tabulation hashing of 32-bit keys.

    A key is cut into its four bytes. Byte number i selects one of the
    256 random 32-bit values in tables[i]; the four selected values are
    XORed together and reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        # Four tables of 256 random 32-bit values, generated table by table.
        self.tables = [
            [random.randint(0, 0xffffffff) for _ in range(256)]
            for _ in range(4)
        ]
        self.num_buckets = num_buckets

    def hash(self, key):
        """Return the bucket index of the key, in range(num_buckets)."""
        mixed = 0
        for part, table in enumerate(self.tables):
            # Byte number `part` of the key, least significant first.
            mixed ^= table[(key >> (8 * part)) & 0xff]
        return mixed % self.num_buckets
class CuckooTable:
    """Hash table with Cuckoo hashing.

    We have two hash functions, which map 32-bit keys to buckets of a common
    hash table. Every stored key sits in one of its two candidate buckets.
    Unused buckets contain None.

    The number of buckets never changes. For insertions to succeed quickly
    the table should have somewhat more than twice as many buckets as keys.
    """

    def __init__(self, num_buckets):
        """Initialize the table with the given number of buckets.

        The number of buckets is expected to stay constant.
        Raises ValueError if num_buckets is not positive.
        """
        if num_buckets <= 0:
            raise ValueError("The table needs at least one bucket.")

        # The array of buckets
        self.num_buckets = num_buckets
        self.table = [None] * num_buckets

        # Create two fresh hash functions
        self.hashes = [TabulationHash(num_buckets), TabulationHash(num_buckets)]

    def lookup(self, key):
        """Check if the table contains the given key. Returns True or False."""
        b0 = self.hashes[0].hash(key)
        b1 = self.hashes[1].hash(key)
        return self.table[b0] == key or self.table[b1] == key

    def insert(self, key):
        """Insert a new key to the table.

        Inserting a key that is already present is a no-op (a third copy
        could never be placed). Raises RuntimeError if a rehash is needed
        while the table holds more keys than buckets.
        """
        if self.lookup(key):
            return
        homeless = self._place(key)
        if homeless is not None:
            self._rehash(homeless)

    def _max_steps(self):
        """Timeout of one placement attempt: 6 * (floor(log2(buckets)) + 1).

        NOTE(review): a logarithmic bound in the spirit of the usual
        6 log n timeout; adjust if the lecture notes prescribe another value.
        """
        return 6 * self.num_buckets.bit_length()

    def _place(self, key):
        """Try to store the key, evicting residents to their other bucket.

        Returns None on success. On timeout returns the key that ended up
        without a bucket; all other keys are still stored in the table.
        """
        pos = self.hashes[0].hash(key)
        for _ in range(self._max_steps()):
            self.table[pos], key = key, self.table[pos]
            if key is None:
                return None
            # The evicted key continues to whichever bucket it was not in.
            b0 = self.hashes[0].hash(key)
            b1 = self.hashes[1].hash(key)
            pos = b1 if pos == b0 else b0
        return key

    def _rehash(self, homeless):
        """Choose new hash functions until every key can be placed again."""
        keys = [k for k in self.table if k is not None]
        keys.append(homeless)
        # More keys than buckets can never fit; fail instead of looping forever.
        if len(keys) > self.num_buckets:
            raise RuntimeError("Cuckoo table is full.")

        while True:
            self.hashes = [TabulationHash(self.num_buckets),
                           TabulationHash(self.num_buckets)]
            self.table = [None] * self.num_buckets
            # all() stops at the first key that could not be placed.
            if all(self._place(k) is None for k in keys):
                return
#!/usr/bin/env python3
import sys
import random
from cuckoo_hash import CuckooTable
def simple_test(n, table_size_percentage):
    """Insert n keys and verify the table contents.

    The table gets n * table_size_percentage // 100 buckets. The keys are
    the arithmetic progression 0, 37, 74, ...; afterwards every key must be
    found and every key + 1 must be absent. The random module is re-seeded
    with 42 first.
    """
    random.seed(42)
    num_buckets = n * table_size_percentage // 100
    table = CuckooTable(num_buckets)

    # The n multiples of 37 starting from zero
    keys = range(0, 37 * n, 37)

    for key in keys:
        table.insert(key)

    for key in keys:
        assert table.lookup(key), "Item not present in table, but it should be."
        assert not table.lookup(key + 1), "Item present in table, even though it should not be."
def multiple_test(min_n, max_n, step_n, table_size_percentage):
    """Run simple_test for every n in range(min_n, max_n, step_n).

    Each n is printed (preceded by a tab) before its run.
    """
    sizes = range(min_n, max_n, step_n)
    for n in sizes:
        print("\tn=" + str(n))
        simple_test(n, table_size_percentage)
# A list of all tests
# Each entry is a (name, callable) pair. The first number passed is the count
# of inserted keys, the last one is the table size as a percentage of that
# count (400 = four buckets per key). "tight" sweeps n over
# range(20000, 40000, 500).
tests = [
("small", lambda: simple_test(100, 400)),
("middle", lambda: simple_test(31415, 300)),
("big", lambda: simple_test(1000000, 300)),
("tight", lambda: multiple_test(20000, 40000, 500, 205)),
]
if __name__ == "__main__":
    # Run the tests named on the command line, or all tests when no names
    # are given. An unknown name raises ValueError.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # Report the name that was requested; `name` would be the last
            # registered test at this point.
            raise ValueError("Unknown test {}".format(required_test))
Implement a Cuckoo hash table with simple tabulation hashing.
You are given skeleton code which defines the table, implements
`lookup()`, and provides hash functions. You have to add an `insert()`
method.
If too many elements are moved during a single insert, the table must
be rehashed with new hash functions. See lecture notes for the particular
bounds.
The size of the table should stay constant
throughout the existence of the data structure.
# Directory containing random.h; may be overridden from the environment
# or the command line.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# Passed to the experiment as its second argument; set it when invoking make.
STUDENT_ID ?= PLEASE_SET_STUDENT_ID
# Names of the hash function variants the experiment is run with.
HASHFUNCS=ms-low ms-high poly-1 poly-2 tab
.PHONY: test
# Produce one output file per (experiment, hash function) combination.
test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))
# $* is the pattern stem (e.g. grow-tab); it becomes the first argument of
# the experiment, whose standard output is stored in the target file.
out/t-%: hash_experiment
@mkdir -p out
./hash_experiment $* $(STUDENT_ID) >$@
# Only the .cpp file is compiled; random.h is listed to trigger rebuilds.
hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@
.PHONY: clean
# Remove the binary and all experiment outputs.
clean:
rm -f hash_experiment
rm -rf out
#include <vector>
#include <functional>
#include <algorithm>
#include <utility>
#include <stdexcept>
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include "random.h"
using namespace std;
RandomGen rng(42);
typedef uint32_t uint;
typedef function<uint(uint)> HashFunction;
typedef function<HashFunction(unsigned num_buckets)> HashFunctionFactory;
/*
* Hash function for hashing by tabulation.
*
* The 32-bit key is split to four 8-bit parts. Each part indexes
* a separate table of 256 randomly generated values. Obtained values
* are XORed together.
*/
class TabulationHash {
    unsigned num_buckets;
    // Four lookup tables of 256 entries each, stored back to back:
    // table number i occupies indices 256*i .. 256*i + 255.
    vector<uint> tables;

    // Fill all 1024 table entries, in index order, from the global generator.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256) {
        for (size_t i = 0; i < tables.size(); i++)
            tables[i] = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for the given bucket count.
    static HashFunction factory(unsigned num_buckets) {
        TabulationHash instance(num_buckets);
        return HashFunction(instance);
    }

    // Map the key to a bucket index in the range [0, num_buckets).
    uint operator()(uint key) {
        uint mixed = 0;
        for (unsigned part = 0; part < 4; part++) {
            // Byte number `part` of the key, least significant first.
            unsigned byte = (key >> (8 * part)) & 0xff;
            mixed ^= tables[part * 256 + byte];
        }
        return mixed % num_buckets;
    }
};
// Hash function evaluating a random polynomial of the given degree in the
// key, modulo a prime, and reducing the result modulo the bucket count.
template < int degree, uint prime = 2147483647 >
class PolynomialHash {
    unsigned num_buckets;
    // The degree + 1 coefficients, leading coefficient first.
    vector<uint> coefs;

    // Draw all coefficients, in order, from the global generator.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1) {
        for (size_t i = 0; i < coefs.size(); i++)
            coefs[i] = rng.next_u32();
    }

public:
    // Create a freshly randomized hash function for the given bucket count.
    static HashFunction factory(unsigned num_buckets) {
        PolynomialHash instance(num_buckets);
        return HashFunction(instance);
    }

    // Evaluate the polynomial by Horner's rule using 64-bit arithmetic.
    uint operator()(uint key) {
        uint64_t acc = 0;
        for (size_t i = 0; i < coefs.size(); i++) {
            acc *= key;
            acc += coefs[i];
            acc %= prime;
        }
        return (uint)(acc % num_buckets);
    }
};
typedef PolynomialHash<1> LinearHash;
typedef PolynomialHash<2> QuadraticHash;
// Multiply-shift hash function taking top bits of 32-bit word:
// the key is multiplied by a random odd 32-bit constant and the highest
// log2(num_buckets) bits of the 32-bit product form the bucket index.
class MultiplyShiftLowHash {
    uint mult;
    uint mask;
    int shift = 0;

    // Throws runtime_error unless num_buckets is a power of two (zero is not).
    MultiplyShiftLowHash(unsigned num_buckets) {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;
        // For num_buckets == 0 the mask wraps to all ones and would pass the
        // bit test, so zero has to be rejected explicitly.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");

        // shift = number of leading zero bits of the mask. With a single
        // bucket the mask is zero: stop immediately (the loop would never see
        // a set top bit) and keep shift at 0; the mask then forces index 0.
        unsigned tmp = mask;
        while (tmp != 0 && (0x80000000U & tmp) == 0) {
            tmp <<= 1;
            shift++;
        }
    }

public:
    // Create a freshly randomized hash function for the given bucket count.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Map the key to a bucket index in the range [0, num_buckets).
    uint operator()(uint key) {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hash function taking low bits of upper half of 64-bit word:
// the key is multiplied by a random odd 64-bit constant and the lowest
// log2(num_buckets) bits of the product's upper 32 bits form the bucket index.
class MultiplyShiftHighHash {
    uint mask;
    uint64_t mult;

    // Throws runtime_error unless num_buckets is a power of two (zero is not).
    MultiplyShiftHighHash(unsigned num_buckets) {
        mult = rng.next_u64() | 0x1;
        mask = num_buckets - 1;
        // For num_buckets == 0 the mask wraps to all ones and would pass the
        // bit test, so zero has to be rejected explicitly.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

public:
    // Create a freshly randomized hash function for the given bucket count.
    static HashFunction factory(unsigned num_buckets) {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Map the key to a bucket index in the range [0, num_buckets).
    uint operator()(uint key) {
        return ((key * mult) >> 32) & mask;
    }
};
// Hash table with linear probing.
//
// Keys are stored in a fixed-size array of buckets. A key goes to the first
// unused bucket at or after its hash position, wrapping around at the end.
// The table also counts how many buckets each operation examined.
class HashTable {
    HashFunction hash;
    vector<uint> table;
    unsigned size = 0;      // Number of keys currently stored

    unsigned ops;           // Operations since the last reset_counter()
    unsigned max_;          // Largest number of buckets examined by one operation
    uint64_t steps;         // Total number of buckets examined

public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint)0);

    // Create an empty table with num_buckets buckets and a hash function
    // obtained from the factory. Throws runtime_error for an empty table.
    // The unary plus passes UNUSED by value, so the constant needs no
    // out-of-class definition.
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        hash(factory(num_buckets)), table(num_buckets, +UNUSED) {
        if (num_buckets == 0)
            throw runtime_error("HashTable: num_buckets must be positive");
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when asked for the reserved UNUSED value.
    bool lookup(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // In a completely full table there is no unused bucket to stop
            // at, so give up once every bucket has been examined.
            if (probes >= table.size()) break;
            probes++;
            b = next_bucket(b);
        }

        update_counter(probes);
        return found;
    }

    // Add the key in the table; a key that is already present is left alone.
    // Throws runtime_error for the reserved UNUSED value or when the table
    // has no unused bucket left.
    void insert(uint key) {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        // At least one bucket is unused here, so the scan terminates.
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }

        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }

        update_counter(probes);
    }

    // Forget all statistics collected so far.
    void reset_counter() { ops = steps = max_ = 0; }

    // Average number of buckets examined per operation since the last reset
    // (0 when there was no operation).
    double report_avg() { return ((double)steps) / max(1U, ops); }

    // Largest number of buckets examined by one operation since the last reset.
    double report_max() { return max_; }

private:
    // Account one finished operation which examined the given number of buckets.
    void update_counter(unsigned steps) {
        ops++;
        this->steps += steps;
        max_ = max(steps, max_);
    }

    // The bucket following b, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b) { return (b + 1) % table.size(); }
};
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40) {
vector<double> avg(max_usage, 0.0);
vector<double> avg2(max_usage, 0.0);
unsigned N = 1 << 20;
unsigned step_size = N / 100;
vector<uint> elements(N);
for (unsigned i = 0; i < N; i++) elements[i] = i;