Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
datovky
assignments
Commits
a13eda18
Commit
a13eda18
authored
Apr 19, 2022
by
David Mareček
Browse files
hash experiment
parent
925841cc
Changes
6
Hide whitespace changes
Inline
Side-by-side
09-hash_experiment/cpp/Makefile
0 → 100644
View file @
a13eda18
# Build and run the C++ hashing experiments.
#
#   make test STUDENT_ID=<id>   run every experiment, results go to out/t-*
#   make clean                  remove the binary and all results

INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
STUDENT_ID ?= PLEASE_SET_STUDENT_ID

HASHFUNCS=ms-low ms-high poly-1 poly-2 tab

# If an experiment fails, remove its partially written result file so that
# the next "make test" does not mistake it for a finished run.
.DELETE_ON_ERROR:

.PHONY: test
test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))

# out/t-<test> holds the standard output of "./hash_experiment <test> <seed>".
out/t-%: hash_experiment
	@mkdir -p out
	./hash_experiment $* $(STUDENT_ID) > $@

hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@

.PHONY: clean
clean:
	rm -f hash_experiment
	rm -rf out
09-hash_experiment/cpp/hash_experiment.cpp
0 → 100644
View file @
a13eda18
#include <vector>
#include <functional>
#include <algorithm>
#include <utility>
#include <stdexcept>
#include <string>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include "random.h"
using namespace std;

// Generator shared by all hash functions and by the experiments.
RandomGen rng(42);

// 32-bit unsigned integer, used for keys and for bucket indices.
using uint = uint32_t;

// A hash function maps a key to a bucket index.
using HashFunction = function<uint(uint)>;

// A factory creates a hash function for a table with num_buckets buckets.
using HashFunctionFactory = function<HashFunction(unsigned num_buckets)>;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is cut into four bytes. Every byte selects one entry from
 * its own table of 256 random words; the four selected words are XORed
 * together and the result is reduced modulo the number of buckets.
 */
class TabulationHash {
    unsigned num_buckets;
    vector<uint> tables;    // the four 256-entry tables stored back to back

    // Fill all four tables with words drawn from the global generator.
    TabulationHash(unsigned num_buckets) : num_buckets(num_buckets), tables(4 * 256)
    {
        for (unsigned i = 0; i < tables.size(); i++)
            tables[i] = rng.next_u32();
    }

  public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(TabulationHash(num_buckets));
    }

    // Return the bucket of the given key.
    uint operator()(uint key)
    {
        uint mixed = 0;
        for (unsigned part = 0; part < 4; part++) {
            uint byte = (key >> (8 * part)) & 0xff;
            // Table number `part` occupies indices part*256 .. part*256+255.
            mixed ^= tables[(part << 8) | byte];
        }
        return mixed % num_buckets;
    }
};
// Polynomial hashing: a polynomial with random coefficients is evaluated
// at the key modulo a prime and the value is reduced to a bucket index.
template <int degree, uint prime = 2147483647>
class PolynomialHash {
    unsigned num_buckets;
    vector<uint> coefs;     // degree+1 coefficients, the leading one first

    // Draw all coefficients from the global generator.
    PolynomialHash(unsigned num_buckets) : num_buckets(num_buckets), coefs(degree + 1)
    {
        for (unsigned i = 0; i < coefs.size(); i++)
            coefs[i] = rng.next_u32();
    }

  public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(PolynomialHash(num_buckets));
    }

    // Return the bucket of the given key.
    uint operator()(uint key)
    {
        // Horner's scheme in 64-bit arithmetic, reduced after every step.
        uint64_t value = 0;
        for (unsigned i = 0; i < coefs.size(); i++)
            value = (value * key + coefs[i]) % prime;
        return (uint)(value % num_buckets);
    }
};
// Polynomial hash functions of degree 1 and degree 2.
using LinearHash = PolynomialHash<1>;
using QuadraticHash = PolynomialHash<2>;
// Multiply-shift hash function taking top bits of 32-bit word
//
// The key is multiplied by a random odd 32-bit number (modulo 2^32) and the
// top log2(num_buckets) bits of the product are used as the bucket index.
class MultiplyShiftLowHash {
    uint mult;      // random odd multiplier
    uint mask;      // num_buckets - 1
    int shift = 0;  // 32 - log2(num_buckets)

    // Throws runtime_error unless num_buckets is a power of two.
    MultiplyShiftLowHash(unsigned num_buckets)
    {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;

        // Zero must be rejected explicitly: 0 & (0 - 1) is also zero.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");

        // Count the bits of the bucket index. Unlike scanning for the top
        // set bit of num_buckets - 1, this also terminates for a single bucket.
        int bits = 0;
        for (unsigned rest = num_buckets; rest > 1; rest >>= 1)
            bits++;
        shift = 32 - bits;
    }

  public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Return the bucket of the given key.
    uint operator()(uint key)
    {
        uint product = key * mult;
        // With a single bucket the shift is 32, which is only defined on
        // a type wider than 32 bits, so shift the widened product.
        return (uint)((uint64_t) product >> shift) & mask;
    }
};
// Multiply-shift hash function taking low bits of upper half of 64-bit word
//
// The key is multiplied by a random odd 64-bit number (modulo 2^64) and the
// lowest log2(num_buckets) bits of the upper half of the product are used
// as the bucket index.
class MultiplyShiftHighHash {
    uint mask;      // num_buckets - 1
    uint64_t mult;  // random odd multiplier

    // Throws runtime_error unless num_buckets is a power of two.
    MultiplyShiftHighHash(unsigned num_buckets)
    {
        mult = rng.next_u64() | 0x1;
        mask = num_buckets - 1;

        // Zero must be rejected explicitly: 0 & (0 - 1) is also zero.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

  public:
    // Create a freshly randomized hash function for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Return the bucket of the given key.
    uint operator()(uint key)
    {
        uint64_t product = key * mult;
        return (product >> 32) & mask;
    }
};
// Hash table with linear probing
//
// Besides storing keys, the table counts how many buckets each operation
// had to probe, so that experiments can report the average and the maximum.
class HashTable {
    HashFunction hash;
    vector<uint> table;
    unsigned size = 0;      // number of keys currently stored

    // Statistics collected since the last reset_counter().
    unsigned ops;           // number of lookup/insert operations
    unsigned max_;          // most buckets probed by a single operation
    uint64_t steps;         // total number of buckets probed

  public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint) 0);

    // Create an empty table with num_buckets buckets whose hash function
    // is obtained from the factory.
    HashTable(const HashFunctionFactory& factory, unsigned num_buckets) :
        hash(factory(num_buckets)),
        // The unary plus passes a temporary copy of UNUSED, so the constant
        // itself is never bound to a reference and needs no out-of-class
        // definition.
        table(num_buckets, +UNUSED)
    {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when asked for the reserved value UNUSED.
    bool lookup(uint key)
    {
        if (key == UNUSED) throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);

        // A completely full table has no unused bucket to stop at, so the
        // search is also cut off once every bucket has been visited.
        while (table[b] != UNUSED && probes <= table.size()) {
            if (table[b] == key) {
                found = true;
                break;
            }
            probes++;
            b = next_bucket(b);
        }

        update_counter(probes);
        return found;
    }

    // Add the key in the table. Inserting a key which is already present
    // leaves the table unchanged.
    // Throws runtime_error for the reserved value UNUSED and when the
    // table is already full.
    void insert(uint key)
    {
        if (key == UNUSED) throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size()) throw runtime_error("Insert: Table is full");

        unsigned probes = 1;
        uint b = hash(key);

        // The table is not full here, so an unused bucket always exists.
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }

        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }

        update_counter(probes);
    }

    // Forget all statistics collected so far.
    void reset_counter()
    {
        ops = 0;
        steps = 0;
        max_ = 0;
    }

    // Average number of buckets probed per operation since the last reset
    // (zero when no operation was performed).
    double report_avg() const
    {
        return ((double) steps) / max(1U, ops);
    }

    // Largest number of buckets probed by one operation since the last reset.
    double report_max() const
    {
        return max_;
    }

  private:
    // Account for one finished operation which probed the given number of buckets.
    void update_counter(unsigned probes)
    {
        ops++;
        steps += probes;
        max_ = max(probes, max_);
    }

    // Bucket following b, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b)
    {
        return (b + 1) % table.size();
    }
};
// Measure how the cost of insertion depends on how full the table is.
//
// A table with 2^20 buckets is filled in steps of 1 % of its size with keys
// taken from a random permutation of 0 .. 2^20-1, up to max_usage percent.
// The whole process is repeated retry times with a fresh hash function.
// For every step, one line "<percent> <average> <standard deviation>" of
// the per-step average number of probed buckets is printed.
//
// Throws runtime_error when max_usage is not between 0 and 100.
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40)
{
    // More than 100 steps would read past the end of the key array.
    if (max_usage < 0 || max_usage > 100)
        throw runtime_error("usage_test: max_usage must be between 0 and 100");

    vector<double> avg(max_usage, 0.0);
    vector<double> avg2(max_usage, 0.0);

    unsigned N = 1 << 20;
    unsigned step_size = N / 100;

    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++)
        elements[i] = i;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);

        // Random permutation of the keys (Fisher-Yates shuffle).
        for (unsigned i = 0; i < N - 1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N - i))]);

        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s * step_size + i]);

            double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // Rounding errors can push the variance slightly below zero.
        double variance = max(0.0, avg2[i] - avg[i] * avg[i]);
        double std_dev = sqrt(variance);

        printf("%i %.03lf %.03lf\n", i + 1, avg[i], std_dev);
    }
}
// Measure how the cost of operations depends on the size of the table.
//
// For every n from begin to end-1, a table with N = 2^n buckets is filled
// to the given usage (in percent) with keys from a random permutation of
// 0 .. N-1 and then all keys 0 .. N-1 are looked up. This is repeated
// retry times with a fresh hash function. For every N, one line
// "<N> <average> <standard deviation>" of the per-run average number of
// probed buckets is printed.
//
// Throws runtime_error when usage is not between 0 and 100 or when the
// range of table sizes does not fit into 32 bits.
void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40, int begin = 7, int end = 22)
{
    // Inserting more than N keys would read past the end of the key array.
    if (usage < 0 || usage > 100)
        throw runtime_error("grow_test: usage must be between 0 and 100");
    if (begin < 0 || end > 32)
        throw runtime_error("grow_test: table sizes must be between 2^0 and 2^31");

    for (int n = begin; n < end; n++) {
        double avg = 0;
        double avg2 = 0;
        unsigned N = 1U << n;
        const uint64_t to_insert = ((uint64_t) N) * usage / 100;

        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++)
            elements[i] = i;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);

            // Random permutation of the keys (Fisher-Yates shuffle).
            for (unsigned i = 0; i < N - 1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N - i))]);

            for (unsigned i = 0; i < to_insert; i++)
                H.insert(elements[i]);

            for (unsigned i = 0; i < N; i++)
                H.lookup(i);

            double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
        }

        avg /= retry;
        avg2 /= retry;
        // Rounding errors can push the variance slightly below zero.
        double variance = max(0.0, avg2 - avg * avg);
        double std_dev = sqrt(variance);

        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
    }
}
int
main
(
int
argc
,
char
**
argv
)
{
vector
<
pair
<
string
,
HashFunctionFactory
>>
grow_tests
=
{
{
"grow-ms-low"
,
MultiplyShiftLowHash
::
factory
},
{
"grow-ms-high"
,
MultiplyShiftHighHash
::
factory
},
{
"grow-poly-1"
,
LinearHash
::
factory
},
{
"grow-poly-2"
,
QuadraticHash
::
factory
},
{
"grow-tab"
,
TabulationHash
::
factory
}
};
vector
<
pair
<
string
,
HashFunctionFactory
>>
usage_tests
=
{
{
"usage-ms-low"
,
MultiplyShiftLowHash
::
factory
},
{
"usage-ms-high"
,
MultiplyShiftHighHash
::
factory
},
{
"usage-poly-1"
,
LinearHash
::
factory
},
{
"usage-poly-2"
,
QuadraticHash
::
factory
},
{
"usage-tab"
,
TabulationHash
::
factory
}
};
if
(
argc
!=
3
)
goto
fail
;
rng
=
RandomGen
(
atoi
(
argv
[
2
]));
for
(
auto
t
:
grow_tests
)
{
if
(
t
.
first
==
argv
[
1
])
{
grow_test
(
t
.
second
);
return
0
;
}
}
for
(
auto
t
:
usage_tests
)
{
if
(
t
.
first
==
argv
[
1
])
{
usage_test
(
t
.
second
);
return
0
;
}
}
fail:
printf
(
"Usage: %s <test> <seed>
\n
Available tests are:"
,
argv
[
0
]);
for
(
auto
t
:
grow_tests
)
printf
(
" %s"
,
t
.
first
.
c_str
());
for
(
auto
t
:
usage_tests
)
printf
(
" %s"
,
t
.
first
.
c_str
());
return
1
;
}
09-hash_experiment/cpp/random.h
0 → 100644
View file @
a13eda18
#ifndef DS1_RANDOM_H
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
    uint64_t state[2];

    // Rotate x to the left by k bits (used with k = 55 and k = 36).
    uint64_t rotl(uint64_t x, int k)
    {
        uint64_t upper = x << k;
        uint64_t lower = x >> (64 - k);
        return upper | lower;
    }

  public:
    // Set up the state from the seed and discard the first 100 outputs.
    RandomGen(unsigned int seed)
    {
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;

        int warmup = 100;
        while (warmup-- > 0)
            next_u64();
    }

    // Return the next 64-bit output and advance the state.
    uint64_t next_u64(void)
    {
        const uint64_t a = state[0];
        uint64_t b = state[1];
        const uint64_t sum = a + b;

        b ^= a;
        state[0] = rotl(a, 55) ^ b ^ (b << 14);
        state[1] = rotl(b, 36);

        return sum;
    }

    // Return a 32-bit number: bits 11 to 42 of the next 64-bit output.
    uint32_t next_u32(void)
    {
        return static_cast<uint32_t>(next_u64() >> 11);
    }

    // Return a number between 0 and range-1.
    unsigned int next_range(unsigned int range)
    {
        /*
         * Plain reduction modulo range. The result is exactly uniform only
         * when range is a power of two, but with 64-bit random values and
         * 32-bit ranges the bias is insignificant.
         */
        return static_cast<unsigned int>(next_u64() % range);
    }
};
#endif
09-hash_experiment/python/Makefile
0 → 100644
View file @
a13eda18
# Run the Python hashing experiments.
#
#   make test STUDENT_ID=<id>   run every experiment, results go to out/t-*
#   make clean                  remove all results

STUDENT_ID ?= PLEASE_SET_STUDENT_ID

HASHFUNCS=ms-low ms-high poly-1 poly-2 tab

# If an experiment fails, remove its partially written result file so that
# the next "make test" does not mistake it for a finished run.
.DELETE_ON_ERROR:

.PHONY: test
test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))

# out/t-<test> holds the standard output of "./hash_experiment.py <test> <seed>".
out/t-%: hash_experiment.py
	@mkdir -p out
	./hash_experiment.py $* $(STUDENT_ID) > $@

.PHONY: clean
clean:
	rm -rf out
09-hash_experiment/python/hash_experiment.py
0 → 100644
View file @
a13eda18
#!/usr/bin/env python3
import
random
,
sys
from
math
import
sqrt
# All randomness goes through these two wrappers, so that the generator
# can be replaced by a different one in a single place.

def rng_init(x):
    """Seed the random generator with x."""
    return random.seed(x)


def rng_next_u32():
    """Return a random integer between 0 and 2**32 - 1 (both inclusive)."""
    return random.randint(0, 0xFFFFFFFF)
class TabulationHash:
    """Tabulation hashing.

    A 32-bit key is cut into four bytes. Every byte selects one entry from
    its own table of 256 random values; the four selected values are XORed
    together and the result is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        self.num_buckets = num_buckets
        # Four tables with 256 random 32-bit values each.
        self.tables = [[rng_next_u32() for _ in range(256)] for _ in range(4)]

    def __call__(self, key):
        """Return the bucket of the given key."""
        mixed = 0
        for part, table in enumerate(self.tables):
            byte = (key >> (8 * part)) & 0xff
            mixed ^= table[byte]
        return mixed % self.num_buckets
class PolynomialHash:
    """Polynomial hashing.

    A polynomial with random coefficients is evaluated at the key modulo
    a prime and the value is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets, degree, prime = 2147483647):
        self.num_buckets = num_buckets
        self.prime = prime
        # degree + 1 random coefficients, the leading one first.
        self.coefs = []
        for _ in range(degree + 1):
            self.coefs.append(rng_next_u32())

    def __call__(self, key):
        """Return the bucket of the given key."""
        modulus = self.prime
        value = 0
        # Horner's scheme, reduced modulo the prime after every step.
        for coef in self.coefs:
            value = (value * key + coef) % modulus
        return value % self.num_buckets
def LinearHash(num_buckets):
    """Create a polynomial hash function of degree 1 for num_buckets buckets."""
    return PolynomialHash(num_buckets, 1)


def QuadraticHash(num_buckets):
    """Create a polynomial hash function of degree 2 for num_buckets buckets."""
    return PolynomialHash(num_buckets, 2)
class MultiplyShiftLowHash:
    """Multiply-shift hash function taking top bits of 32-bit word

    The key is multiplied by a random odd 32-bit number and the bits of the
    product just below bit 32 (as many as a bucket index needs) are used
    as the bucket index.
    """

    def __init__(self, num_buckets):
        """Create a hash function for num_buckets buckets.

        Fails with AssertionError unless num_buckets is a power of two.
        """
        self.mask = num_buckets - 1
        # Zero must be rejected explicitly: 0 & (0 - 1) is also zero.
        assert num_buckets > 0 and (num_buckets & self.mask == 0), \
            "MultiplyShiftLowHash: num_buckets must be power of 2"
        self.mult = rng_next_u32() | 0x1

        # 32 minus the number of bits of a bucket index. Computed directly,
        # because scanning num_buckets - 1 for its top bit never finishes
        # for a single bucket (there is no set bit to find).
        self.shift = max(0, 32 - (num_buckets.bit_length() - 1))

    def __call__(self, key):
        """Return the bucket of the given key."""
        return ((key * self.mult) >> self.shift) & self.mask
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word

    The key is multiplied by a random odd 64-bit number and the bits of the
    product starting at bit 32 (as many as a bucket index needs) are used
    as the bucket index.
    """

    def __init__(self, num_buckets):
        """Create a hash function for num_buckets buckets.

        Fails with AssertionError unless num_buckets is a power of two.
        """
        self.mask = num_buckets - 1
        # Zero must be rejected explicitly: 0 & (0 - 1) is also zero.
        assert num_buckets > 0 and (num_buckets & self.mask == 0), \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Two 32-bit draws form the 64-bit multiplier, upper half first.
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        """Return the bucket of the given key."""
        return ((key * self.mult) >> 32) & self.mask
class HashTable:
    """Hash table with linear probing

    Besides storing keys, the table counts how many buckets each operation
    had to probe, so that experiments can report the average and the maximum.
    Unused buckets hold None.
    """

    def __init__(self, hash_fun_factory, num_buckets):
        """Create an empty table with num_buckets buckets.

        hash_fun_factory(num_buckets) must return a function mapping a key
        to a bucket index.
        """
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket following b, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        ret = False
        steps = 1

        b = self._hash(key)
        # A completely full table has no unused bucket to stop at, so the
        # search is also cut off once every bucket has been visited.
        while self._table[b] is not None and steps <= self._num_buckets:
            if self._table[b] == key:
                ret = True
                break
            steps += 1
            b = self._next_bucket(b)

        self._update_counter(steps)
        return ret

    def insert(self, key):
        """Add the key in the table.

        Inserting a key which is already present leaves the table
        unchanged. Fails with AssertionError when the table is full.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        steps = 1

        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                break
            steps += 1
            b = self._next_bucket(b)
        else:
            # Reached an unused bucket: the key is new.
            self._table[b] = key
            # Keep the element count current, otherwise the full-table
            # check above could never trigger.
            self._size += 1

        self._update_counter(steps)

    def _update_counter(self, steps):
        """Account for one finished operation which probed `steps` buckets."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Forget all statistics collected so far."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Return the average number of buckets probed per operation since
        the last reset (zero when no operation was performed)."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Return the largest number of buckets probed by one operation
        since the last reset."""
        return self._max
def permute_list(l):
    """Randomly permute the list l in place.

    Every position except the last one is swapped with a randomly chosen
    position which is not before it (possibly with itself).
    """
    count = len(l)
    for pos in range(count - 1):
        remaining = count - pos
        other = pos + rng_next_u32() % remaining
        tmp = l[pos]
        l[pos] = l[other]
        l[other] = tmp
def usage_test(hash_fun_factory, max_usage = 90, retry = 40):
    """Measure how the cost of insertion depends on how full the table is.

    A table with 2**19 buckets is filled in steps of 1 % of its size with
    keys taken from a random permutation of 0 .. 2**19-1, up to max_usage
    percent. The whole process is repeated retry times with a fresh hash
    function. For every step, one line
    "<percent> <average> <standard deviation>" of the per-step average
    number of probed buckets is printed.
    """
    avg = [0.0] * max_usage
    avg2 = [0.0] * max_usage

    N = 2**19
    step_size = N // 100
    elements = list(range(N))

    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)

        for s in range(max_usage):
            H.reset_counter()
            for i in range(step_size):
                # Insert keys in the permuted order, not the bare index.
                H.insert(elements[s*step_size + i])
            step_avg = H.report_avg()
            avg[s] += step_avg
            avg2[s] += step_avg ** 2

    for i in range(max_usage):
        avg[i] /= retry
        avg2[i] /= retry
        # Rounding errors can push the variance slightly below zero,
        # which sqrt() would reject.
        variance = max(0.0, avg2[i] - avg[i]**2)
        std_dev = sqrt(variance)

        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
def
grow_test
(
hash_fun_factory
,
usage
=
60
,
retry
=
40
,
begin
=
7
,
end
=
21
):
for
n
in
range
(
begin
,
end
):
avg
=
0.0
avg2
=
0.0
N
=
2
**
n
elements
=
list
(
range
(
N
))
for
_
in
range
(
retry
):
H
=
HashTable
(
hash_fun_factory
,
N
)
permute_list
(
elements
)
for
x
in
elements
[:
N
*
usage
//
100
]:
H
.
insert
(
x
)
for
i
in
range
(
N
):
H
.
lookup
(
i
)
avg
+=
H
.
report_avg
()
avg2
+=
H