Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
datovky
assignments
Commits
e80b4475
Commit
e80b4475
authored
Nov 28, 2021
by
Tung Anh Vu
Browse files
Publish hash_experiment
parent
37755580
Changes
6
Hide whitespace changes
Inline
Side-by-side
09-hash_experiment/cpp/Makefile
0 → 100644
View file @
e80b4475
# Builds the C++ hash experiment and runs every test into out/t-<test>.
#
# Variables that may be overridden on the command line or in the environment:
#   INCLUDE     directory containing random.h
#   STUDENT_ID  passed to the experiment as its random seed argument

INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
STUDENT_ID ?= PLEASE_SET_STUDENT_ID

HASHFUNCS=ms-low ms-high poly-1 poly-2 tab

# If a recipe fails, remove its target. Without this, a failed experiment run
# leaves a truncated out/t-* file behind (the shell creates it for the
# redirection) and the next `make test` would consider it up to date.
.DELETE_ON_ERROR:

.PHONY: test
test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))

# $* is the test name (e.g. grow-ms-low); the output goes to out/t-<test>.
out/t-%: hash_experiment
	@mkdir -p out
	./hash_experiment $* $(STUDENT_ID) > $@

hash_experiment: hash_experiment.cpp $(INCLUDE)/random.h
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) hash_experiment.cpp -o $@

.PHONY: clean
clean:
	rm -f hash_experiment
	rm -rf out
09-hash_experiment/cpp/hash_experiment.cpp
0 → 100644
View file @
e80b4475
#include <algorithm>
#include <functional>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "random.h"
using namespace std;

// Random generator shared by all hash functions and tests in this file.
RandomGen rng(42);

// 32-bit unsigned integer: the type of keys and of hash values.
using uint = uint32_t;

// A hash function maps a key to a bucket index.
using HashFunction = function<uint(uint)>;

// A factory builds a hash function for a table with num_buckets buckets.
using HashFunctionFactory = function<HashFunction(unsigned num_buckets)>;
/*
 * Tabulation hashing.
 *
 * A 32-bit key is cut into four bytes. Byte number i selects one of the
 * 256 random words of the i-th table; the four selected words are XORed
 * together and the result is reduced modulo the number of buckets.
 */
class TabulationHash {
    unsigned num_buckets;
    // The four 256-entry tables stored back to back: table i occupies
    // indices 256*i .. 256*i + 255.
    vector<uint> tables;

    // Private: instances are only created through factory().
    TabulationHash(unsigned num_buckets)
        : num_buckets(num_buckets), tables(4 * 256)
    {
        for (size_t slot = 0; slot < tables.size(); slot++)
            tables[slot] = rng.next_u32();
    }

  public:
    // Create a freshly randomized tabulation hash for num_buckets buckets.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(TabulationHash(num_buckets));
    }

    // Hash key to a bucket index in 0 .. num_buckets - 1.
    uint operator()(uint key)
    {
        uint mixed = 0;
        for (unsigned part = 0; part < 4; part++) {
            uint byte = (key >> (8 * part)) & 0xff;
            // (part << 8) is the offset of table number `part`.
            mixed ^= tables[(part << 8) | byte];
        }
        return mixed % num_buckets;
    }
};
// Hash function evaluating a random polynomial of the given degree modulo
// a prime (2^31 - 1 by default); the value is then reduced modulo the
// number of buckets.
template <int degree, uint prime = 2147483647>
class PolynomialHash {
    unsigned num_buckets;
    vector<uint> coefs;  // degree + 1 random coefficients, leading one first

    // Private: instances are only created through factory().
    PolynomialHash(unsigned num_buckets)
        : num_buckets(num_buckets), coefs(degree + 1)
    {
        for (size_t i = 0; i < coefs.size(); i++)
            coefs[i] = rng.next_u32();
    }

  public:
    // Create a polynomial hash with fresh random coefficients.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(PolynomialHash(num_buckets));
    }

    // Hash key to a bucket index in 0 .. num_buckets - 1.
    uint operator()(uint key)
    {
        // Horner's scheme. The accumulator is 64-bit so that the product
        // of two 32-bit values plus a coefficient is not truncated.
        uint64_t value = 0;
        for (size_t i = 0; i < coefs.size(); i++)
            value = (value * key + coefs[i]) % prime;
        return static_cast<uint>(value % num_buckets);
    }
};
// Polynomial hashing with a polynomial of degree 1.
using LinearHash = PolynomialHash<1>;

// Polynomial hashing with a polynomial of degree 2.
using QuadraticHash = PolynomialHash<2>;
// Multiply-shift hash function taking top bits of 32-bit word.
//
// The key is multiplied by a random odd 32-bit constant (the product wraps
// modulo 2^32) and the hash consists of the log2(num_buckets) most
// significant bits of that product.
class MultiplyShiftLowHash {
    uint mult;      // random odd multiplier
    uint mask;      // num_buckets - 1
    int shift = 0;  // number of leading zero bits of mask

    // Private: instances are only created through factory().
    // Throws runtime_error unless num_buckets is a power of two.
    MultiplyShiftLowHash(unsigned num_buckets)
    {
        mult = rng.next_u32() | 0x1;
        mask = num_buckets - 1;
        // Zero has to be rejected explicitly: mask wraps to all ones and
        // (mask & 0) == 0 would otherwise let it through.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftLowHash: num_buckets must be power of 2");

        // Count the leading zero bits of mask. For num_buckets == 1 the
        // mask is 0 and has no set bit to find, so the tmp != 0 guard is
        // what makes the loop stop; shift then stays 0 and, because the
        // mask is 0, every key hashes to bucket 0.
        for (unsigned tmp = mask; tmp != 0 && (tmp & 0x80000000U) == 0; tmp <<= 1)
            shift++;
    }

  public:
    // Create a multiply-shift hash with a fresh random multiplier.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(MultiplyShiftLowHash(num_buckets));
    }

    // Hash key to a bucket index in 0 .. num_buckets - 1.
    uint operator()(uint key)
    {
        return ((key * mult) >> shift) & mask;
    }
};
// Multiply-shift hash function taking low bits of upper half of 64-bit word.
//
// The key is multiplied by a random odd 64-bit constant and the hash
// consists of the lowest log2(num_buckets) bits of the upper 32-bit half
// of the 64-bit product.
class MultiplyShiftHighHash {
    uint mask;      // num_buckets - 1
    uint64_t mult;  // random odd multiplier

    // Private: instances are only created through factory().
    // Throws runtime_error unless num_buckets is a power of two.
    MultiplyShiftHighHash(unsigned num_buckets)
    {
        mult = rng.next_u64() | 0x1;
        mask = num_buckets - 1;
        // Zero has to be rejected explicitly: mask wraps to all ones and
        // (mask & 0) == 0 would otherwise let it through.
        if (num_buckets == 0 || (mask & num_buckets))
            throw runtime_error("MultiplyShiftHighHash: num_buckets must be power of 2");
    }

  public:
    // Create a multiply-shift hash with a fresh random multiplier.
    static HashFunction factory(unsigned num_buckets)
    {
        return HashFunction(MultiplyShiftHighHash(num_buckets));
    }

    // Hash key to a bucket index in 0 .. num_buckets - 1.
    uint operator()(uint key)
    {
        // key is widened to 64 bits by the multiplication with mult.
        return ((key * mult) >> 32) & mask;
    }
};
// Hash table with linear probing.
//
// Stores a set of 32-bit keys and counts how many buckets each operation
// inspects, so that experiments can report the average and the maximum
// number of probes per operation.
class HashTable {
    HashFunction hash;
    vector<uint> table;
    unsigned size = 0;  // number of keys stored

    // Statistics gathered since the last reset_counter().
    unsigned ops;    // operations performed
    unsigned max_;   // most buckets inspected by a single operation
    uint64_t steps;  // buckets inspected in total

  public:
    // We reserve one integer to mark unused buckets. This integer
    // cannot be stored in the table.
    static constexpr uint UNUSED = ~((uint)0);

    // Create an empty table with num_buckets buckets whose hash function
    // is obtained from factory(num_buckets).
    HashTable(const HashFunctionFactory &factory, unsigned num_buckets)
        // The unary plus turns UNUSED into a temporary, so the vector
        // constructor does not bind a reference to the static member
        // itself (which has no out-of-class definition in this file).
        : hash(factory(num_buckets)), table(num_buckets, +UNUSED)
    {
        reset_counter();
    }

    // Check whether key is present in the table.
    // Throws runtime_error when key is UNUSED.
    bool lookup(uint key)
    {
        if (key == UNUSED)
            throw runtime_error("Cannot lookup UNUSED");

        bool found = false;
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED) {
            if (table[b] == key) {
                found = true;
                break;
            }
            // Every bucket has been inspected: the table is completely
            // full and the key is absent. Without this check the search
            // would cycle forever, as no UNUSED bucket exists to stop it.
            if (probes >= table.size())
                break;
            probes++;
            b = next_bucket(b);
        }

        update_counter(probes);
        return found;
    }

    // Add the key in the table; inserting a key already present only
    // updates the statistics.
    // Throws runtime_error when key is UNUSED or when the table is full.
    void insert(uint key)
    {
        if (key == UNUSED)
            throw runtime_error("Cannot insert UNUSED");
        if (size >= table.size())
            throw runtime_error("Insert: Table is full");

        // At least one bucket is UNUSED here, so the probing terminates.
        unsigned probes = 1;
        uint b = hash(key);
        while (table[b] != UNUSED && table[b] != key) {
            probes++;
            b = next_bucket(b);
        }

        if (table[b] == UNUSED) {
            table[b] = key;
            size++;
        }
        update_counter(probes);
    }

    // Forget all statistics gathered so far.
    void reset_counter()
    {
        ops = steps = max_ = 0;
    }

    // Average number of buckets inspected per operation since the last
    // reset (0 when no operation was performed).
    double report_avg()
    {
        return ((double)steps) / max(1U, ops);
    }

    // Largest number of buckets inspected by one operation since the last
    // reset.
    double report_max()
    {
        return max_;
    }

  private:
    // Record one finished operation which inspected `steps` buckets.
    void update_counter(unsigned steps)
    {
        ops++;
        this->steps += steps;
        max_ = max(steps, max_);
    }

    // Bucket following b, wrapping around at the end of the table.
    unsigned next_bucket(unsigned b)
    {
        return (b + 1) % table.size();
    }
};
// Measure the cost of insertions as a table of 2^20 buckets fills up.
//
// The keys 0 .. N-1 are inserted in random order in steps of N/100 keys.
// For every step s < max_usage the average number of buckets inspected per
// insert is recorded; the whole experiment is repeated `retry` times with
// a new hash function and a new key order. For each step one line
// "<step> <mean> <standard deviation>" is printed to standard output.
void usage_test(HashFunctionFactory factory, int max_usage = 90, int retry = 40)
{
    vector<double> avg(max_usage, 0.0);   // sum of per-step averages
    vector<double> avg2(max_usage, 0.0);  // sum of their squares

    unsigned N = 1 << 20;
    unsigned step_size = N / 100;

    vector<uint> elements(N);
    for (unsigned i = 0; i < N; i++)
        elements[i] = i;

    for (int t = 0; t < retry; t++) {
        HashTable H(factory, N);

        // Random permutation of the keys.
        for (unsigned i = 0; i < N - 1; i++)
            swap(elements[i], elements[i + (rng.next_u32() % (N - i))]);

        for (int s = 0; s < max_usage; s++) {
            H.reset_counter();
            for (unsigned i = 0; i < step_size; i++)
                H.insert(elements[s * step_size + i]);

            const double step_avg = H.report_avg();
            avg[s] += step_avg;
            avg2[s] += step_avg * step_avg;
        }
    }

    for (int i = 0; i < max_usage; i++) {
        avg[i] /= retry;
        avg2[i] /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative because of
        // rounding when the samples are (almost) equal; clamp it so that
        // sqrt() does not produce NaN.
        double std_dev = sqrt(max(0.0, avg2[i] - avg[i] * avg[i]));

        printf("%i %.03lf %.03lf\n", i + 1, avg[i], std_dev);
    }
}
// Measure the cost of lookups for tables of growing size.
//
// For every n in begin .. end-1 a table with N = 2^n buckets is filled to
// `usage` percent with randomly chosen keys from 0 .. N-1, and then all
// keys 0 .. N-1 are looked up. The average number of buckets inspected per
// operation (inserts and lookups together, as the counters are not reset
// in between) is collected over `retry` repetitions. For each size one
// line "<N> <mean> <standard deviation>" is printed to standard output.
void grow_test(HashFunctionFactory factory, int usage = 60, int retry = 40,
               int begin = 7, int end = 22)
{
    for (int n = begin; n < end; n++) {
        double avg = 0;   // sum of per-run averages
        double avg2 = 0;  // sum of their squares
        unsigned N = 1 << n;

        vector<uint> elements(N);
        for (unsigned i = 0; i < N; i++)
            elements[i] = i;

        // Number of keys to insert; computed in 64 bits so that
        // N * usage cannot overflow.
        const uint64_t to_insert = ((uint64_t)N) * usage / 100;

        for (int t = 0; t < retry; t++) {
            HashTable H(factory, N);

            // Random permutation of the keys.
            for (unsigned i = 0; i < N - 1; i++)
                swap(elements[i], elements[i + (rng.next_u32() % (N - i))]);

            for (unsigned i = 0; i < to_insert; i++)
                H.insert(elements[i]);

            for (unsigned i = 0; i < N; i++)
                H.lookup(i);

            const double run_avg = H.report_avg();
            avg += run_avg;
            avg2 += run_avg * run_avg;
        }

        avg /= retry;
        avg2 /= retry;
        // E[x^2] - E[x]^2 can come out slightly negative because of
        // rounding when the samples are (almost) equal; clamp it so that
        // sqrt() does not produce NaN.
        double std_dev = sqrt(max(0.0, avg2 - avg * avg));

        // N is unsigned, hence %u.
        printf("%u %.03lf %.03lf\n", N, avg, std_dev);
    }
}
int
main
(
int
argc
,
char
**
argv
)
{
vector
<
pair
<
string
,
HashFunctionFactory
>>
grow_tests
=
{
{
"grow-ms-low"
,
MultiplyShiftLowHash
::
factory
},
{
"grow-ms-high"
,
MultiplyShiftHighHash
::
factory
},
{
"grow-poly-1"
,
LinearHash
::
factory
},
{
"grow-poly-2"
,
QuadraticHash
::
factory
},
{
"grow-tab"
,
TabulationHash
::
factory
}
};
vector
<
pair
<
string
,
HashFunctionFactory
>>
usage_tests
=
{
{
"usage-ms-low"
,
MultiplyShiftLowHash
::
factory
},
{
"usage-ms-high"
,
MultiplyShiftHighHash
::
factory
},
{
"usage-poly-1"
,
LinearHash
::
factory
},
{
"usage-poly-2"
,
QuadraticHash
::
factory
},
{
"usage-tab"
,
TabulationHash
::
factory
}
};
if
(
argc
!=
3
)
goto
fail
;
rng
=
RandomGen
(
atoi
(
argv
[
2
]));
for
(
auto
t
:
grow_tests
)
{
if
(
t
.
first
==
argv
[
1
])
{
grow_test
(
t
.
second
);
return
0
;
}
}
for
(
auto
t
:
usage_tests
)
{
if
(
t
.
first
==
argv
[
1
])
{
usage_test
(
t
.
second
);
return
0
;
}
}
fail:
printf
(
"Usage: %s <test> <seed>
\n
Available tests are:"
,
argv
[
0
]);
for
(
auto
t
:
grow_tests
)
printf
(
" %s"
,
t
.
first
.
c_str
());
for
(
auto
t
:
usage_tests
)
printf
(
" %s"
,
t
.
first
.
c_str
());
return
1
;
}
09-hash_experiment/cpp/random.h
0 → 100644
View file @
e80b4475
#ifndef DS1_RANDOM_H
#define DS1_RANDOM_H
#include <cstdint>
/*
* This is the xoroshiro128+ random generator, designed in 2016 by David Blackman
* and Sebastiano Vigna, distributed under the CC-0 license. For more details,
* see http://vigna.di.unimi.it/xorshift/.
*
* Rewritten to C++ by Martin Mares, also placed under CC-0.
*/
class RandomGen {
    uint64_t state[2];

    // Rotate the 64-bit word x to the left by k bits.
    // Callers in this class pass 0 < k < 64 only.
    uint64_t rotl(uint64_t x, int k)
    {
        const uint64_t upper = x << k;
        const uint64_t lower = x >> (64 - k);
        return upper | lower;
    }

  public:
    // Initialize the generator, set its seed and warm it up
    // by discarding the first 100 outputs.
    RandomGen(unsigned int seed)
    {
        // Both expressions are evaluated in unsigned int arithmetic
        // before being widened to 64 bits.
        state[0] = seed * 0xdeadbeef;
        state[1] = seed ^ 0xc0de1234;

        int remaining = 100;
        while (remaining-- > 0)
            next_u64();
    }

    // Generate a random 64-bit number.
    uint64_t next_u64(void)
    {
        const uint64_t first = state[0];
        const uint64_t second = state[1];
        const uint64_t mixed = first ^ second;

        state[0] = rotl(first, 55) ^ mixed ^ (mixed << 14);
        state[1] = rotl(mixed, 36);

        // The output is the sum of the two state words before the update.
        return first + second;
    }

    // Generate a random 32-bit number: bits 11 .. 42 of a 64-bit output.
    uint32_t next_u32(void)
    {
        return static_cast<uint32_t>(next_u64() >> 11);
    }

    // Generate a number between 0 and range-1.
    unsigned int next_range(unsigned int range)
    {
        /*
         * This is not perfectly uniform, unless the range is a power of two.
         * However, for 64-bit random values and 32-bit ranges, the bias is
         * insignificant.
         */
        return static_cast<unsigned int>(next_u64() % range);
    }
};
#endif
09-hash_experiment/python/Makefile
0 → 100644
View file @
e80b4475
# Runs every test of the Python hash experiment into out/t-<test>.
#
# STUDENT_ID is passed to the experiment as its second argument; override it
# on the command line or in the environment.

STUDENT_ID ?= PLEASE_SET_STUDENT_ID

HASHFUNCS=ms-low ms-high poly-1 poly-2 tab

# If a recipe fails, remove its target. Without this, a failed experiment run
# leaves a truncated out/t-* file behind (the shell creates it for the
# redirection) and the next `make test` would consider it up to date.
.DELETE_ON_ERROR:

.PHONY: test
test: $(addprefix out/t-grow-, $(HASHFUNCS)) $(addprefix out/t-usage-, $(HASHFUNCS))

# $* is the test name (e.g. grow-ms-low); the output goes to out/t-<test>.
out/t-%: hash_experiment.py
	@mkdir -p out
	./hash_experiment.py $* $(STUDENT_ID) > $@

.PHONY: clean
clean:
	rm -rf out
09-hash_experiment/python/hash_experiment.py
0 → 100644
View file @
e80b4475
#!/usr/bin/env python3
import
random
,
sys
from
math
import
sqrt
# Thin wrappers around the `random` module, so that another random generator
# can be substituted in a single place.
def rng_init(x):
    """Seed the random generator with x."""
    return random.seed(x)


def rng_next_u32():
    """Return a random integer from the inclusive range 0 .. 2**32 - 1."""
    return random.randint(0, 2**32 - 1)
class TabulationHash:
    """Hash function for hashing by tabulation.

    A 32-bit key is cut into four bytes. Byte number i selects one of the
    256 random values of the i-th table; the four selected values are XORed
    together and the result is reduced modulo the number of buckets.
    """

    def __init__(self, num_buckets):
        """Fill four tables of 256 random 32-bit values each."""
        self.num_buckets = num_buckets
        self.tables = [[rng_next_u32() for _ in range(256)] for _ in range(4)]

    def __call__(self, key):
        """Return the bucket index of key, in 0 .. num_buckets - 1."""
        mixed = 0
        for part, table in enumerate(self.tables):
            mixed ^= table[(key >> (8 * part)) & 0xff]
        return mixed % self.num_buckets
class PolynomialHash:
    """Hash function using polynomial modulo a prime."""

    def __init__(self, num_buckets, degree, prime=2147483647):
        """Draw degree + 1 random 32-bit coefficients.

        The default prime is 2**31 - 1.
        """
        self.num_buckets = num_buckets
        self.prime = prime
        coefficients = []
        for _ in range(degree + 1):
            coefficients.append(rng_next_u32())
        self.coefs = coefficients

    def __call__(self, key):
        """Return the bucket index of key, in 0 .. num_buckets - 1."""
        modulus = self.prime
        # Horner's scheme: the first coefficient is the leading one.
        value = 0
        for coefficient in self.coefs:
            value = (value * key + coefficient) % modulus
        return value % self.num_buckets
def LinearHash(num_buckets):
    """Create a polynomial hash function of degree 1."""
    return PolynomialHash(num_buckets, 1)


def QuadraticHash(num_buckets):
    """Create a polynomial hash function of degree 2."""
    return PolynomialHash(num_buckets, 2)
class MultiplyShiftLowHash:
    """Multiply-shift hash function taking top bits of 32-bit word

    The key is multiplied by a random odd 32-bit constant and the hash
    consists of the log2(num_buckets) most significant bits of the low
    32-bit word of the product.
    """

    def __init__(self, num_buckets):
        """Draw the multiplier and precompute the shift and the mask.

        Raises AssertionError unless num_buckets is a power of 2.
        """
        self.mask = num_buckets - 1
        # Zero has to be rejected explicitly: 0 & -1 == 0 would otherwise
        # let it through.
        assert num_buckets > 0 and (num_buckets & self.mask == 0), \
            "MultiplyShiftLowHash: num_buckets must be power of 2"
        self.mult = rng_next_u32() | 0x1

        # shift = number of leading zero bits of the mask taken as a 32-bit
        # word. For num_buckets == 1 the mask is 0 and has no set bit to
        # find, so the `tmp != 0` guard is what makes the loop stop; shift
        # then stays 0 and, because the mask is 0, every key hashes to 0.
        self.shift = 0
        tmp = num_buckets - 1
        while tmp != 0 and (0x80000000 & tmp) == 0:
            tmp <<= 1
            self.shift += 1

    def __call__(self, key):
        """Return the bucket index of key, in 0 .. num_buckets - 1."""
        return ((key * self.mult) >> self.shift) & self.mask
class MultiplyShiftHighHash:
    """Multiply-shift hash function taking low bits of upper half of 64-bit word

    The key is multiplied by a random odd 64-bit constant and the hash
    consists of the log2(num_buckets) bits of the product starting at
    bit 32.
    """

    def __init__(self, num_buckets):
        """Draw the 64-bit multiplier and precompute the mask.

        Raises AssertionError unless num_buckets is a power of 2.
        """
        self.mask = num_buckets - 1
        # Zero has to be rejected explicitly: 0 & -1 == 0 would otherwise
        # let it through. The message names this class (it used to say
        # MultiplyShiftLowHash).
        assert num_buckets > 0 and (num_buckets & self.mask == 0), \
            "MultiplyShiftHighHash: num_buckets must be power of 2"
        # Two 32-bit draws form the 64-bit multiplier, high half first.
        self.mult = (rng_next_u32() << 32) | rng_next_u32() | 0x1

    def __call__(self, key):
        """Return the bucket index of key, in 0 .. num_buckets - 1."""
        return ((key * self.mult) >> 32) & self.mask
class HashTable:
    """Hash table with linear probing

    Stores a set of keys and counts how many buckets each operation
    inspects, so that experiments can report the average and the maximum
    number of probes per operation. None marks an unused bucket.
    """

    def __init__(self, hash_fun_factory, num_buckets):
        """Create an empty table with num_buckets buckets.

        The hash function is obtained from hash_fun_factory(num_buckets).
        """
        self._hash = hash_fun_factory(num_buckets)
        self._num_buckets = num_buckets
        self._table = [None] * num_buckets
        self._size = 0  # number of keys stored
        self.reset_counter()

    def _next_bucket(self, b):
        """Return the bucket following b, wrapping around at the end."""
        return (b + 1) % self._num_buckets

    def lookup(self, key):
        """Check whether key is present in the table."""
        ret = False
        steps = 1

        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                ret = True
                break
            # Every bucket has been inspected: the table is completely full
            # and the key is absent. Without this check the search would
            # cycle forever, as no unused bucket exists to stop it.
            if steps >= self._num_buckets:
                break
            steps += 1
            b = self._next_bucket(b)

        self._update_counter(steps)
        return ret

    def insert(self, key):
        """Add the key in the table.

        Inserting a key already present only updates the statistics.
        Raises AssertionError when the table is full.
        """
        assert self._size < self._num_buckets, "Cannot insert into a full table."
        steps = 1

        # At least one bucket is unused here, so the probing terminates.
        b = self._hash(key)
        while self._table[b] is not None:
            if self._table[b] == key:
                break
            steps += 1
            b = self._next_bucket(b)
        else:
            # Reached an unused bucket without finding the key: store it.
            self._table[b] = key
            # Keep the fill count current; the assertion above relies on it.
            self._size += 1

        self._update_counter(steps)

    def _update_counter(self, steps):
        """Record one finished operation which inspected `steps` buckets."""
        self._ops += 1
        self._steps += steps
        self._max = max(self._max, steps)

    def reset_counter(self):
        """Forget all statistics gathered so far."""
        self._steps = 0
        self._ops = 0
        self._max = 0

    def report_avg(self):
        """Average number of buckets inspected per operation since the last reset."""
        return self._steps / max(1, self._ops)

    def report_max(self):
        """Largest number of buckets inspected by one operation since the last reset."""
        return self._max
def permute_list(l):
    """Randomly permute the list l in place.

    Position after position, the element is swapped with one chosen at
    random from the not yet fixed rest of the list (itself included).
    """
    length = len(l)
    for src in range(length - 1):
        offset = rng_next_u32() % (length - src)
        dst = src + offset
        l[src], l[dst] = l[dst], l[src]
def usage_test(hash_fun_factory, max_usage=90, retry=40):
    """Measure the cost of insertions as a table of 2**19 buckets fills up.

    The keys 0 .. N-1 are inserted in random order in steps of N // 100
    keys. For every step s < max_usage the average number of buckets
    inspected per insert is recorded; the whole experiment is repeated
    `retry` times with a new hash function and a new key order. For each
    step one line "<step> <mean> <standard deviation>" is printed.
    """
    avg = [0.0] * max_usage   # sum of per-step averages
    avg2 = [0.0] * max_usage  # sum of their squares

    N = 2**19
    step_size = N // 100
    elements = list(range(N))
    for _ in range(retry):
        H = HashTable(hash_fun_factory, N)
        permute_list(elements)

        for s in range(max_usage):
            H.reset_counter()
            start = s * step_size
            # Insert the keys in the shuffled order. Inserting
            # s*step_size + i directly would ignore the permutation
            # computed above and insert the keys sequentially.
            for key in elements[start:start + step_size]:
                H.insert(key)
            step_avg = H.report_avg()
            avg[s] += step_avg
            avg2[s] += step_avg ** 2

    for i in range(max_usage):
        avg[i] /= retry
        avg2[i] /= retry
        # E[x^2] - E[x]^2 can come out slightly negative because of
        # rounding when the samples are (almost) equal; sqrt() would then
        # raise ValueError, so clamp it to zero.
        std_dev = sqrt(max(0.0, avg2[i] - avg[i] ** 2))

        print("%i %.03f %.03f" % ((i + 1), avg[i], std_dev))
def
grow_test
(
hash_fun_factory
,
usage
=
60
,
retry
=
40
,
begin
=
7
,
end
=
21
):
for
n
in
range
(
begin
,
end
):
avg
=
0.0
avg2
=
0.0
N
=
2
**
n
elements
=
list
(
range
(
N
))
for
_
in
range
(
retry
):
H
=
HashTable
(
hash_fun_factory
,
N
)
permute_list
(
elements
)
for
x
in
elements
[:
N
*
usage
//
100
]:
H
.
insert
(
x
)
for
i
in
range
(
N
):
H
.
lookup
(
i
)
avg
+=
H
.
report_avg
()
avg2
+=