\ifx\chapter\undefined
\input adsmac.tex
\singlechapter{6}
\fi
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Terminology:
% • Things in the universe are called elements.
% • Elements stored in the data structure (or searched for etc.) are items.
% • Hash function maps elements to buckets.
% • In open addressing, we call the buckets cells (of the array).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter[hash]{Hashing}
\section{Systems of hash functions}
\nota{
\tightlist{o}
\:The \em{universe}~$\Uu$. Usually, the universe is a~set of integers
$\{0,\ldots,U-1\}$, which will be denoted by $[U]$.
\:The set of \em{buckets}~${\cal B} = [m]$.
\:The set $X\subset\Uu$ of $n$~items stored in the data structure.
\:Hash function $h: \Uu \rightarrow {\cal B}$.
\endlist
}
\defn{Let $\cal H$ be a~family of functions from~$\Uu$ to~$[m]$.
We say that the family is \em{$c$-universal} for some $c>0$ if
for every pair $x,y$ of distinct elements of~$\Uu$ we have
$$
\Prsub{h\in{\cal H}} [h(x) = h(y)] \le {c\over m}.
$$
In other words, if we pick a~hash function~$h$ uniformly at random from~$\cal H$,
the probability that $x$ and~$y$ collide is at most $c$-times more than for
a~completely random function~$h$.
Occasionally, we are not interested in the concrete value of~$c$,
so we simply say that the family is \em{universal.}
}
\note{
We will assume that a~function can be chosen from the family uniformly
at random in constant time. Once chosen, it can be evaluated for any~$x$
in constant time. Typically, we define a~single parametrized function $h_a(x)$
and let the family $\cal H$ consist of all~$h_a$ for all possible choices of
the parameter~$a$. Picking a~random $h\in{\cal H}$ is therefore implemented
by picking a~random~$a$. Technically, this makes~$\cal H$ a~multi-set, since
different values of~$a$ may lead to the same function~$h$.
}
\theorem{Let $\cal H$ be a~$c$-universal family of functions from~$\Uu$ to~$[m]$,
$X=\{x_1,\ldots,x_n\} \subseteq \Uu$ a~set of items stored in the data structure,
and $y\in\Uu\setminus X$ another item not stored in the data structure. Then
we have
$$
\E_{h\in{\cal H}} [\#i: h(x_i) = h(y)] \le {cn\over m}.
$$
That is, the expected number of items that collide with~$y$ is at most $cn/m$.
}
\proof
Let~$h$ be a~function picked uniformly at random from~$\cal H$.
We introduce indicator random variables
$$
A_i = \cases{
1 & \hbox{if $h(x_i) = h(y)$,} \cr
0 & \hbox{otherwise.} \cr
}
$$
The expectation of a~zero-one random variable is easy to calculate:
$\E[A_i] = 0\cdot\Pr[A_i=0] + 1\cdot\Pr[A_i=1] = \Pr[A_i=1] = \Pr[h(x_i) = h(y)]$.
Because $\cal H$ is $c$-universal, this probability is at most $c/m$.
The theorem asks for the expectation of the random variable~$A$ which counts the indices~$i$
such that $h(x_i) = h(y)$. This variable is a~sum of the indicators~$A_i$.
By linearity of expectation, we have
$\E[A] = \E[\sum_i A_i] = \sum_i \E[A_i] \le \sum_i c/m = cn/m$.
\qed
\corrn{Complexity of hashing with chaining}{
Consider a~hash table with chaining which uses $m$~buckets and a~hash function~$h$
picked at random from a~$c$-universal family. Suppose that the hash table contains
items $x_1,\ldots,x_n$.
\tightlist{o}
\:Unsuccessful search for an~item $y$~distinct from all~$x_i$
visits all items in the bucket $h(y)$. By the previous theorem,
the expected number of such items is at most $cn/m$.
\:Insertion of a~new item~$y$ to its bucket takes constant time, but we have to verify
that the item was not present yet. If it was not, this is equivalent
to unsuccessful search.
\:In case of a~successful search for~$x_i$, all items visited in $x_i$'s bucket
were there when $x_i$~was inserted (this assumes that we always add new
items at the end of the chain). So the expected time complexity of the
search is bounded by the expected time complexity of the previous
insert of~$x_i$.
\:Unsuccessful insertion (the item is already there) takes the same time
as successful search.
\:Finally, deletion takes the same time as search (either successful or unsuccessful).
\endlist
Hence, if the number of buckets~$m$ is $\Omega(n)$, the expected time complexity
of all operations is constant. If we do not know in advance how many items will be inserted,
we can resize the hash table and re-hash all items whenever~$n$ grows too much. This is similar
to the flexible arrays from section \secref{amortsec}, and likewise we can prove that
the amortized cost of resizing is constant.
}
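To make the operations above concrete, the following sketch shows a~hash table with
chaining in Python. It is only an~illustration: the hash function~$h$ is passed in as
a~parameter and it is assumed to be drawn at random from a~universal family, such as the
ones constructed below; resizing is omitted and all names are illustrative.

\begtt
class ChainingTable:
    """Hash table with chaining. The function h maps items to range(m)."""

    def __init__(self, m, h):
        self.m = m
        self.h = h
        self.buckets = [[] for _ in range(m)]

    def find(self, x):
        return x in self.buckets[self.h(x)]

    def insert(self, x):
        bucket = self.buckets[self.h(x)]
        if x not in bucket:          # unsuccessful search, then append to the chain
            bucket.append(x)

    def delete(self, x):
        bucket = self.buckets[self.h(x)]
        if x in bucket:
            bucket.remove(x)
\endtt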
\defn{Let $\cal H$ be a~family of functions from~$\Uu$ to~$[m]$.
The family is \em{$(k,c)$-independent} for integer $k\ge 1$ and real $c>0$ iff
for every $k$-tuple $x_1,\ldots,x_k$ of distinct elements of~$\Uu$
and every $k$-tuple $a_1,\ldots,a_k$ of buckets in~$[m]$, we have
$$
\Prsub{h\in{\cal H}} [h(x_1) = a_1 \land \ldots \land h(x_k) = a_k] \le {c\over m^k}.
$$
That is, if we pick a~hash function~$h$ uniformly at random from~$\cal H$,
the probability that the given items are mapped to the given buckets is
at most $c$-times more than for a~completely random function~$h$.
Sometimes, we do not care about the exact value of~$c$, so we simply
say that a~family is \em{$k$-independent,} if it is $(k,c)$-independent for some~$c$.
}
\obs{
\tightlist{n.}
\:If $\cal H$ is $(k,c)$-independent for $k>1$, then it is also $(k-1,c)$-independent.
\:If $\cal H$ is $(2,c)$-independent, then it is $c$-universal.
\endlist
}
\subsection{Constructions from linear congruence}
\defn{For any prime~$p$ and $m\le p$, we define the family of linear functions
${\cal L} = \{ h_{a,b} \mid a,b\in [p] \}$ from $[p]$ to $[m]$, where
$h_{a,b}(x) = ((ax+b) \bmod p) \bmod m$.
}
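Picking a~random function from~$\cal L$ is straightforward. A~minimal sketch in Python follows;
the concrete prime and the number of buckets in the example are illustrative assumptions.

\begtt
import random

def random_linear_hash(p, m):
    """Pick h_{a,b} from the family L: x -> ((a*x + b) mod p) mod m."""
    assert m <= p
    a = random.randrange(p)
    b = random.randrange(p)
    return lambda x: ((a * x + b) % p) % m

# Example: hash elements of [p] into 1024 buckets, p being the Mersenne prime 2^61 - 1.
h = random_linear_hash(p=2**61 - 1, m=1024)
\endtt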
\theorem{The family~$\cal L$ is 2-universal.}
\proof
Let $x,y$ be two distinct numbers in $[p]$.
First, we will examine the behavior of linear functions taken modulo~$p$
without the final modulo~$m$.
For an arbitrary pair $(a,b)\in [p]^2$ of parameters, we define
$$\eqalign{
r & = (ax+b) \bmod p, \cr
s & = (ay+b) \bmod p. \cr
}$$
This maps pairs $(a,b)\in [p]^2$ to pairs $(r,s)\in [p]^2$.
We can verify that this mapping is bijective: for given $(r,s)$,
we can solve the above equations for $(a,b)$. Since integers modulo~$p$
form a~field, the system of linear equations (which is regular thanks to $x\ne y$)
has a~unique solution.
So we have a~bijection between $(a,b)$ and $(r,s)$. This makes picking $(a,b)$
uniformly at random equivalent to picking $(r,s)$ uniformly at random.
Now, we resurrect the final modulo~$m$.
Let us count \em{bad pairs} $(a,b)$, for which we have $h_{a,b}(x) = h_{a,b}(y)$.
These correspond to bad pairs $(r,s)$ such that $r\equiv s$ modulo~$m$.
Now we fix~$r$ and count~$s$ for which $(r,s)$ is bad.
If we partition the set $[p]$ into $m$-tuples, we get $\lceil p/m\rceil$ $m$-tuples,
the last of which may be incomplete. A~complete $m$-tuple contains exactly one number
congruent to~$r$; the final one contains one or zero such numbers.
Therefore for each~$r$, we have at most $\lceil p/m\rceil \le (p+m-1)/m$ bad pairs.
Since $m\le p$, this is at most $2p/m$.
Altogether, we have~$p$ choices for~$r$, so the number of all bad pairs is at most
$2p^2/m$.
As there are $p^2$ pairs in total, the probability that a~pair $(r,s)$ is bad is at
most $2/m$.
Since there is a~bijection between pairs $(a,b)$ and $(r,s)$, the probability of
a~bad pair $(a,b)$ is the same. The family~$\cal L$ is therefore 2-universal.
\qed
Surprisingly, a~slight modification of the family yields even 1-universality.
\defn{For any prime~$p$ and $m\le p$, we define the family of linear functions
${\cal L}' = \{ h_{a,b} \mid a,b\in[p] \land a\ne 0\}$ from $[p]$ to $[m]$, where
$h_{a,b}(x) = ((ax+b) \bmod p) \bmod m$.
}
\theorem{The family~${\cal L'}$ is 1-universal.}
\proof
We shall modify the previous proof. The bijection between pairs $(a,b)$
and $(r,s)$ stays. The requirement that $a\ne 0$ translates to $r\ne s$.
We therefore have $p(p-1)$ pairs $(r,s)$. How many of those are bad?
For a~single~$r$, we have one less bad choice of~$s$ (in the tuple
containing~$r$), so at most $\lceil p/m\rceil - 1 \le (p+m-1)/m - 1 = (p-1)/m$
bad choices. For all~$r$ together, we have at most $p(p-1)/m$ bad pairs,
so the probability that a~random pair is bad is at most~$1/m$.
\qed
Now, we turn back to the original linear family~$\cal L$.
\theorem{The family~${\cal L}$ is $(2,4)$-independent.}
\proof
We fix $x,y\in [p]$ distinct and buckets $i,j\in [m]$. To verify $(2,4)$-independence,
we need to prove that $\Pr[h_{a,b}(x) = i \land h_{a,b}(y) = j] \le 4/m^2$ for a~random
choice of parameters $(a,b)$.
As in the proof of universality of~$\cal L$, we consider the bijection between
pairs $(a,b)$ and $(r,s)$. Hence the event $h_{a,b}(x) = i$ is equivalent to
$r\equiv i$ modulo~$m$ and $h_{a,b}(y) = j$ is the same as $s\equiv j$. As the
choices of~$r$ and~$s$ are independent, we can consider each event separately.
Let us count the values of~$r$ which are congruent to~$i$ modulo~$m$.
If we divide the set $[p]$ to $m$-tuples, each $m$-tuple contains at most one
such~$r$. As we have $\lceil p/m\rceil$ $m$-tuples and $p$~possible values of~$r$,
the probability that $r\equiv i$ is at most $\lceil p/m\rceil / p
\le (p+m-1)/mp$. As $m\le p$, we have $p+m-1 \le 2p$, so the probability
is at most $2p/mp = 2/m$.
Similarly, the probability that $s\equiv j$ is at most $2/m$. Since the events
are independent, the probability of their conjunction is at most $4/m^2$. This
is what we needed for $(2,4)$-independence.
\qed
\subsection{Composition of function families}
The proofs of 2-universality and (2,4)-independence of the family~$\cal L$
have a~clear structure: they start with a~$(2,1)$-independent family from
a~field to the same field. Then they reduce the family's functions modulo~$m$
and prove that independence/universality degrades only slightly. This approach
can be easily generalized.
\lemmaxn{M}{composition modulo~m}{
Let~$\cal H$ be a~$(2,c)$-independent family of functions from~$\Uu$ to $[r]$
and $m<r$. Then the family ${\cal H} \bmod m = \{ h \bmod m \mid h \in {\cal H} \}$
is $2c$-universal and $(2,4c)$-independent.
}
\proof
Consider universality first. For two given items $x_1\ne x_2$, we should show that
$\Prsub{h\in{\cal H}}[h(x_1) = h(x_2)] \le 2c/m$. The event $h(x_1) = h(x_2)$ can be
written as a~union of disjoint events $h(x_1)=i_1 \land h(x_2)=i_2$ over all
pairs $(i_1,i_2)$ such that $i_1$ is congruent to~$i_2$ modulo~$m$. So we have
$$
\Pr[h(x_1) = h(x_2)] = \sum_{i_1 \equiv i_2} \Pr[h(x_1)=i_1 \land h(x_2)=i_2].
$$
By $(2,c)$-independence of~$\cal H$, every term of the sum is at most $c/r^2$.
How many terms are there? We have $r$ choices of~$i_1$, for each of them there
are at most $\lceil r/m\rceil \le (r+m-1)/m \le 2r/m$ congruent choices of~$i_2$.
Therefore the whole sum is bounded by $r\cdot (2r/m)\cdot (c/r^2) = 2c/m$ as we
needed.
For 2-independence, we proceed in a~similar way. We are given two items $x_1\ne x_2$
and two buckets $j_1,j_2\in [m]$. We are bounding
$$
\Prsub{h}[h(x_1) \bmod m = j_1 \land h(x_2) \bmod m = j_2] =
\sum_{i_1\equiv j_1\atop i_2\equiv j_2} \Pr[h(x_1) = i_1 \land h(x_2) = i_2].
$$
Again, each term of the sum is at most $c/r^2$. There are at most $\lceil r/m\rceil \le (r+m-1)/m$
choices of~$i_1$ and independently also of~$i_2$. The sum is therefore bounded by
$$
{c\over r^2} \cdot \left( r+m-1 \over m \right)^2 \le
{c\over r^2} \cdot {(2r)^2 \over m^2} =
{4c \over m^2},
$$
which guarantees $(2,4c)$-independence of ${\cal H}\bmod m$.
\qed
The previous proof can be extended to general $k$-independence. However,
instead of $4c$, we get $2^kc$, which grows steeply with~$k$. This is
unavoidable for $m$~close to~$r$, but if we assume $r\gg m$, we can actually
improve the constant even beyond the previous lemma.
\lemmaxn{K}{k-independent composition modulo~m}{
Let~$\cal H$ be a~$(k,c)$-independent family of functions from~$\Uu$ to $[r]$,
and let $m$ be an integer such that $r \ge 2km$.
Then the family ${\cal H} \bmod m = \{ h \bmod m \mid h \in {\cal H} \}$
is $(k,2c)$-independent.
}
\proof
Let us extend the previous proof. The probability that $k$~given distinct items are mapped
to $k$~given buckets by a~random function from ${\cal H}\bmod m$ is at most
$$
{c\over r^k} \cdot \left( r+m-1 \over m \right)^k =
{c\over m^k} \cdot \left( r+m-1 \over r \right)^k \le
{c\over m^k} \cdot \left( 1 + {m\over r} \right)^k.
$$
Since $\e^x \ge 1+x$ for all real~$x$, we have $(1+m/r)^k \le (\e^{m/r})^k = \e^{mk/r}$.
By the premise of the lemma, $mk/r \le 1/2$, so $\e^{mk/r} \le \e^{1/2} \le 2$.
Hence the probability is at most $2c/m^k$, which makes ${\cal H}\bmod m$ $(k,2c)$-independent.
\qed
\example{
Let us test this machinery on the linear family~$\cal L$.
The family of all linear functions in a~field is $(2,1)$-independent
(by the bijection between $(a,b)$ and $(r,s)$ from the original proofs).
Taken modulo~$m$, the family is 2-universal and $(2,4)$-independent by Lemma~\xx{M}.
Whenever $p\ge 4m$, it is even $(2,2)$-independent by Lemma~\xx{K}.
}
The drawback of the reduction modulo~$m$ is that we needed a~2-independent family
to start with. If it is merely universal, the modulo can destroy universality
(exercise~\exref{modfail}). However, composing a~universal family with a~2-independent
family (typically~$\cal L$) yields a~2-independent family.
\lemmaxn{G}{general composition}{
Let~$\cal F$ be a~$c$-universal family of functions from~$\Uu$ to $[r]$.
Let~$\cal G$ be a~$(2,d)$-independent family of functions from $[r]$ to $[m]$.
Then the family ${\cal H} = {\cal F}\circ{\cal G} = \{ f\circ g \mid f\in {\cal F}, g\in {\cal G} \}$
is $(2,c')$-independent for $c' = (cm/r+1)d$.
}
\proof
Given distinct $x_1, x_2\in \Uu$ and $i_1,i_2\in [m]$, we should bound
$$
\Prsub{h\in{\cal H}} [h(x_1) = i_1 \land h(x_2) = i_2] =
\Prsub{f\in{\cal F}, g\in{\cal G}} \; [g(f(x_1)) = i_1 \land g(f(x_2)) = i_2].
$$
It is tempting to apply 2-independence of~$\cal G$ on the intermediate results $f(x_1)$
and $f(x_2)$, but unfortunately we cannot be sure that they are distinct. Still,
universality of~$\cal F$ guarantees that they are distinct almost always.
Let us make this precise.
Let~$M$ be the \em{match} event $g(f(x_1)) = i_1 \land g(f(x_2)) = i_2$ and $C$ the \em{collision}
event $f(x_1) = f(x_2)$. For a~random choice of~$f$ and~$g$, we have
$$\eqalign{
\Pr[M] &= \Pr[M\land\lnot C] + \Pr[M\land C] \cr
&= \Pr[M\mid\lnot C]\cdot\Pr[\lnot C] + \Pr[M\mid C]\cdot\Pr[C]. \cr
}$$
This follows from elementary probability. (If $\Pr[C]$ or $\Pr[\lnot C]$ is~0, the corresponding
conditional probability is not defined, but then we can simply omit that term.)
Each term can be handled separately:
$$\vbox{\halign{\hfil $\displaystyle{#}$&$\displaystyle{{}#}$\hfil&\quad #\hfil\cr
\Pr[M\mid\lnot C] &\le d/m^2 &by $(2,d)$-independence of~${\cal G}$ \cr
\Pr[\lnot C] &\le 1 &trivially \cr
\Pr[M\mid C] &\le d/m &because $(2,d)$-independence implies $(1,d)$-independence \cr
\Pr[C] &\le c/r &by $c$-universality of~${\cal F}$ \cr
}}$$
So $\Pr[M] \le d/m^2 + cd/mr$. We want to write this as $c' / m^2$, so
$c' = d + cdm/r = (1 + cm/r)d$.
\qed
\corr{
The family ${\cal F}\circ {\cal G}$ is also $(2,c')$-independent for $c' = (c+1)d$.
}
\proof
Existence of a~2-independent family from $[r]$ to $[m]$ implies $r\ge m$,
so $(1 + cm/r)d \le (1+c)d$.
\qed
\subsection{Polynomial hashing}
So far, we constructed $k$-independent families only for $k=2$. Families with
higher independence can be obtained from polynomials of degree~$k$ over a~field.
\defn{For any field $\Zp$ and any $k\ge 1$, we define the family of polynomial
hash functions ${\cal P}_k = \{ h_\t \mid \t \in \Zp^k \}$ from $\Zp$ to~$\Zp$,
where $h_\t(x) = \sum_{i=0}^{k-1} t_ix^i$ and the $k$-tuple~$\t$ is indexed from~0.}
\lemma{The family ${\cal P}_k$ is $(k,1)$-independent.}
\proof
Let $x_1,\ldots,x_k\in\Zp$ be distinct items and $a_1,\ldots,a_k\in\Zp$ buckets.
By standard results on polynomial interpolation, there is exactly one polynomial~$h$ of degree
less than~$k$ such that $h(x_i) = a_i$ for every~$i$. Hence the probability that a~random
polynomial of degree less than~$k$ has this property is $1/p^k$.
\qed
Of course hashing from a~field to the same field is of little use, so we usually
consider the family ${\cal P}_k\bmod m$ instead. If we set~$p$ large enough, Lemma~\xx{K}
guarantees:
\corr{If $p \ge 2km$, the family ${\cal P}_k\bmod m$ is $(k,2)$-independent.}
The downside is that we need time $\Theta(k)$ both to pick a~function from the family
and to evaluate it for a~single~$x$.
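A~possible implementation sketch in Python: the polynomial is evaluated by Horner's rule
and the result is reduced modulo~$m$. The function and parameter names are illustrative only.

\begtt
import random

def random_polynomial_hash(p, k, m):
    """Pick a random polynomial of degree < k over Z_p and reduce its value mod m."""
    t = [random.randrange(p) for _ in range(k)]      # coefficients t_0, ..., t_{k-1}

    def h(x):
        value = 0
        for coef in reversed(t):                     # Horner's rule
            value = (value * x + coef) % p
        return value % m

    return h
\endtt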
% TODO: Mention Siegel's constant-time functions.
\subsection{Multiply-shift hashing}
Simple and fast hash functions can be obtained from the combination of multiplication
with bitwise shifts. On a~32-bit machine, we multiply a~32-bit element~$x$ by a~random
odd number~$a$. We let the machine truncate the result to bottom 32 bits and then we
use a~bitwise shift by $32-\ell$ bits to the right, which extracts topmost $\ell$~bits
of the truncated result. Surprisingly, this yields a~good hash function from $[2^{32}]$
to $[2^\ell]$.
We provide a~more general definition using the bit slice operator:
a~\em{bit slice} $x\slice{a:b}$ extracts bits at positions $a$ to~$b-1$ from the
binary expansion of~$x$. Therefore, $x\slice{a:b} = \lfloor x/2^a\rfloor \bmod 2^{b-a}$.
\defn{
For any $w$ (word length of the machine) and $\ell$ (width of the result), we define
the multiply-shift family ${\cal M} = \{ h_a \mid a\in [2^w], \hbox{$a$~odd} \}$
from $[2^w]$ to $[2^\ell]$,
where $h_a(x) = (ax) \slice{w-\ell : w}$.
}
\claim{The family $\cal M$ is 2-universal.}
Obviously, $\cal M$~is not 2-independent, because $h_a(0)=0$ for all~$a$.
However, achieving 2-independence requires only minor changes: increasing the precision from
$w$~bits to roughly $w+\ell$ bits and adding a~random number to make all buckets equiprobable.
\defn{
Let~$\Uu=[2^w]$ be the universe of $w$-bit words, $\ell$ the number of bits of result,
and $w' \ge w + \ell - 1$.
We define the multiply-shift family ${\cal M'} = \{ h_{a,b} \mid a,b\in [2^{w'}],
\hbox{$a$~odd} \}$
from $[2^w]$ to $[2^\ell]$, where
$h_{a,b}(x) = (ax+b) \slice{w'-\ell : w'}$.
}
This is still easy to implement: for 32-bit keys and $\ell\le 32$ bits of result,
we can set $w'=64$, compute multiplication and addition with a~64-bit result
and then extract the topmost $\ell$~bits by shifting to the right by $64-\ell$ bits.
\claim{The family ${\cal M}'$ is 2-independent.}
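A~minimal sketch of both families in Python. The parameters $w=32$, $w'=64$, and $\ell=16$ are
example values; since Python integers are unbounded, the truncation to $w$ or~$w'$ bits is
simulated by masking.

\begtt
import random

W, WW, ELL = 32, 64, 16      # word size w, extended precision w', output bits ell

def random_multiply_shift():
    """The family M: multiply by a random odd a, keep the top ell of the bottom w bits."""
    a = random.randrange(1, 2**W, 2)                       # random odd w-bit number
    return lambda x: ((a * x) & (2**W - 1)) >> (W - ELL)

def random_multiply_shift_2indep():
    """The family M': precision w' >= w + ell - 1 and a random additive term b."""
    a = random.randrange(1, 2**WW, 2)
    b = random.randrange(2**WW)
    return lambda x: ((a * x + b) & (2**WW - 1)) >> (WW - ELL)
\endtt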
\subsection{Tabulation hashing}
To describe a~completely random function from $[2^u]$ to $[2^\ell]$, we need a~table
of $2^u$ entries of $\ell$~bits each. This is usually infeasible (and if it were, there would
be no reason for hashing), but we can split the $u$-bit argument to multiple parts,
index smaller tables by the parts, and combine the tabulated values to the final result.
This leads to a~simple construction of hash functions with quite strong properties.
More precisely, let the universe be $\Uu = [2^{kt}]$ for some $t\ge 1$ (number of parts)
and $k\ge 1$ (part size), and $[2^\ell]$ the set of buckets.
We generate random tables $T_0$, \dots, $T_{t-1}$, each with $2^k$ entries of $\ell$~bits.
To evaluate the hash function for $x\in\Uu$, we split~$x$ to parts $x\slice{ik : (i+1)k}$,
where $0\le i<t$. We index each table by the corresponding part and we \XOR{} the results:
$$
h(x) = \bigoplus_{0\le i<t} T_i[ \, x\slice{ik : (i+1)k} \,].
$$
Tabulation hashing can be considered a~family of hash functions, parametrized by the
contents of the tables. Picking a~function from the family takes time $\Theta(t\cdot 2^k)$
to initialize the tables, evaluating it takes $\Theta(t)$.
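A~sketch of tabulation hashing in Python; the splitting of keys into parts follows the
definition above and the concrete parameters in the example are illustrative.

\begtt
import random

def random_tabulation_hash(t, k, ell):
    """t random tables, each with 2^k entries of ell bits, combined by XOR."""
    tables = [[random.randrange(2**ell) for _ in range(2**k)] for _ in range(t)]

    def h(x):
        result = 0
        for i in range(t):
            part = (x >> (i * k)) & (2**k - 1)     # the slice x[ik : (i+1)k]
            result ^= tables[i][part]
        return result

    return h

# Example: 32-bit keys split into 4 bytes, hashed to 16-bit values.
h = random_tabulation_hash(t=4, k=8, ell=16)
\endtt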
\claim{
Tabulation hashing is 3-independent, but not 4-independent. (Assuming that it uses
at least~2 tables.)
}
Nevertheless, we will see that tabulation hashing possesses some properties which
generally require higher independence.
\subsection{Hashing of vectors using scalar products}
So far, our universe consisted of simple numbers. We now turn our attention to
hashing of more complex data, namely $d$-tuples of integers. We will usually view
the tuples as $d$-dimensional vectors over some field~$\Zp$.
Similarly to the family of linear functions, we can hash vectors by taking scalar
products with a~random vector.
\defn{For a~prime~$p$ and vector size $d\ge 1$, we define the family of
scalar product hash functions
${\cal S} = \{ h_\t \mid \t \in \Zp^d \}$ from~$\Zp^d$ to~$\Zp$, where
$h_\t(\x) = \t \cdot \x$.
}
\theorem{The family $\cal S$ is 1-universal. A~function can be picked at random
from~$\cal S$ in time $\Theta(d)$ and evaluated in the same time.}
\proof
Consider two distinct vectors $\x, \y \in \Zp^d$. Let $k$ be a~coordinate
for which $x_k \ne y_k$. As the scalar product does not depend on the ordering
of components, we can renumber the components so that $k=d$.
For a~random choice of the parameter~$\t$, we have (in~$\Zp$):
$$\eqalign{
&\Pr[ h_\t(\x) = h_\t(\y) ] =
\Pr [ \x\cdot\t = \y\cdot\t ] =
\Pr [ (\x-\y)\cdot\t = 0 ] = \cr
&= \Pr \left[ \sum_{i=1}^d (x_i-y_i)t_i = 0 \right] =
\Pr \left[ (x_d-y_d)t_d = -\sum_{i=1}^{d-1} (x_i-y_i)t_i \right]. \cr
}$$
For every choice of $t_1,\ldots,t_{d-1}$, there exists exactly one
value of~$t_d$ for which the last equality holds. Therefore it holds
with probability $1/p$.
\qed
\note{
There is clear intuition behind the last step of the proof: in a~field,
multiplication of a~non-zero number by a~uniformly random number
yields a~uniformly random number; similarly, adding a~uniformly random number
to any number yields a~uniformly random result.
}
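A~short sketch of the family~$\cal S$ in Python; vectors are represented as lists of
integers in $[p]$ and the interface is illustrative.

\begtt
import random

def random_scalar_product_hash(p, d):
    """The family S: dot product of a d-component vector with a random vector t, in Z_p."""
    t = [random.randrange(p) for _ in range(d)]
    return lambda x: sum(ti * xi for ti, xi in zip(t, x)) % p
\endtt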
What if the field is too large and we want to reduce the result modulo some~$m$?
To use Lemma~\xx{M}, we need a~2-independent family, but~$\cal S$ is merely universal.
We can use Lemma~\xx{G} and compose~$\cal S$ with the family~$\cal L$ of linear
functions from~$\Zp$ to~$[m]$. As~$\cal L$ is $(2,4)$-independent, ${\cal S}\circ{\cal L}$
is $(2,8)$-independent. If $p\ge 4m$, $\cal L$ is even $(2,2)$-independent and ${\cal S}\circ{\cal L}$
is $(2,c')$-independent for $c' = (1 + 1/4)\cdot 2 = 5/2$.
Alternatively, we can add an~additive constant to the scalar product:
\defn{For a~prime~$p$, vector size $d\ge 1$,
we define the family of scalar product hash functions
${\cal S}' = \{ h_{\t,\beta} \mid \t\in \Zp^d, \beta\in\Zp \}$
from~$\Zp^d$ to $\Zp$, where
$h_{\t,\beta}(\x) = \t\cdot \x + \beta$.
}
\theorem{
If $p\ge 4m$, the family ${\cal S}' \bmod m$ is $(2,2)$-independent.
A~function can be picked at random from the family in time $\Theta(d)$
and evaluated in the same time.
}
\proof
By exercise~\exref{sprimeindep}, the family $\cal S'$ is $(2,1)$-independent.
By Lemma~\xx{K}, ${\cal S}' \bmod m$ is $(2,2)$-independent.
\qed
\subsection{Rolling hashes from polynomials}
Another construction of hash functions for vectors is based on polynomials,
but this time the role of the coefficients is played by the components of the vector
and the point where the polynomial is evaluated is chosen randomly.
\defn{
For a~prime~$p$ and vector size~$d$, we define the family of polynomial hash functions
${\cal R} = \{ h_a \mid a\in\Zp \}$ from $\Zp^d$ to~$\Zp$, where
$h_a(\x) = \sum_{i=0}^{d-1} x_{i+1} \cdot a^i$.
}
\theorem{The family~$\cal R$ is $d$-universal.
A~function can be picked from~$\cal R$ at random in constant time
and evaluated for a~given vector in $\Theta(d)$ time.}
\proof
Consider two vectors $\x \ne \y$ and a~hash function~$h_a$ chosen at random
from~$\cal R$. A~collision happens whenever $\sum_i x_{i+1} a^i = \sum_i y_{i+1}
a^i$. This is the same condition as $\sum_i (\x-\y)_{i+1} a^i = 0$, that is, the number~$a$
being a~root of the polynomial whose coefficients are the components of $\x - \y$.
Since $\x\ne\y$, this polynomial is not identically zero, so it has at most $d-1$ roots
(its degree is at most $d-1$). Therefore the probability that $a$~is a~root is at most
$(d-1)/p \le d/p$. This implies $d$-universality.
\qed
As usual, we can reduce the range of the function modulo~$m$. However, it is better
to compose the family~$\cal R$ with the $(2,4)$-independent family~${\cal L}$ of
linear functions. Not only do we get a~2-independent family as a~result, but Lemma~\xx{G}
guarantees that if $p$~is sufficiently large, the big constant from $d$-universality
disappears.
\corr{Given a~prime~$p$ and the number of buckets~$m$ such that $p \ge 4dm$, the
compound family ${\cal R}\circ {\cal L}$ is $(2,5)$-independent.}
Hash functions of this kind play an important role in the \em{Rabin-Karp string search
algorithm.} Suppose that we are searching for a~$d$-character substring~$\nu$ (the needle)
in a~long text~$\sigma$ (the haystack). We pick a~hash function~$h$ and calculate
the hash $h(\nu)$ of the needle. Then we slide a~window of size~$d$ over the haystack and
for each position of the window, we hash the contents of the window. Only if the hash of
the window is equal to $h(\nu)$ do we compare the window with the needle character by character.
For this algorithm, we need a~hash function which can be recalculated in constant time
when the window shifts one position to the right. Such functions are called \em{rolling hashes}
and they are usually chosen from the family~$\cal R$. Compare hashes of the window at positions
$j$ and $j+1$ (we read each window from right to left):
$$\eqalign{
H_j = h(\sigma_{j},\ldots,\sigma_{j+d-1}) &= \sigma_{j}a^{d-1} + \sigma_{j+1}a^{d-2} + \ldots + \sigma_{j+d-1}a^0, \cr
H_{j+1} = h(\sigma_{j+1},\ldots,\sigma_{j+d}) &= \sigma_{j+1}a^{d-1} + \sigma_{j+2}a^{d-2} + \ldots + \sigma_{j+d}a^0, \cr
}$$
We can observe that $H_{j+1} = aH_j - \sigma_{j}a^d + \sigma_{j+d}$, everything calculated
in the field~$\Zp$. This is constant-time if we pre-calculate~$a^d$.
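The following sketch shows the rolling hash and the Rabin-Karp loop in Python. It is only
an~illustration: in practice the point~$a$ would be drawn at random from~$\Zp$, and the concrete
prime and the conversion of characters via {\tt ord} are assumptions of the example.

\begtt
def rabin_karp(needle, haystack, p=2**61 - 1, a=123456789):
    """Report all positions where needle occurs in haystack.
    Uses H_j = s_j*a^(d-1) + ... + s_(j+d-1)*a^0 over Z_p as a rolling hash."""
    d = len(needle)
    if d == 0 or d > len(haystack):
        return []
    a_to_d = pow(a, d, p)                       # a^d, pre-calculated

    def poly_hash(s):                           # Horner's rule
        value = 0
        for c in s:
            value = (value * a + ord(c)) % p
        return value

    target = poly_hash(needle)
    window = poly_hash(haystack[:d])
    matches = []
    for j in range(len(haystack) - d + 1):
        if window == target and haystack[j:j+d] == needle:
            matches.append(j)
        if j + d < len(haystack):               # roll the window one position right
            window = (a * window - ord(haystack[j]) * a_to_d + ord(haystack[j + d])) % p
    return matches
\endtt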
\subsection{Hashing strings}
Finally, let us consider hashing of strings of variable length, up to a~certain limit~$L$.
We will pad the strings to length exactly~$L$ by appending blank characters, reducing the
problem to hashing of $L$-tuples.
We have to make sure that a~blank does not occur anywhere else in the string --- otherwise we get
systematic collisions. The obvious choice is to encode the blank as~0, which allows us to skip
the parts of the hash function which deal with blanks altogether.
The $L$-tuples can be hashed using the family of polynomial-based rolling hash functions.
For large~$L$, this is preferred over scalar-product hashing, which requires generating
a~random $L$-component vector of parameters.
\exercises
\ex{Show that the family of all constant functions from~$\Uu$ to~$[m]$
is 1-independent.}
\ex{Show that the family ${\cal L}$ is not 1-universal. Find the smallest~$c$
such that it is $c$-universal.}
\ex{What if we modify the definition of ${\cal L}$, so that it forces $b=0$?
Is the modified family $c$-universal for some~$c$? Is it 2-independent?}
\ex{Prove that the family ${\cal L}$ is not 3-independent.}
\ex[modfail]{Show that taking a~universal family modulo~$m$ sometimes destroys universality.
Specifically, show that for each~$c$ and~$m>1$, there is a~family~$\cal H$ from some~$\Uu$
to the same~$\Uu$ which is 2-universal, but ${\cal H}\bmod m$ is not $c$-universal.
}
\ex[sprimeindep]{Show that the family~${\cal S}'$ is $(2,1)$-independent.}
\ex{Analyze expected time complexity of the Rabin-Karp algorithm, depending
on haystack length, needle length, and the number of buckets.}
\hint{Assign a~random variable to each position of the window, which will
indicate a~false match (hash collision).}
% TODO: More exercises. Prove claims.
\endexercises
\section{Cuckoo hashing}
\subsection{Sketch}
We have two functions $f$,~$g$ mapping the universe~$\Uu$ to $[m]$.
Each bucket contains at most one item.
An item $x\in \Uu$ can be located only in buckets $f(x)$ and $g(x)$.
Lookup checks only these two buckets, so it takes constant time in the worst case.
Insert looks inside both buckets. If one is empty, we put the new item there.
Otherwise we \uv{kick out} the item from one of these buckets and we replace it
by the new item. We place the kicked-out item using its other hash function,
which may lead to kicking out another item, and so on. After roughly $\log n$
attempts (this limit is called the insertion timeout), we give up and rehash everything
with a~new choice of $f$ and~$g$.
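A~minimal sketch of this insertion procedure in Python; the functions $f$ and~$g$ are
passed in as parameters, the rehashing step is left out, and all names are illustrative.

\begtt
import math

class CuckooTable:
    """Cuckoo hashing with two functions f, g mapping items to range(m)."""

    def __init__(self, m, f, g):
        self.m, self.f, self.g = m, f, g
        self.cells = [None] * m

    def find(self, x):
        return self.cells[self.f(x)] == x or self.cells[self.g(x)] == x

    def insert(self, x):
        if self.find(x):
            return
        timeout = max(1, math.ceil(6 * math.log2(self.m)))
        pos = self.f(x)
        if self.cells[pos] is not None and self.cells[self.g(x)] is None:
            pos = self.g(x)                        # the other bucket is free
        for _ in range(timeout):
            if self.cells[pos] is None:
                self.cells[pos] = x
                return
            x, self.cells[pos] = self.cells[pos], x             # kick out the occupant
            pos = self.g(x) if pos == self.f(x) else self.f(x)  # send it to its other bucket
        self.rehash(x)                             # timeout exceeded

    def rehash(self, x):
        raise NotImplementedError("pick new f, g and re-insert everything")
\endtt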
\theorem{
Let $m\ge (2+\varepsilon)n$, insertion timeout be set to $\lceil 6\log n\rceil$ and
$f$,~$g$ chosen at random from a~$\lceil 6\log n\rceil$-independent family.
Then the expected time complexity of \alg{Insert} is $\O(1)$.
}
\note{
It is also known that a~6-independent family is not sufficient to guarantee
expected constant insertion time, while tabulation hashing (even though
it is not 4-independent) is sufficient.
}
\section{Linear probing}
\em{Open addressing} is a~form of hashing where each bucket contains at
most one item --- the items are stored in an array and the hash function
produces the index of an array cell where the particular item should be stored.
Of course, multiple items can collide in the same cell. So we have
to consider alternative locations of items.
In general, we can define a~\em{probe sequence} for each element of the universe.
This is a~sequence of possible locations of the element in the array in decreasing
order of preference. The cells will be numbered from~0 to~$m-1$ and indexed modulo~$m$.
\alg{Insert} follows the probe sequence until it finds an~empty cell.
\alg{Find} follows the probe sequence until it either finds a~matching item,
or it finds an~empty cell.
\alg{Delete} is more complicated, since we generally cannot remove items
from their cells --- a~probe sequence for a~different item could have been
interrupted. Instead, we will replace deleted items with ``tombstones'' and
rebuild the whole table occasionally.
We will study a~particularly simple probe sequence called \em{linear probing}. We fix a~hash function~$h$
from the universe to~$[m]$. The probe sequence for~$x$ will be $h(x)$, $h(x)+1$,
$h(x)+2$, \dots{}, taken modulo~$m$.
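A~sketch of linear probing in Python, including tombstones; the occasional rebuild of the
table is omitted, the hash function~$h$ is a~parameter, and all names are illustrative.

\begtt
TOMBSTONE = object()        # marker left in place of deleted items

class LinearProbingTable:
    """Open addressing with the probe sequence h(x), h(x)+1, ... modulo m."""

    def __init__(self, m, h):
        self.m, self.h = m, h
        self.cells = [None] * m

    def _probe(self, x):
        start = self.h(x)
        for i in range(self.m):
            yield (start + i) % self.m

    def find(self, x):
        for i in self._probe(x):
            if self.cells[i] is None:
                return False            # an empty cell ends the search
            if self.cells[i] == x:
                return True
        return False

    def insert(self, x):
        if self.find(x):
            return
        for i in self._probe(x):
            if self.cells[i] is None or self.cells[i] is TOMBSTONE:
                self.cells[i] = x
                return
        raise RuntimeError("table is full")

    def delete(self, x):
        for i in self._probe(x):
            if self.cells[i] is None:
                return
            if self.cells[i] == x:
                self.cells[i] = TOMBSTONE
                return
\endtt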
Historically, linear probing was considered inferior to other probing methods,
because it tends to produce \em{clusters} --- continuous intervals of cells
occupied by items. Once a~large cluster forms, it is probable that the next item
inserted will be hashed to a~cell inside the cluster, extending the cluster even further.
On the other hand, linear probing is quite friendly to caches, since it accesses
the array sequentially. We will prove that if we use a~strong hash function and
we keep the table sparse enough (let us say at most half full), the expected
size of clusters will be constant.
\claimn{basic properties of linear probing}{Suppose that $m\ge (1+\varepsilon)\cdot n$. Then
the expected number of probes during an~operation is:
\tightlist{o}
\:$\O(1/\varepsilon^2)$ for a~completely random hash function
\:$\O(f(\varepsilon))$ for a~function chosen from a~$(\log n)$-independent family
(here~$f$ is some function)
\:$\O(1/\varepsilon^{13/6})$ for a~hash function chosen from a~5-independent family
\:$\Omega(\log n)$ for at least one 4-independent family
\:$\Omega(\sqrt n)$ for at least one 2-independent family
\:$\Omega(\log n)$ for multiply-shift hashing
\:$\O(1/\varepsilon^2)$ for tabulation hashing
\endlist
}
We will prove a~slightly weaker version of the first bound. (All restrictions
can be removed at the expense of making the technical details more cumbersome.)
\theorem{Let $m$ (table size) be a~power of two, $n\le m/3$ (the number of items),
$h$~a~completely random hash function, and~$x$ an~item. Then the expected number
of probes when searching for~$x$ is bounded by a~constant independent of $n$, $m$, $h$, and~$x$.
}
The rest of this section is dedicated to the proof of this theorem.
Without loss of generality, we can assume that $n=m/3$, since this clearly maximizes
the expected number of probes. We need not worry that $m$~might not be divisible
by~3 --- errors introduced by rounding are going to be negligible.
We build a~complete binary tree over the table cells. A~node at height~$t$ corresponds
to an~interval of size $2^t$, whose start is aligned on a~multiple of the size. We will
call such intervals \em{blocks.} A~block is \em{critical} if at least $2/3\cdot 2^t$
items are hashed to it. (We have to distinguish carefully between items \em{hashed}
to a~block by the hash function and those really \em{stored} in it.)
We will calculate the probability that a~given block is critical. We will use the standard
tool for estimating tail probabilities --- the Chernoff inequality (the proof can be found
in most probability theory textbooks).
\theoremn{Chernoff bound for the right tail}{
Let $X_1,\ldots,X_k$ be independent random variables taking values in $\{0,1\}$.
Let further $X = X_1 + \ldots + X_k$, $\mu = \E[X]$, and $c>1$. Then
$$\Pr[X > c\mu] \le \left( \e^{c-1} \over c^c \right) ^ \mu.$$
}
\lemma{
Let~$B$ be a~block of size~$2^t$. The probability that $B$~is critical is at most
$(\e/4)^{2^t/3}$.
}
\proof
For each item~$x_i$, we define an indicator random variable~$X_i$, which will be~1
if~$x_i$ is hashed to the block~$B$. The expectation of an indicator equals the probability
of the indicated event, that is $2^t / m$.
$X = X_1 + \ldots + X_n$ counts the number of items hashed to~$B$. By linearity of
expectation, $\mu = \E[X] = \sum_i \E[X_i] = n\cdot 2^t / m$. As we assume that $n=m/3$,
we have $\mu = 2^t/3$. So if a~block is critical, we must have $X > 2\mu$. By the
Chernoff bound with $c=2$, this happens with probability at most $(\e/4)^\mu
= (\e/4)^{2^t/3}$.
\qed
Now, we are going to analyze \em{runs} --- maximal intervals of consecutive non-empty
cells (indexed modulo~$m$). There is an~empty cell before and after each run. Therefore,
whenever an item is stored in a~run, it must be also hashed there. Our ultimate goal
is to estimate the size of runs. In order to do this, we prove that a~long run must contain
at least one large critical block.
\lemma{
Let~$R$ be a~run of size at least $2^{\ell+2}$ and $B_0$, \dots, $B_3$ the first
4~blocks of size $2^\ell$ intersecting~$R$. Then at least one of these blocks is
critical.
}
\proof
The size of~$R$ is at least $4\cdot 2^\ell$, but $R$ is generally not aligned on
the start of a~block, so its first cell lies somewhere inside~$B_0$.
Hence $B_0$ contains at least one cell of~$R$, while the blocks $B_1$ to~$B_3$
contain $2^\ell$ cells of~$R$ each. So the interval $L = R \cap (B_0 \cup \ldots \cup B_3)$
must contain at least $1 + 3\cdot 2^\ell$ cells, all of them non-empty. Since there
is an~empty cell before~$L$, each item stored in~$L$ is also hashed there.
We will show that if no block~$B_i$ is critical, there cannot be so many items hashed to~$L$.
Each non-critical block of size~$2^\ell$ has at most $2/3\cdot 2^\ell$ items hashed to it,
so our 4~blocks can receive at most $8/3\cdot 2^\ell < 3\cdot 2^\ell$ items.
\qed
We consider the situation when we search for an~item~$x$. We start probing at position
$h(x)$, so the number of probes is at most the size of the run containing the cell $h(x)$.
\lemma{
Let~$R$ be a~run containing $h(x)$ and $\vert R\vert \in [2^{\ell+2}, 2^{\ell+3})$.
Then at least one of the following 12 blocks of size $2^\ell$ is critical:
the block containing $h(x)$,
8~blocks preceding it, and
3~blocks following it.
}
\proof
The run~$R$ intersects between 4 and~9 blocks of the given size,
so it begins at most 8~blocks before the block containing $h(x)$.
By the previous lemma, one of the first 4 blocks of~$R$ is critical.
\qed
Hence, the probability that run length lies between $2^{\ell+2}$ and $2^{\ell+3}$
is bounded by the probability that one of the 12 blocks
(chosen independently of the actual length of the run)
is critical. By union bound and our estimate on the probability that a~block is critical,
we get:
\corr{
Let~$R$ be a~run containing $h(x)$. The probability that
$\vert R\vert \in [2^{\ell+2}, 2^{\ell+3})$ is at most
$12 \cdot (\e/4)^{2^\ell / 3} = 12\cdot q^{2^\ell}$, where $q = (\e/4)^{1/3} \doteq 0.879$.
}
Finally, we will use this to prove that the expected size of~$R$ is at most a~constant.
We cover all possible sizes by intervals $[2^{\ell+2},2^{\ell+3})$ together with an exceptional interval
$[0,3]$. We replace each size by the upper bound of its interval, which only increases
the expectation. So we get:
$$
\E[\vert R\vert] \le 3\cdot\Pr[\vert R\vert \le 3] + \sum_{\ell\ge 0} 2^{\ell+3} \cdot \Pr[\vert R\vert \in [2^{\ell+2},2^{\ell+3}]].
$$
We bound the former probability simply by~1 and use the previous corollary to bound
the latter probability:
$$
\E[\vert R\vert] \le 3 + \sum_{\ell\ge 0} 2^{\ell+3} \cdot q^{2^\ell}
= 3 + 8 \cdot \sum_{\ell\ge 0} 2^\ell \cdot q^{2^\ell}
\le 3 + 8 \cdot \sum_{i\ge 1} i\cdot q^i.
$$
Since the last sum converges for an arbitrary $q\in(0,1)$, the expectation of~$\vert R\vert$
is at most a~constant. This concludes the proof of the theorem.
\qed
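\note{
For illustration, we can evaluate the constant: with $q \doteq 0.879$, we have
$\sum_{i\ge 1} i q^i = q/(1-q)^2 \doteq 60$, so the bound is roughly $3 + 8\cdot 60 \doteq 483$.
The constant is far from tight, but it is indeed independent of $n$, $m$, $h$, and~$x$.
}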
% TODO: Concentration inequalities and 5-independence.
\section{Bloom filters}
Bloom filters are a~family of data structures for approximate representation of sets
in a~small amount of memory. A~Bloom filter starts with an empty set. Then it supports
insertion of new elements and membership queries. Sometimes, the filter gives a~\em{false
positive} answer: it answers {\csc yes} even though the element is not in the set.
We will calculate the probability of false positives and decrease it at the expense of
making the structure slightly larger. False negatives will never occur.
\subsection{A trivial example}
We start with a~very simple filter. Let~$h$ be a~hash function from a~universe~$\cal U$
to $[m]$, picked at random from a~$c$-universal family. For simplicity, we will assume
that $c=1$. The output of the hash function will serve as an~index to an~array
$B[0\ldots m-1]$ of bits.
At the beginning, all bits of the array are zero.
When we insert an element~$x$, we simply set the bit $B[h(x)]$ to~1.
A~query for~$x$ tests the bit $B[h(x)]$ and answers {\csc yes} iff the bit is set to~1.
(We can imagine that we are hashing items to $m$~buckets, but we store only which
buckets are non-empty.)
Suppose that we have already inserted items $x_1,\ldots,x_n$. If we query the filter
for any~$x_i$, it always answers {\csc yes.} But if we ask for a~$y$ different from
all~$x_i$'s, we can get a~false positive answer if $y$~falls into the same bucket
as one of the $x_i$'s.
Let us calculate the probability of a~false positive answer.
For a~concrete~$i$, we have $\Pr_h[h(y) = h(x_i)] \le 1/m$ by 1-universality.
By the union bound, the probability that $h(y) = h(x_i)$ for at least one~$i$
is at most $n/m$.
We can ask the inverse question, too: how large a~filter do we need to push the error
probability under some $\varepsilon>0$? By our calculation, $\lceil n/\varepsilon\rceil$
bits suffice. It is interesting that this size does not depend on the size of the universe
--- all previous data structures required at least $\log\vert{\cal U}\vert$ bits per item.
On the other hand, the size scales badly with error probability: for example,
a~filter for $10^6$ items with $\varepsilon = 0.01$ requires 100\thinspace Mb.
\subsection{Multi-band filters}
To achieve the same error probability in smaller space, we can simply run
multiple filters in parallel. We choose $k$~hash functions $h_1,\ldots,h_k$,
where $h_i$~maps the universe to a~separate array~$B_i$ of $m$~bits. Each
pair $(B_i,h_i)$ is called a~\em{band} of the filter.
Insertion adds the new item to all bands. A~query asks all bands and it answers
{\csc yes} only if each band answered {\csc yes}.
We shall calculate the error probability of the $k$-band filter. Suppose that we set
$m=2n$, so that each band gives a~false positive with probability at most $1/2$.
The whole filter gives a~false positive only if all bands did, which happens with
probability at most $2^{-k}$ if the functions $h_1,\ldots,h_k$ were chosen independently.
This proves the following theorem.
\theorem{
Let $\varepsilon > 0$ be the desired error probability
and $n$~the maximum number of items in the set.
The $k$-band Bloom filter with $m=2n$ and $k = \lceil \log (1/\varepsilon)\rceil$
gives false positives with probability at most~$\varepsilon$.
It requires $2n\lceil\log (1/\varepsilon)\rceil$ bits of memory and both
\alg{Insert} and \alg{Lookup} run in time $\O(k)$.
}
In the example with $n=10^6$ and $\varepsilon=0.01$, we get $m=2\cdot 10^6$
and $k=7$, so the whole filter requires 14\thinspace Mb. If we decrease
$\varepsilon$ to $0.001$, we have to increase~$k$ only to~10, so the memory
consumption reaches only 20\thinspace Mb.
\subsection{Optimizing parameters}
The multi-band filter works well, but it turns out that we can fine-tune its parameters
to improve memory consumption by a~constant factor. We can view it as
an~optimization problem: given a~memory budget of~$M$ bits, set the parameters
$m$ and~$k$ such that the filter fits in memory ($mk \le M$) and the error
probability is minimized. We will assume that all hash functions are perfectly
random.
Let us focus on a~single band first. If we select its size~$m$, we can easily
calculate the probability that a~given bit is zero. We have $n$~items, each of them hashed
to this bit with probability $1/m$. So the bit remains zero with probability $(1-1/m)^n$.
This is approximately $p = \e^{-n/m}$.
We will show that if we set~$p$, all other parameters are uniquely determined and so
is the probability of false positives. We will find~$p$ such that this probability is
minimized.
If we set~$p$, it follows that $m \approx -n / \ln p$. Since all bands must fit in $M$~bits
of memory, we want to use $k = \lfloor M/m\rfloor \approx -M/n \cdot \ln p$ bands. False
positives occur if we find~1 in all bands, which has probability
$$
(1-p)^k =
\e^{k\ln(1-p)} \approx
\e^{-M/n \cdot \ln p \cdot \ln(1-p)}.
$$
As $\e^{-x}$ is a~decreasing function, it suffices to maximize $\ln p \cdot \ln (1-p)$
for $p\in(0,1)$. By elementary calculus, the maximum is attained for $p = 1/2$. This
leads to false positive probability $(1/2)^k = 2^{-k}$. If we want to push this under~$\varepsilon$,
we set $k = \lceil\log(1/\varepsilon)\rceil$,
so $M = kn / \ln 2 \approx n \cdot \log(1/\varepsilon) \cdot (1/\ln 2) \doteq
n \cdot \log(1/\varepsilon) \cdot 1.44$.
% TODO: Plot ln(p)*ln(1-p)
This improves the constant from the previous theorem from~2 to circa 1.44.
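\example{
For illustration, let us redo the earlier example with the optimized parameters. For $n=10^6$
items and $\varepsilon=0.01$, we get $k=7$ bands of $m = n/\ln 2 \doteq 1.44\cdot 10^6$ bits
each, so $M = km \doteq 10.1\cdot 10^6$ bits, i.e., circa 10\thinspace Mb instead of the
14\thinspace Mb used by the filter with $m=2n$. The false positive probability is
$2^{-7} \doteq 0.008 \le \varepsilon$.
}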
\note{It is known that any approximate membership data structure with false positive
probability~$\varepsilon$ and no false negatives must use at least $n\log(1/\varepsilon)$
bits of memory. The optimized Bloom filter is therefore within a~factor of 1.44 from the
optimum.}
\subsection{Single-table filters}
It is also possible to construct a~Bloom filter where multiple hash functions
point to bits in a~shared table. (In fact, this was the original construction by Bloom.)
Consider $k$~hash functions $h_1,\ldots,h_k$ mapping the universe to~$[m]$ and a~bit array
$B[0,\ldots,m-1]$. $\alg{Insert}(x)$ sets the bits $B[h_1(x)],\ldots,B[h_k(x)]$ to~1.
$\alg{Lookup}(x)$ returns {\csc yes}, if all these bits are set.
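A~sketch of the single-table filter in Python; the hash functions are passed in as a~list and
are assumed to be independent and random, and a~bit array could replace the list of booleans.

\begtt
class BloomFilter:
    """Single-table Bloom filter with hash functions hs mapping items to range(m)."""

    def __init__(self, m, hs):
        self.hs = hs
        self.bits = [False] * m

    def insert(self, x):
        for h in self.hs:
            self.bits[h(x)] = True

    def lookup(self, x):
        # may give a false positive, never a false negative
        return all(self.bits[h(x)] for h in self.hs)
\endtt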
This filter can be analysed similarly to the $k$-band version. We will assume that
all hash functions are perfectly random and mutually independent.
Insertion of $n$~elements sets $kn$ bits (not necessarily distinct), so the
probability that a~fixed bit $B[i]$ is set is $(1-1/m)^{nk}$, which is approximately
$p = \e^{-nk/m}$. We will find the optimum value of~$p$, for which the probability
of false positives is minimized. For fixed~$m$, we get $k = -m/n\cdot\ln p$.
We get a~false positive if all bits $B[h_i(x)]$ are set. This happens with probability
approximately\foot{We are cheating a~little bit here: the events $B[i]=1$
for different~$i$ are not mutually independent. However, further analysis shows that
they are only weakly correlated, so our approximation holds.}
$(1-p)^k = (1-p)^{-m/n\cdot\ln p} = \exp(-m/n\cdot\ln p\cdot\ln (1-p))$.
Again, this is minimized for $p = 1/2$. So for a~fixed error probability~$\varepsilon$,
we get $k = \lceil\log(1/\varepsilon)\rceil$ and $m = kn / \ln 2 \doteq 1.44\cdot
n\cdot\lceil\log(1/\varepsilon)\rceil$.
We see that as far as our approximation can tell, single-table Bloom filters
achieve the same performance as the $k$-band version.
% TODO
% \subsection{Set operations}
\subsection{Counting filters}
An~ordinary Bloom filter does not support deletion: when we delete an~item, we do not
know if some of its bits are shared with other items. There is an~easy solution: instead
of bits, keep $b$-bit counters $C[0\ldots m-1]$. \alg{Insert} increments the counters, \alg{Delete} decrements
them, and \alg{Lookup} returns {\csc yes} if all counters are non-zero.
However, since the counters have limited range, they can overflow. We will handle overflows
by keeping the counter at the maximum allowed value $2^b-1$, which will not be changed by
subsequent insertions or deletions. We say that such a~counter is \em{stuck.} Obviously,
too many stuck counters will degrade the data structure. We will show that this happens
with small probability only.
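A~sketch of a~counting filter with $b$-bit counters in Python; as before, the hash functions
are parameters of the sketch, and counters that reach the maximum value become stuck and are
never changed again.

\begtt
class CountingFilter:
    """Bloom filter with b-bit counters instead of bits."""

    def __init__(self, m, hs, b=4):
        self.max = 2**b - 1
        self.hs = hs
        self.counters = [0] * m

    def insert(self, x):
        for h in self.hs:
            i = h(x)
            if self.counters[i] < self.max:
                self.counters[i] += 1            # stuck counters stay at the maximum

    def delete(self, x):
        for h in self.hs:
            i = h(x)
            if 0 < self.counters[i] < self.max:  # stuck counters are never decremented
                self.counters[i] -= 1

    def lookup(self, x):
        return all(self.counters[h(x)] > 0 for h in self.hs)
\endtt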
We will assume a~single-band filter with one fully random hash function and $m$~counters after
insertion of~$n$ items. For fixed counter value~$t$, we have
$$
\Pr[C[i]=t] = {n\choose t}\cdot \left(1\over m\right)^t \cdot \left(1 - {1\over m}\right)^{n-t},
$$
because for each of $n\choose t$ $t$-tuples we have probability $(1/m)^t$ that the
tuple is hashed to~$i$ and probability $(1-1/m)^{n-t}$ that all other items are
hashed elsewhere.
If $C[i]\ge t$, there must exist a~$t$-tuple hashed to~$i$ and the remaining items
can be hashed anywhere. Therefore:
$$
\Pr[C[i]\ge t] \le {n\choose t}\cdot \left(1\over m\right)^t.
$$
Since ${n\choose t} \le (n\e/t)^t$, we have
$$
\Pr[C[i]\ge t] \le \left( n\e \over t \right)^t \cdot \left( 1\over m \right)^t
= \left( n\e \over mt \right)^t.
$$
As we already know that the optimum~$m$ is approximately $n / \ln 2$, the probability is
at most $(\e\ln 2 / t)^t$.
By the union bound, the probability that there exists a~stuck counter is at most $m$~times larger.
\example{
A~4-bit counter is stuck when it reaches $t=15$, which by our bound happens with probability at most $3.06\cdot 10^{-14}$.
If we have $m = 10^9$ counters, the probability that any is stuck is at most $3.06\cdot 10^{-5}$.
So for any reasonably large table, 4-bit counters are sufficient and they seldom get stuck.
Of course, for a~very long sequence of operations, stuck counters eventually accumulate,
so we should preferably rebuild the structure occasionally.
}
% TODO
% \subsection{Representing functions: the Bloomier filters}
\endchapter