Merge branch 'master' of gitlab.kam.mff.cuni.cz:mj/dsbook

6c1dac32 · Martin Mareš · 43f9bcc7 · bf861ee3 · 6c1dac32 · 6c1dac32
Commit 6c1dac32 authored 6 years ago by Martin Mareš
--- a/06-hash/hash.tex
+++ b/06-hash/hash.tex
@@ -3,6 +3,14 @@
 \singlechapter{6}
 \fi

+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+% Terminology:
+%   • Things in the universe are called elements.
+%   • Elements stored in the data structure (or searched for etc.) are items.
+%   • Hash function maps elements to buckets.
+%   • In open addressing, we call the buckets cells (of the array).
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
 \chapter[hash]{Hashing}

 \section{Systems of hash functions}
@@ -21,7 +29,7 @@
 We say that the family is \em{$c$-universal} for some $c>0$ if
 for every pair $x,y\in {\cal U}$ of distinct items we have
 $$
-	\Pr_{h\in{\cal H}} [h(x) = h(y)] \le {c\over m}.
+	\Prsub{h\in{\cal H}} [h(x) = h(y)] \le {c\over m}.
 $$
 In other words, if we pick a~hash function~$h$ uniformly at random from~$\cal H$,
 the probability that $x$ and~$y$ collide is at most $c$-times more than for
@@ -90,7 +98,7 @@ The family is \em{$(k,c)$-independent} for integer $k\ge 1$ and real $c>0$ iff
 for every $k$-tuple $x_1,\ldots,x_k$ of distinct items of~$\cal U$
 and every $k$-tuple $a_1,\ldots,a_k$ of buckets in~$[m]$, we have
 $$
-	\Pr_{h\in{\cal H}} [h(x_1) = a_1 \land \ldots \land h(x_k) = a_k] \le {c\over m^k}.
+	\Prsub{h\in{\cal H}} [h(x_1) = a_1 \land \ldots \land h(x_k) = a_k] \le {c\over m^k}.
 $$
 That is, if we pick a~hash function~$h$ uniformly at random from~$\cal H$,
 the probability that the given items are mapped to the given buckets is
@@ -193,7 +201,7 @@ is $2c$-universal and $(2,4)$-independent.

 \proof
 Consider universality first. For two given items $x_1\ne x_2$, we should show that
-$\Pr_{h\in{\cal H}}[h(x_1) = h(x_2)] \le 2c/m$. The event $h(x_1) = h(x_2)$ can be
+$\Prsub{h\in{\cal H}}[h(x_1) = h(x_2)] \le 2c/m$. The event $h(x_1) = h(x_2)$ can be
 written as a~union of disjoint events $h(x_1)=i_1 \land h(x_2)=i_2$ over all
 pairs $(i_1,i_2)$ such that $i_1$ is congruent to~$i_2$ modulo~$m$. So we have
 $$
@@ -208,7 +216,7 @@ needed.
 For 2-independence, we proceed in a~similar way. We are given two items $x_1\ne x_2$
 and two buckets $j_1,j_2\in [m]$. We are bounding
 $$
-	\Pr_h[h(x_1) \bmod m = j_1 \land h(x_2) \bmod m = j_2] =
+	\Prsub{h}[h(x_1) \bmod m = j_1 \land h(x_2) \bmod m = j_2] =
 	\sum_{i_1\equiv j_1\atop i_2\equiv j_2} \Pr[h(x_1) = i_1 \land h(x_2) = i_2].
 $$
 Again, each term of the sum is at most $c/r^2$. There are at most $\lceil r/m\rceil \le (r+m-1)/m$
@@ -255,8 +263,8 @@ is $(2,c')$-independent for $c' = (cm/r+1)d$.
 \proof
 Given distinct $x_1, x_2\in {\cal U}$ and $i_1,i_2\in [m]$, we should bound
 $$
-	\Pr_{h\in{\cal H}} [h(x_1) = i_1 \land h(x_2) = i_2] =
-	\Pr_{f\in{\cal F}, g\in{\cal G}} \; [g(f(x_1)) = i_1 \land g(f(x_2)) = i_2].
+	\Prsub{h\in{\cal H}} [h(x_1) = i_1 \land h(x_2) = i_2] =
+	\Prsub{f\in{\cal F}, g\in{\cal G}} \; [g(f(x_1)) = i_1 \land g(f(x_2)) = i_2].
 $$
 It is tempting to apply 2-independence of~$\cal G$ on the intermediate results $f(x_1)$
 and $f(x_2)$, but unfortunately we cannot be sure that they are distinct. Fortunately,
@@ -294,14 +302,14 @@ so $(1 + cm/r)d \le (1+c)d$.
 So far, we constructed $k$-independent families only for $k=2$. Families with
 higher independence can be obtained from polynomials of degree~$k$ over a~field.

-\defn{For any field $\Z_p$ and any $k\ge 1$, we define the family of polynomial
-hash functions ${\cal P}_k = \{ h_{\bf a} \mid {\bf a} \in \Z_p^k \}$ from $\Z_p$ to~$\Z_p$,
-where $h_{\bf a}(x) = \sum_{i=0}^{k-1} a_ix^i$.}
+\defn{For any field $\Zp$ and any $k\ge 1$, we define the family of polynomial
+hash functions ${\cal P}_k = \{ h_\a \mid \a \in \Zp^k \}$ from $\Zp$ to~$\Zp$,
+where $h_\a(x) = \sum_{i=0}^{k-1} a_ix^i$.}

 \lemma{The family ${\cal P}_k$ is $(k,1)$-independent.}

 \proof
-Let $x_1,\ldots,x_k\in\Z_p$ be distinct items and $a_1,\ldots,a_n\in Z_p$ buckets.
+Let $x_1,\ldots,x_k\in\Zp$ be distinct items and $a_1,\ldots,a_k\in\Zp$ buckets.
 By standard results on polynomials, there is exactly one polynomial~$h$ of degree less than~$k$
 such that $h(x_i) = a_i$ for every~$i$. Hence the probability that a~random polynomial
 of degree less than~$k$ satisfies this property is $1/p^k$.
@@ -398,40 +406,64 @@ a~random vector.

 \defn{For a~prime~$p$ and vector size $d\ge 1$, we define the family of
 scalar product hash functions
-${\cal S} = \{ h_{\bf a} \mid {\bf a} \in \Z_p^d \}$ from~$\Z_p^d$ to~$\Z_p$, where
-$h_{\bf a}({\bf x}) = {\bf a} \cdot {\bf x}$.
+${\cal S} = \{ h_\a \mid \a \in \Zp^d \}$ from~$\Zp^d$ to~$\Zp$, where
+$h_\a(\x) = \a \cdot \x$.
 }

-\theorem{The family $\cal S$ is 1-universal.}
+\theorem{The family $\cal S$ is 1-universal. A~function can be picked at random
+from~$\cal S$ in time $\Theta(d)$ and evaluated in the same time.}

 \proof
-TODO
+Consider two distinct vectors $\x, \y \in \Zp^d$. Let $i$ be a~coordinate
+for which $\x_i \ne \y_i$. As the scalar product does not depend on the ordering
+of components, we can renumber the components, so that $i=d$.
+
+For a~random choice of the parameter~$\bf a$, we have (in~$\Zp$):
+$$\eqalign{
+&\Prsub{\a\in\Zp^d} [ h_\a(\x) = h_\a(\y) ] =
+\Pr [ \x\cdot\a = \y\cdot\a ] =
+\Pr [ (\x-\y)\cdot\a = 0 ] = \cr
+&= \Pr \left[ \sum_{i=1}^d (\x_i-\y_i)\a_i = 0 \right] =
+\Pr \left[ (\x_d-\y_d)\a_d = -\sum_{i=1}^{d-1} (\x_i-\y_i)\a_i \right]. \cr
+}$$
+For every choice of $\a_1,\ldots,\a_{d-1}$, there exists exactly one
+value of~$\a_d$ for which the last equality holds. Therefore it holds
+with probability $1/p$.
 \qed

-As usually, we can reduce the result modulo~$m<p$. By Lemma~\xx{M}, a~family
-${\cal S}\bmod m$ from~$\Z_p^k$ to $[m]$ is 2-universal.
+\note{
+There is clear intuition behind the last step of the proof: in a~field,
+multiplication of a~non-zero number by a~uniformly random number
+yields a~uniformly random number; similarly, adding a~uniformly random number
+to any number yields a~uniformly random result.
+}
+
+As usual, we can reduce the result modulo~$m<p$. By Lemma~\xx{M}, the family
+${\cal S}\bmod m$ from~$\Zp^d$ to $[m]$ is 2-universal.

 To obtain 2-independence, we simply compose ${\cal S}$ with the $(2,4)$-independent
 family~${\cal L}'$. By Lemma~\xx{G}, the result will be a~$(2,8)$-independent family,
 or even $(2,5)$-independent if $p\ge 4m$.

 The compound hash functions can be written as
-$(\alpha({\bf a}\cdot {\bf x}) + \beta) \bmod m$, where
-${\bf a}$ is a~vector parameter, and $\alpha$ and~$\beta$ are scalar parameters.
-However, $\alpha({\bf a} \cdot {\bf x}) = {\bf a}' \cdot {\bf x}$ and if
-$\bf a$ and $\alpha$ were uniformly distributed, so is~$\bf a'$.
-So we can define the compound family in a~more compact way:
-
-\defn{For a~prime~$p$, vector size $d\ge 1$, and the number of buckets~$m$
-such that $p\ge 4m$, we define the family of scalar product hash functions
-${\cal S}' = \{ h_{{\bf a},\beta} \mid {\bf a}\in \Z_p^d, \beta\in\Z_p \}$
-from~$\Z_p^d$ to $[m]$, where
-$h_{{\bf a},\beta}(x) = ({\bf a}\cdot {\bf x} + \beta) \bmod m$.
-(The operations in parentheses are performed in the field~$\Z_p$.)
+$(\alpha(\a\cdot \x) + \beta) \bmod m$, where
+$\a$ is a~vector parameter, and $\alpha$ and~$\beta$ are scalar parameters.
+However, $\alpha(\a \cdot \x)$ can be written as $\a' \cdot \x$
+for some vector $\bf a'$ and if $\bf a$ and $\alpha$ were uniformly
+distributed, so is~$\bf a'$. So we can define the compound family in a~more
+compact way:
+
+\defn{For a~prime~$p$, vector size $d\ge 1$, and the number of buckets~$m$,
+we define the family of scalar product hash functions
+${\cal S}' = \{ h_{\a,\beta} \mid \a\in \Zp^d, \beta\in\Zp \}$
+from~$\Zp^d$ to $[m]$, where
+$h_{\a,\beta}(\x) = (\a\cdot \x + \beta) \bmod m$.
+(The operations in parentheses are performed in the field~$\Zp$.)
 }

-A~scalar product hash function of either kind can be picked at random
-in time $\Theta(d)$ and then evaluated for a~single vector in the same time.
+\theorem{If $p\ge 4m$, the family ${\cal S}'$ is $(2,5)$-independent.
+A~function can be picked at random from~${\cal S}'$ in time $\Theta(d)$
+and evaluated in the same time.}

 \subsection{Rolling hashes from polynomials}

@@ -441,17 +473,19 @@ the polynomial is evaluated is chosen randomly.

 \defn{
 For a~prime~$p$ and vector size~$d$, we define the family of polynomial hash functions
-${\cal R} = \{ h_a \mid a\in\Z_p \}$ from $\Z_p^d$ to~$\Z_p$, where
-$h_a({\bf x}) = \sum_{i=0}^{d-1} {\bf x}_{i+1} \cdot a^i$.
+${\cal R} = \{ h_a \mid a\in\Zp \}$ from $\Zp^d$ to~$\Zp$, where
+$h_a(\x) = \sum_{i=0}^{d-1} \x_{i+1} \cdot a^i$.
 }

-\lemma{The family~$\cal R$ is $d$-universal.}
+\theorem{The family~$\cal R$ is $d$-universal.
+A~function can be picked from~$\cal R$ at random in constant time
+and evaluated on a~given vector in $\Theta(d)$ time.}

 \proof
-Consider two vectors ${\bf x} \ne {\bf y}$ and a~hash function~$h_a$ chosen at random
-from~$\cal R$. A~collision happens whenever $\sum_i {\bf x}_{i+1} a^i = \sum_i {\bf y}_{i+1}
-a^i$. This is the same condition as $\sum_i ({\bf x}-{\bf y})_{i+1} a^i = 0$, that is if
-the number~$a$ is a~root of the polynomial ${\bf x} - {\bf y}$. Since a~polynomial
+Consider two vectors $\x \ne \y$ and a~hash function~$h_a$ chosen at random
+from~$\cal R$. A~collision happens whenever $\sum_i \x_{i+1} a^i = \sum_i \y_{i+1}
+a^i$. This is the same condition as $\sum_i (\x-\y)_{i+1} a^i = 0$, that is if
+the number~$a$ is a~root of the polynomial $\x - \y$. Since a~polynomial
 of degree at most~$d$ can have at most~$d$ roots (unless it is identically zero),
 the probability that $a$~is a~root is at most $d/p$. This implies $d$-universality.
 \qed
@@ -465,10 +499,37 @@ disappears.
 \corr{Given a~prime~$p$ and the number of buckets~$m$ such that $p \ge 4dm$, the
 compound family ${\cal R}\circ {\cal L}'$ is $(2,5)$-independent.}

-TODO: Time, rolling
+Hash functions of this kind play an important role in the \em{Rabin-Karp string search
+algorithm.} Suppose that we are searching for a~$d$-character substring~$\nu$ (the needle)
+in a~long text~$\sigma$ (the haystack). We pick a~hash function~$h$ and calculate
+the hash $h(\nu)$ of the needle. Then we slide a~window of size~$d$ over the haystack and
+for each position of the window, we hash the contents of the window. Only if the hash of
+the window is equal to $h(\nu)$, we compare the window with the needle character-by-character.
+
+For this algorithm, we need a~hash function which can be recalculated in constant time
+when the window shifts one position to the right. Such functions are called \em{rolling hashes}
+and they are usually chosen from the family~$\cal R$. Compare hashes of the window at positions
+$j$ and $j+1$ (we index each window from the right):
+$$\eqalign{
+	H_j = h(\sigma[j],\ldots,\sigma[j+d-1]) &= \sigma[j]a^{d-1} + \sigma[j+1]a^{d-2} + \ldots + \sigma[j+d-1]a^0, \cr
+	H_{j+1} = h(\sigma[j+1],\ldots,\sigma[j+d]) &= \sigma[j+1]a^{d-1} + \sigma[j+2]a^{d-2} + \ldots + \sigma[j+d]a^0, \cr
+}$$
+We can observe that $H_{j+1} = aH_j - \sigma[j]a^d + \sigma[j+d]$. (Everything calculated
+in the field~$\Zp$.)

 \subsection{Hashing strings}

+Finally, let us consider hashing of strings of variable length up to some limit~$L$.
+We can pad the strings to length exactly~$L$ by appending blank characters and then
+hash the results as $L$-component vectors. We must make sure that a~blank does not
+occur anywhere inside the string --- otherwise we get systematic collisions. The blank
+is usually encoded as character number~0, so the computation on blanks can be skipped
+altogether.
+
+For hashing the resulting $L$-tuples, we prefer polynomial-based rolling hashes
+over scalar product hashes, because scalar product hashes require a~random $L$-component
+vector of parameters.
+
 \exercises

 \ex{Show that the family of all constant functions from~$\cal U$ to~$[m]$
@@ -482,6 +543,11 @@ Is the modified family $c$-universal for some~$c$? Is it 2-independent?}

 \ex{Prove that the family ${\cal L}'$ is not 3-independent.}

+\ex{Analyze the expected time complexity of the Rabin-Karp algorithm.}
+
+\hint{Assign a~random variable to each position of the window, which will
+indicate a~false match (hash collision).}
+
 \endexercises

 \section{Cuckoo hashing}
@@ -560,7 +626,7 @@ the expected number of probes during an~operation is:
 }

 We will prove a~slightly weaker version of the first bound. (All restrictions
-can be removed at the expense of making the technical details more complicated.)
+can be removed at the expense of making the technical details more cumbersome.)

 \theorem{Let $m$ (table size) be a~power of two, $n\le m/3$ (the number of items),
 $h$~a~completely random hash function, and~$x$ an~item. Then the expected number

--- a/tex/adsmac.tex
+++ b/tex/adsmac.tex
@@ -178,10 +178,11 @@
 \def\sk#1{{\bf s}^{#1}}
 \def\ck#1{{\bf c}^{#1}}
 \def\ek#1{{\bf e}^{#1}}
+\def\a{{\bf a}}	% boldface vector a (was mistakenly expanding to bold t)
+\def\t{{\bf t}}
 \def\x{{\bf x}}
 \def\y{{\bf y}}
 \def\z{{\bf z}}
-\def\t{{\bf t}}
 \def\OO{{\bf\Omega}}

 % Transpozice matice