diff --git a/Makefile b/Makefile index 1f5845a1525a5dbd5c153b3c610dc7bbabbd9359..9813328ecd9129f64e3419c1dea1c022122cbc24 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,8 @@ CHAPTERS= \ 06-hash \ 07-geom \ 08-string \ - vk-dynamic + vk-dynamic \ + fs-succinct chapters: for ch in $(CHAPTERS) ; do $(MAKE) -C $$ch pics ; done diff --git a/fs-succinct/Makefile b/fs-succinct/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..12eb9337f0550811280369256a69511a1867d34b --- /dev/null +++ b/fs-succinct/Makefile @@ -0,0 +1,12 @@ +TOP=.. +PICS=sole sole_boxes sole_hilevel mixer composition mixer_chain mixer_tree tree_shapes + + +include ../Makerules + +sole.pdf:: succinct_common.asy +sole_boxes.pdf:: succinct_common.asy +sole_hilevel.pdf:: succinct_common.asy +mixer_chain.pdf:: succinct_common.asy +mixer_tree.pdf:: succinct_common.asy +tree_shapes.pdf:: succinct_common.asy diff --git a/fs-succinct/composition.asy b/fs-succinct/composition.asy new file mode 100644 index 0000000000000000000000000000000000000000..ef955a47192a498e098b4ab9b2678ea292c851fd --- /dev/null +++ b/fs-succinct/composition.asy @@ -0,0 +1,24 @@ +//import ads; +//import flowchart; + +//draw(roundrectangle("f", (0,0))); +//draw(roundrectangle("g", (1,-1))); +//draw(roundrectangle("h", (-1,-2))); + +object f1 = draw("$g_1$", roundbox, (0,0), xmargin=0.5, ymargin=0.5); +object f2 = draw("$g_2$", roundbox, (1cm,-1cm), xmargin=0.5, ymargin=0.5); +object f3 = draw("$g_3$", roundbox, (-1cm,-1.5cm), xmargin=0.5, ymargin=0.5); + +// XXX this does not work when setting unitsize +draw(point(f1, SE) -- point(f2, NW), Arrow); +draw(point(f1, SW) -- point(f3, NE), Arrow); +draw(point(f2, W) -- point(f3, E), Arrow); + +draw(roundbox(bbox(), xmargin=0.35cm)); + +draw(point(f2, S) -- (xpart(point(f2, S)), -2.5cm), Arrow); +draw(point(f3, S) -- (xpart(point(f3, S)), -2.5cm), Arrow); +draw((xpart(point(f1, N)), 1cm) -- point(f1, N), Arrow); + +label("$g$", (xpart(min(currentpicture)), ypart(max(currentpicture))) + (0.25cm, -0.25cm)); + diff --git a/fs-succinct/mixer.asy b/fs-succinct/mixer.asy new file mode 100644 index 0000000000000000000000000000000000000000..d801fbac4b61a983ff4bc26819fcb3077b5bc81b --- /dev/null +++ b/fs-succinct/mixer.asy @@ -0,0 +1,15 @@ +import succinct_common; + +real r = 1.5; +real dist=2; +mixer(0,0,r); +draw((-dist,0)--(-r,0), e_arrow); +draw((r,0)--(dist,0), e_arrow); +draw((0,-r)--(0,-dist), e_arrow); +draw((0,dist)--(0,r), e_arrow); + +label((0, dist), "\vbox{\hbox{$x\in[X]$}\hbox{\eightrm (input)}}", N); +label((-dist, 0), "\vbox{\hbox{$y\in[Y]$}\hbox{\eightrm (carry in)}}", W); +label((dist, 0), "\vbox{\hbox{$s\in[S]$}\hbox{\eightrm (carry out)}}", E); +label((0, -dist), "\vbox{\hbox{$m\in[2^M]$}\hbox{\eightrm (output)}}", S); +label((0, 0), "$c\in [C]$"); diff --git a/fs-succinct/mixer_chain.asy b/fs-succinct/mixer_chain.asy new file mode 100644 index 0000000000000000000000000000000000000000..284942b6b3ba27d5216238e431ca04eb2c58f830 --- /dev/null +++ b/fs-succinct/mixer_chain.asy @@ -0,0 +1,27 @@ +import succinct_common; + +real mixgrid = 2.5; +int nmixers = 5; + +for (int i = 0; i < nmixers; ++i) { + real x = mixgrid * i; + if (i == 3) { + label((x,0), "$\cdots$"); + } else { + draw((x, 1.25) -- (x, 0.5), e_arrow); + label((x, 1.25), "$X$", N); + mixer(x, 0); + draw((x, -0.5) -- (x, -1.25), e_arrow); + label((x, -1.25), (i == 4) ? 
"$2^{M_n}$" : "$2^{M_"+((string)(i+1))+"}$", S); + } +} + +string[] alphas = {"", "Y_1", "Y_2", "Y_3", "Y_{N-1}", "Y_N"}; + +for (int i = 1; i < nmixers; ++i) { + carry_arrow((mixgrid * (i-1), 0), (mixgrid*i, 0), alphas[i]); +} + +pair endb = (mixgrid * (nmixers-1), 0) + (0.5,0); +draw(endb -- endb + (0.5,0) {E} .. {S} endb + (1.5,-1) -- endb + (1.5,-1.25), e_arrow); +label(endb + (1.5,-1.25), "$2^{M_{n+1}}$", S); diff --git a/fs-succinct/mixer_tree.asy b/fs-succinct/mixer_tree.asy new file mode 100644 index 0000000000000000000000000000000000000000..1bf91def0ff9300c459cfd9879f7fb013f44d3fd --- /dev/null +++ b/fs-succinct/mixer_tree.asy @@ -0,0 +1,31 @@ +import succinct_common; + +pair C = (0,0); +PENTAMIXER_R = 0.75; +pentamixer(C); +pm_arrow(C, A_IN, 1, lbl="\vbox{\hbox{$x\in[X]$}\hbox{\eightrm (input)}}"); +pm_arrow(C, A_CIN1, 1, lbl="\vbox{\hbox{$y\in[Y]$}\hbox{\eightrm (carry in 1)}}"); +pm_arrow(C, A_CIN2, 1, lbl="\vbox{\hbox{$z\in[Z]$}\hbox{\eightrm (carry in 2)}}"); +pm_arrow(C, A_OUT, 1, out=true, lbl="\vbox{\hbox{$m\in[2^M]$}\hbox{\eightrm (output)}}"); +pm_arrow(C, A_COUT, 1, out=true, lbl="\vbox{\hbox{$s\in[S]$}\hbox{\eightrm (carry out)}}"); + +PENTAMIXER_R = 0.5; + +pair m1 = (5, -1); +pair m2 = (9, -1); +pair m3 = (7, 1.5); +pair mix[] = {m1, m2, m3}; +pentamixer(m1); +pentamixer(m2); +pentamixer(m3); +draw(pm_dir(m1, A_COUT)--pm_dir(m3, A_CIN1), e_arrow); +draw(pm_dir(m2, A_COUT)--pm_dir(m3, A_CIN2), e_arrow); +for (int i = 0; i < 3; ++i) { + pm_arrow(mix[i], A_IN, 0.5, lbl="in"); + pm_arrow(mix[i], A_OUT, 0.5, out=true, lbl="out"); +} +for (int i = 0; i < 2; ++i) { + pm_arrow(mix[i], A_CIN1, 0.5, "$\vdots$"); + pm_arrow(mix[i], A_CIN2, 0.5, "$\vdots$"); +} +pm_arrow(m3, A_COUT, 0.5, out=true, "$\vdots$"); diff --git a/fs-succinct/sole.asy b/fs-succinct/sole.asy new file mode 100644 index 0000000000000000000000000000000000000000..8b16f0dcdf070309bbb0d5e947028307546bc0a2 --- /dev/null +++ b/fs-succinct/sole.asy @@ -0,0 +1,18 @@ +import succinct_common; +blocks(0 ... concat(array(6,"B"), new string[] {"...", "B", "EOF"})); +thruarrows(0,0,6); +thruarrows(0,7,2); +blocks(1 ... concat(array(6, "B+1"), array(1, "..."), array(2, "B+1")) ); +mixarrows(1,0,6); +mixarrow(1, 7); +block(1, 9, "0"); +blocks(2, "B", "B+3", "B-3", "B+6", "B-6", "B+9", "...", "B-i","B+j", "B-j"); +thruarrow(2, 0); +mixarrows(2, 1, 6); +mixarrow(2, 8); +thruarrow(1, 9); +blocks(3 ... concat(array(6,"B"), array(1, "..."), array(3, "B"))); + +passlabel(0, "Add EOF"); +passlabel(1, "Pass 1"); +passlabel(2, "Pass 2"); diff --git a/fs-succinct/sole_boxes.asy b/fs-succinct/sole_boxes.asy new file mode 100644 index 0000000000000000000000000000000000000000..3723f04df0d9fb962006b9ab5d638a33b4516908 --- /dev/null +++ b/fs-succinct/sole_boxes.asy @@ -0,0 +1,30 @@ +import succinct_common; + +void fillbox(int col, pen pen) { + path p = ( + (col*blockwidth, -rowheight-blockheight) + -- ((col+2)*blockwidth, -rowheight-blockheight) + -- ((col+2)*blockwidth, -2*rowheight) + -- ((col+1)*blockwidth, -2*rowheight) + -- ((col+1)*blockwidth, -3*rowheight) + -- ((col-1)*blockwidth, -3*rowheight) + -- ((col-1)*blockwidth, -2*rowheight-blockheight) + -- ((col)*blockwidth, -2*rowheight-blockheight) + -- cycle + ); + fill(p, pen); + draw(p, thick); +} + +fillbox(2, 0.5*white); +fillbox(4, 0.75*white); +fillbox(6, 0.5*white); + +//blocks(0 ... concat(array(8,"B"), new string[] {"..."})); +//thruarrows(0,0,6); +blocks(1 ... 
concat(array(8, "B+1"), array(1, "...")) ); +mixarrows(1,0,8); +blocks(2, "B", "B+3", "B-3", "B+6", "B-6", "B+9", "B-9", "..."); +thruarrow(2, 0); +mixarrows(2, 1, 6); +blocks(3 ... concat(array(7,"B"), array(1, "..."))); diff --git a/fs-succinct/sole_hilevel.asy b/fs-succinct/sole_hilevel.asy new file mode 100644 index 0000000000000000000000000000000000000000..47bc193e9776e7fcc1dbf95ad26f2a16f60b2ac1 --- /dev/null +++ b/fs-succinct/sole_hilevel.asy @@ -0,0 +1,19 @@ +import succinct_common; + +real mixgrid = 2.5; +int nmixers = 3; + +for (int i = 0; i < nmixers; ++i) { + real x = mixgrid * i; + draw((x, 1.25) -- (x, 0.5), e_arrow); + label((x, 1.25), "$(B+1)^2$", N); + mixer(x, 0); + draw((x, -0.5) -- (x, -1.25), e_arrow); + label((x, -1.25), "$B^2$", S); +} + +string[] alphas = {"B+3", "B+6", "B+9", "B+12"}; + +for (int i = 0; i < nmixers+1; ++i) { + carry_arrow((mixgrid * (i-1), 0), (mixgrid*i, 0), alphas[i]); +} diff --git a/fs-succinct/succinct.tex b/fs-succinct/succinct.tex new file mode 100644 index 0000000000000000000000000000000000000000..b87d93fe910b32974a3f6e82cd3f75e6fcb7c09f --- /dev/null +++ b/fs-succinct/succinct.tex @@ -0,0 +1,470 @@ +\ifx\chapter\undefined +\input adsmac.tex +\singlechapter{50} +\fi +\input tabto.tex + +\chapter[succinct]{Space-efficient data structures} + +In this chapter, we will explore space-efficient data structures. This may +sound like a boring topic at first -- after all, many of the commonly-used data +sctructures have linear space complexity, which is asymptotically optimal. +However, in this chapter, we shall use a much more fine-grained notion of space +efficiency and measure space requirements in bits. + +Imagine we have a data structure whose size is parametrized by some parameter +$n$ (e.g. number of elements). Let us define $X(n)$ as the universe of all possible +values that a size-$n$ data structure (as a whole) can hold. For example if we +have a data structure for storing strings from a fixed alphabet, $X(n)$ may be the +universe of all length-$n$ strings from this alphabet. + +Let us denote $s(n)$ the number of bits needed to store a size-$n$ data structure. +The information-theoretical optimum is $OPT(n) := \lceil\log |X(n)|\rceil$ +(which is essentially the entropy of a uniform distribution over $X(n)$). + +Note: We will always ignore constant additive factors, so sometimes we will use the +definition $OPT(n) := \log |X(n)|$ (without rounding, differs by at most one from +the original definition) interchangably. + +\defn{{\I Redundancy} of a space-efficient data structure is $r(n) := s(n) - OPT(n)$.} + +Now we can define three classes of data structures based on their fine-grained space +efficiency: + +\defn{A data structure is +\tightlist{o} +\:{\I implicit} when $s(n) \le OPT(n) + \O(1)$,\tabto{7.6cm}i.e., $r(n) = \O(1)$, +\:{\I succinct} when $s(n) \le OPT(n) + {\rm o}(OPT(n))$,\tabto{7.6cm}i.e., $r(n) = {\rm o}(OPT(n))$, +\:{\I compact} when $s(n) \le \O(OPT(n))$. +\endlist +} + +A typical implicit data structure contains just its elements in some order and nothing more. +Examples include sorted arrays and heaps. + +Note that some linear-space data structures are not even compact -- because we +are counting bits now, not words. For example, a linked list representing a +length-$n$ sequence of numbers from range $[m]$ needs $\O(n (\log n + \log m))$ +bits ($\log n$ bits are used to represent a next-pointer), whereas $OPT$ is $n +\log m$. For $n \gg m$, this does not satisfy the requirements for a compact +data structure. 
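+
+As a concrete (purely illustrative) example, take $n = 2^{20}$ numbers from the
+range $[m]$ with $m = 256$. The optimum is $OPT(n) = n \log m = 2^{23}$ bits,
+i.e., 1~MB. A linked list additionally stores a next-pointer of $\log n = 20$ bits
+per element, so it needs at least $n(\log n + \log m) = 28 \cdot 2^{20}$ bits,
+i.e., 3.5~MB -- and the ratio $(\log n + \log m)/\log m$ grows with $n$, so no
+constant multiple of $OPT(n)$ bounds the space.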
+ +And of course, as with any data structure, we want to be able to perform reasonably +fast operations on these space-efficient data structures. + +\section{Representation of strings over arbitrary alphabet} + +Let us consider the problem of representing a length-$n$ string over alphabet $[m]$, +for example a string of base-10 digits. The following two naive approaches immediately +come to mind: + +\list{(a)} +\: Consider the whole string as one base-10 number and convert that number into binary. + This achieves the information-theoretically optimum size of $OPT(n) = \lceil n \log 10 \rceil + \approx 3.32n = \Theta(n+1)$. However, this representation does not support local decoding and + modification -- you must always decode and re-encode the whole string. +\: Store the string digit-by-digit. This uses space $n \lceil \log 10 \rceil = 4n = OPT(n) + \Theta(n)$. + For a fixed alphabet size, this is not succinct because $\Theta(n) > o(OPT(n)) = o(n + 1)$\foot{More + formally, if we consider $\O$ and $o$ to be sets of functions, $\Theta(n) \cap o(n + 1) = \emptyset$.}. + However, we get constant-time local decoding and modification for free. +\endlist + +We would like to get the best of both worlds -- achieve close-to-optimum space +requirements while also supporting constant-time local decoding and modification. + +A simple solution that may work in practice is to encode the digits in groups +(e.g. encode each 2 subsequent digits into one number from the range [100] and +convert that number to binary). + +With groups of size $k$, we get $$s(n) = \lceil n/k \rceil \lceil k \log 10 +\rceil \le (n/k + 1)(k \log 10 + 1) = \underbrace{n \log 10}_{OPT(n)} + n/k + +\underbrace{k\log 10 + 1}_{\O(1)}.$$ Thus we see that with increasing $k$, +redundancy goes down, approaching the optimum but never quite reaching it. For a +fixed $k$ it is still linear and thus our scheme is not succinct. Also, with +increasing $k$, local access time goes up. In practice, however, one could +chose a good-compromise value for $k$ and happily use such a scheme. + +We will develop a succinct encoding scheme later in this chapter. + +\section{Intermezzo: Prefix-free encoding of bit strings} + +Let us forget about arbitrary alphabets for a moment and consider a different +problem. We want to encode a binary string of arbitrary length in a way that +allows the decoder to determine when the string ends (it can be followed by +arbitrary other data). Furthermore, we want this to be a streaming encoding +-- i.e., encode the string piece by piece while it is being read from the input. +The length of the string is not known in advance -- it will only be determined +when the input reaches its end\foot{If the length were known in advance, we could +simply store the length using any simple variable-size number encoding, followed by the +string data itself. This would give us $\O(\log n)$ redundancy almost for free.} + +A trivial solution might be to split the string into $b$-bit blocks and encode +each of them into a $(b+1)$-bit block with a simple padding scheme: +\tightlist{o} +\: For a complete block, output its $b$ data bits followed by a one. +\: For an incomplete final block, output its data bits, followed by a one + and then as many zeros as needed to reach $b+1$ bits. +\: If the final block is complete (input length is divisible by $b$), we must + add an extra padding-only block (one followed by $b$ zeros) to signal the + end of the string. 
+\endlist + +The redundancy of such encoding is at most $n/b + b + 1$ (one bit per block, +$b+1$ for extra padding block). For a fixed $b$, this is $\Theta(n)$, so the +scheme is not succinct. + +\subsection{SOLE (Short-Odd-Long-Even) Encoding} + +In this section we will present a more advanced prefix-free string encoding +that will be succinct. + +First, we split the input into $b$-bit blocks. We will add a padding in the +form of $10\cdots0$ at the end of the last block to make it $b$ bits long. +If the last block was complete, we must add an extra padding-only block to +make the padding scheme reversible. + +Now we will consider each block as a single character from the alphabet $[B]$, +where $B:=2^b$. Then we shall extend this alphabet by adding a special EOF +character. We will add this character at the end of encoding. This gives us +a new string from the alphabet $[B+1]$ that has length at most $n/b + 2$ +($+1$ for padding, $+1$ for added EOF character). + +However, as $B+1$ is not a power of two, now we have a question of how to +encode this string. Note that this is a special case of the problem stated +above, i.e. encoding a string from an arbitrary alphabet. We will try to solve +this special case as a warm-up and then move on to a fully general solution. + +First, we need to introduce a new concept: re-encoding character pairs into +different alphabets. Let's assume for example, that we have two characters from +alphabets [11] and [8], respectivelly. We can turn them into one character from +the alphabet [88] (by the simple transformation of $8x + y$). We can then split +that character again into two in a different way. For example into two characters +from alphabets [9] and [10]. This can be accomplished by simple division with +remainder: if the original character is $z\in [88]$, we transform in into +$\lfloor z / 10\rfloor$ and $(z \bmod 10)$. For example, if we start +with the characters 6 and 5, they first get combined to form $6\cdot 8 + 5 = 53$ +and then split into 5 and 3. + +We can think of these two steps as a single transformation that takes +two characters from alphabets [11] and [8] and transforms them into +two characters from alphabets [9] and [10]. More generally, we can +always transform a pair of characters from alphabets $[A]$ and $[B]$ +into a pair from alphabets $[C]$ and $[D]$ as long as $C\cdot D +\ge A \cdot B$ (we need an output universe large enough to hold all +possible input combinations). + +We will use this kind of alphabet re-encoding by pairs heavily in the SOLE +encoding. The best way to explain the exact scheme is with a diagram (fig. \figref{sole}). + +\figure[sole]{sole.pdf}{}{SOLE alphabet re-encoding scheme} + +There are two re-encoding phases. The first transforms blocks with alphabet +$[B+1]$ into blocks with variable alphabet sizes (of the form of alternating +$[B+3k]$, $[B-3k]$). This is the origin of the name: after this pass, +odd-numbered blocks have smaller alphabets than even-numbered ones. The second +pass runs phase shifted by one block and converts the variable-alphabet blocks +into blocks with alphabet $[B]$. + +What is the redundancy of this scheme? Let us count how the number of blocks +increases throughout the encoding passes: +\tightlist{o} +\: If the original length was a multiple of $b$, we must add one block to complete padding. +\: We always add one block with EOF character. +\: Before the first pass, we may need to add an extra padding block to make number of blocks even + (not shown in fig. \figref{sole}). 
+\: Before the second pass, we always add an extra padding block to make number of blocks odd. +\endlist + +In total, we add at most 4 blocks. Thus $r(n) \le 4b$. + +For the scheme to work, we need to set $b \ge 2\log n + 2$ (so +$B \ge 4n^2$). This gives us redundancy $r(n) = \O(\log n)$. Thus +we have a succinct scheme. Also, one block fits into $\O(1)$ words +on a RAM, so we can do constant-time arithmetic on the blocks. + +Note that this representation is locally decodable and modifiable -- each input +block affects at most 4 output blocks. + +Now we must check that all the alphabet transformations are valid, i.e., the +output universe of each transformation is always at least as big as the input +universe. + +For the first pass, we want: +$$\eqalign{ +(B+1)^2 &\le (B-3i)(B+3i+3)\cr +B^2 + 2B + 1 &\le B^2 + 3B - 9i^2 - 9i\cr +B &\ge 9i^2 + 9i + 1\cr +}$$ +We know $B \ge 4n^2$ and $i \le {n+1\over 2}$. By plugging $i = {n+1\over 2}$ +and $B=4n^2$ and doing some algebraic manipulation, we can verify that the +inequality holds. For smaller $i$ the right-hand side decreases so it holds +for those too. + +For the second pass, this is trivial, as $(B+i)(B-i) = B^2 - i^2 \le B^2$. + +\section{Mixers as a building block for succinct structures} + + +\subsection{A reinterpretation of the SOLE encoding} + +There is another way of looking at the SOLE encoding from the previous section. +We can group the alphabet translations into ``encoding boxes'' that take input +from the alphabet $(B+1)^2$, output the alphabet $B^2$ and the part of the +information that did not fit into the output is passed as a +``carry''\foot{Sometimes the alternative term {\it spill} is used instead.} to +the next encoding box (similarly to how carrying works when doing addition). +See fig. \figref{sole_boxes}. We will also call these boxes {\it mixers}. + +\figure[sole_boxes]{sole_boxes.pdf}{}{SOLE interpreted as a chain of mixers} + +The start and end of the encoding are irregular, but we will ignore that for now. +An important property of these boxes is that outgoing carry does not depend on incoming +carry (unlike in addition). This allows for local decoding and modification. Otherwise +a single input change could affect the whole output. Now we can describe this scheme +in a more abstract, high-level way (fig. \figref{sole_hilevel}). + +\figure[sole_hilevel]{sole_hilevel.pdf}{}{SOLE high-level mixer diagram} + +In our case, the input alphabet size is always $(B+1)^2$, the output alphabet size +is $B^2$ and the carry alphabet sizes form the sequence $B+3i$. Given that the output +alphabet is smaller than the input alphabet, it makes sense that the carry alphabet +has to increase in size to accomodate the accumulating information that did not fit +into the output. The final carry is then used to output some extra blocks at the end. + +\subsection{Generalizing the mixer concept} + +\figure[mixer]{mixer.pdf}{}{General structure of a mixer} + +At a high level, a mixer can be thought of as a mapping $f: [X]\times[Y] \rightarrow [2^M]\times[S]$ +with the property that when $(m,s) = f(x,y)$, $s$ depends only on $x$. This is the key property +that allows local decoding and modification because carry does not cascade. + +Internally, the a mixer is +always implemented as a composition of two mappings, $f_1$ that transforms $x \rightarrow +(c,s)$ and $f_2$ +that transforms $(y,c) \rightarrow m$. See fig. \figref{mixer}. Both $f_1$ and $f_2$ must be injective +so that the encoding is reversible. 
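+
+As a small illustrative example (parameters chosen by hand), let $X = 100$, $Y = 3$,
+$M = 5$ and take $C = 10$, $S = 10$, with $f_1(x) = (\lfloor x/10 \rfloor, x \bmod 10)$
+and $f_2(y,c) = 3c + y \in [30] \subseteq [2^5]$. For $x = 57$ and $y = 2$ this gives
+$f_1(57) = (c,s) = (5,7)$ and $m = f_2(2,5) = 17$, i.e., $f(57,2) = (17,7)$. Decoding
+runs backwards: $c = \lfloor 17/3 \rfloor = 5$, $y = 17 \bmod 3 = 2$,
+$x = 10 \cdot 5 + 7 = 57$. Note that the carry out $s = 7$ depends only on $x$,
+as required.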
+ +The mappings $f_1$ and $f_2$ themselves are trivial alphabet translations similar to what we +used in the SOLE encoding. You can for example use $f_1(x) = (\lceil x/S \rceil, x \bmod S)$ +and $f_2(y,c) = c\cdot Y + y$. + +Thus implementing the mixer is simple as long as the parameters allow its existence. A mixer +with parameters $X$, $Y$, $S$, $M$ can exist if and only if there exists $C$ such that +$S\cdot C \ge X$ and $C\cdot Y \le 2^M$ (once again, the alphabet translations need their +range to be as large as their domain in order to work). + +\lemma{ +A mixer $f$ has the following properties (as long as all inputs and outputs fit into a constant +number of words): +\tightlist{o} +\: $f$ can be computed on a RAM in constant time +\: $s$ depends only on $x$, not $y$ +\: $x$ can be decoded given $m$, $s$ in constant time +\: $y$ can be decoded given $m$ in constant time +\endlist +} +All these properties should be evident from the construction. + +\defn{The redundancy of a mixer is $$r(f) := \underbrace{M + \log S}_{\hbox{output entropy}} - \quad \underbrace{(\log X + \log Y)}_{\hbox{input entropy}}.$$ +In general, the redundancy of a mapping (with possibly multiple inputs and multiple outputs) is the sum of the logs of the +output alphabet size, minus the sum of the logs of the input alphabet sizes. Note that there is no rounding (because the inputs and +outputs can be from arbitrary alphabets, not necessarily binary) and the redundancy can be non-integer. Compare this to the concept +of redundancy for space-efficient datastructures defined above.} + +\subsection{On the existence of certain kinds of mixers} + +Now we would like to show that mixers with certain parameters do exist. + +\lemma{For $X,Y$ there exists a mixer $f: [X]\times[Y] \rightarrow [2^M]\times[S]$ +such that: +\tightlist{o} +\: $S = \O(\sqrt{X})$, $2^M = \O(Y\cdot\sqrt{X})$ +\: $r(f) = \O(1/\sqrt{X})$ +\endlist +} +\proof{ +First, let's assume we have chosen an $M$ (which we shall do later). Then we +want to set $C$ so that it satisfies the inequality $C \cdot Y \le 2^M$. Basically +we are asking the question how much information can we fit in $m$ in addition to +the whole of $y$. Clearly we want $C$ to be as high as possibly, thus we set +$C := \lfloor 2^M / Y \rfloor$. + +Now let us calculate the redundancy. First we shall note that we can compute redundancy +for $f_1$ and $f_2$ separately and add them up: +$$\eqalign{r(f) &= M + \log S - \log X - \log Y \cr +&= \left(M - \log C - \log Y\right) + \left(\log C + \log S - \log X\right)\cr +&= r(f_2) + r(f_1)}$$ +} +This is just a telescopic sum. It works similarly for more complex mapping compositions: +as long as each intermediate result is used only once as an input to another mapping, you +can just sum the redundancies of all the mappings involved. + +\figure[composition]{composition.pdf}{}{Mapping composition} +For example, if you have a mapping composition as in fig. \figref{composition}, +you can easily see $r(g) = r(g_1) + r(g_2) + r(g_3)$. For every edge fully inside +the composition, the same number is added once and subtracted once. 
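+
+Before bounding these redundancies in general, it may help to see concrete numbers:
+for the illustrative mixer above ($X = 100$, $Y = 3$, $C = 10$, $S = 10$, $M = 5$) we
+get $r(f_1) = \log 10 + \log 10 - \log 100 = 0$ and
+$r(f_2) = 5 - \log 10 - \log 3 = 5 - \log 30 \approx 0.093$, so the whole mixer
+wastes less than a tenth of a bit.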
+ +First, we shall estimate $r(f_2)$: +$$\eqalign{r(f_2) &= M - \log(Y\cdot C)= M - \log(\overbrace{Y\cdot \lfloor 2^M / Y \rfloor}^{\ge 2^M - Y})\cr +r(f_2) &\le M - \log(2^M-Y)= \log{2^M\over 2^M-Y} = \log{1 \over 1-{Y \over 2^M}}}$$ +Now we shall use a well-known inequality form analysis: +$$\eqalign{ +e^x &\ge 1+x\cr +x &\ge \log(1+x)\cr +-x &\le \log{1 \over 1+x}}$$ +By substituting $x \rightarrow -x$ we get: +$$x \ge \log{1 \over 1-x}$$ +Thus +$$r(f_2) \le {Y\over 2^M} = \O\left({1 \over C}\right)$$ + +Now to $r(f_1)$: +$$\eqalign{ +r(f_1) &= \log C + \log S - \log X = \log C + \log \left\lceil {X\over C}\right\rceil - \log X += \log\left({C\left\lceil{X \over C}\right\rceil \over X}\right)\cr +r(f_1) &\le \log\left({X+C \over X}\right) = \log\left(1 + {C\over X}\right) \le {C \over X}\qquad\hbox{(because $\log(x) \le x-1$)} +}$$ + +Putting this together: +$$r(f) = r(f_1) + r(f_2) \le \O\left({1 \over C} + {C \over X}\right)$$ + +In order to minimize this sum, we should set $C = \Theta\left(\sqrt{X}\right)$. Then +$r(f) = \O\left({1/\sqrt{X}}\right)$ and $S = \left\lceil{X \over \Theta(\sqrt{X})}\right\rceil = \Theta\left(\sqrt{X}\right)$ +as promised. Note that this holds for any value of $Y$. + +However, we cannot freely set $C$, as we have already decided that $C := \lfloor 2^M / Y \rfloor$. +Instead, we need to set a value for $M$ that gives us the right $C$. + +The whole mixer parameter selection process could be as follows +(it may be useful to refer back to fig. \figref{mixer}): +\tightlist{n.} +\: We are given $X$, $Y$ as parameters. +\: Set $M := \left\lceil\log\left(Y\sqrt{X}\right)\right\rceil$. +\: Set $C := \left\lfloor 2^M / Y \right\rfloor$. This ensures that $2^M \ge C\cdot Y$ and gives us $C = \Theta\left(\sqrt{X}\right)$. +\: Set $S := \left\lceil X / C \right\rceil$. This ensures that $C\cdot S \ge X$ and gives us $S = \Theta\left(\sqrt{X}\right)$. +\endlist +All the inequalities required for mixer existence are satisfied and based on the analysis +above the parameters satisfy what our lemma promised. +\qed +\section{Succinct representation of arbitrary-alphabet strings} + +\subsection{A naive first try} + +We would like to use mixers to encode string from an arbitrary alphabet into +the binary alphabet. Let's assume we have a string $A \in [\Sigma]^n$. +We shall split it into some blocks of size $k$, which gives us a block alphabet +$[X] = [\Sigma^k]$. Then we could use a mixer chain as in fig. \figref{mixer_chain}, +similar to what we did in the SOLE encoding. + +\figure[mixer_chain]{mixer_chain.pdf}{}{Mixer chain for string encoding} + +The intuition behind this is simple: whatever part of $X$ did not fit into a +whole number of bits is sent on as carry and whenever a whole extra bit of +information has accumulated in the chain, it can be output. The final carry +is output at the end using the neccessary number of bits. Here we don't mind +rounding because it is an additive constant. + +Everything is also locally decodable and modifiable -- to decode $i$-th input +block, you only need $i$-th and $(i+1)$-st output blocks. And vice versa, you +only need modify these two output blocks after changing the $i$-th input block. + +Now we just need to set $k$ and calculate redundancy. It will be useful to +set $k \approx 2\log_\Sigma n$. Then $X \approx n^2$ and by previous lemmas, +$Y_i \in \O(n)$ and redundancy of the mixers is $\O(1/n)$. As there is less +than $n$ mixers, the total redundancy is $\O(1)$. + +That all sounds wonderful. 
However, there is one serious problem. Each of the +mixers will have differrent parameters ($Y_i$, $M_i$, $S_i=Y_{i+1}$). In order +to compute the parameters for $i$-th mixer, we need to know the parameters for +the $(i - 1)$-st, namely the $Y_i=S_{i-1}$. For that, we need the $(i-2)$-nd and +so on... + +If we did encoding / decoding in a streaming fashion, this would not matter -- +we could compute the mixer parameters one by one as we go. +But if we wish for random access in constant time, we would need to store a table +of all the mixer parameters -- i.e., a table with $\Theta(n/\log_\Sigma n)$ rows. +That is impractical. + +Note that this was not an issue for sole as there the $Y_i$'s formed an arithmetic +sequence. They weren't even the optimal $Y_i$'s that would be created by the generic +mixer construction but a close enough approximation that still yielded good results, +up to an additive constant. That was a special case -- in general, we do now know +how to approximate the mixer parameters by something easier to compute locally. + +\subsection{A tree encoding to the rescue} + +To remedy the situation, instead of a chain, we will organize mixers into a +binary tree. Each vertex will contain one mixer whose carry output goes to its +parent (thus most vertices receive two carry inputs but it is trivial to +combine them into one). This is depicted in fig. \figref{mixer_tree}. Now we +need $Y\cdot Z \cdot C \le 2^M$. + +\figure[mixer_tree]{mixer_tree.pdf}{}{A single mixer vertex and the organization of those into a tree} + +Then you can create a linear order on the vertices (e.g. by layers +bottom-to-top), split the input string into blocks and feed the blocks through +the mixer vertices in this order and save the corresponding outputs in the same +order. + +Note that this scheme still has all the nice properties, for example it is locally +decodable. To decode a vertex's input, you only need the output of that vertex and +its parent. + +But how does a tree help us determine individual mixer parameters more easily? +The parameters of a mixer in a vertex are uniquely determined by the shape of the +subtree under that vertex. This is easily seen by induction: all leaves have the +same parameters (as they have dummy carry-in alphabets of size 1) and the parameters +of any vertex are determined by the parameters of its children. + +We will use the same tree shape as for binary heaps: all the levels are full, except +for possibly the last and in the last level all the vertices in one contiguous segement +starting at the very left. + +Now let us consider a level at height $h$ (from the bottom). There are at most three +three vertex types by subtree shape and they appear on the level in a specific order: +\tightlist{n.} +\: a contiguous segment of vertices with full subtrees of height $h$ (type A) +\: one vertex with an irregular subtree (type B) +\: a contiguous segment of vertices with full subtrees of height $h-1$ (type C) +\endlist +See fig. \figref{tree_shapes}. If the last level happens to be full, there are only +type-A vertices. + +\figure[tree_shapes]{tree_shapes.pdf}{}{Vertex types by subtree shape} + +Thus, for each level and each vertex type, it is sufficient to remember: +\tightlist{o} +\: Number of vertices of this type on this level. From this, we can easily determine + vertex type from its index by simple comparison. +\: Mixer parameters. +\: Starting address of the output of first vertex of this type in the output stream. 
+ From this, we can easily compute starting address of any vertex by simple addition + and multiplication as all vertices of a given type on a given level have the same + number of output bits (parameter $M$). This will be useful for local decoding. +\endlist +This a precomputed table of $\O(\log n)$ words. + +Block size and redundancy computation is exactly the same as in the chain case and +we still get $\O(1)$ redundancy. The chain can be thought of as a degenerate case +of the tree construction where the tree has the shape of a path (and thus all subtrees +have distinct shapes and distinct mixer parameters). + +Local decoding of $i$-th input block could be done as follows: +\tightlist{o} +\: Convert block index into a position in the tree (level + index on level) +\: Determine the vertex type and mixer parameters, compute position in output stream and extract the + corresponding output $m \in 2^M$ +\: Do the same for the parent vertex +\: Using the parent mixer, decode the carry going up from our vertex +\: Using our mixer, decode the original input block from our output and carry +\endlist +Local modification can be done in a similar fashion the other way around. Both take +$\O(1)$ time on RAM. + +\theorem{ +On a Word-RAM, we can represent a string $A \in [\Sigma]^n$ in space $\lceil n \log \Sigma \rceil + \O(1)$ bits, +with random-access element read and write operations in $\O(1)$ time, using a precomputed table of +$\O(\log n)$ constants dependent on $n$ and $\Sigma$. +} + +\endchapter diff --git a/fs-succinct/succinct_common.asy b/fs-succinct/succinct_common.asy new file mode 100644 index 0000000000000000000000000000000000000000..974355bbe55542f7385b968d8e9f2c8b8cc3d11d --- /dev/null +++ b/fs-succinct/succinct_common.asy @@ -0,0 +1,98 @@ +import ads; + +real blockwidth = 1; +real blockheight = 0.5; +real arrowheight = 1; +real rowheight = blockheight + arrowheight; + + +void mixarrow(int row, int col) { + real x = col * blockwidth; + real y = -row * rowheight - blockheight; + path arr1 = (x + blockwidth/2, y) {S} .. {S} (x + blockwidth, y - arrowheight / 2) {S} .. {S} (x + 1.5*blockwidth, y - arrowheight); + path arr2 = reflect((x + blockwidth, 0), (x+blockwidth, 1)) * arr1; + draw(arr1, Arrow); + draw(arr2, Arrow); +} + +void thruarrow(int row, int col) { + real x = col * blockwidth; + real y = -row * rowheight - blockheight; + draw((x+blockwidth/2, y)--(x+blockwidth/2, y-arrowheight), Arrow); + +} + +void block(int row, int col, string alphabet) { + real xbase = col * blockwidth; + real ybase = -row * rowheight; + if (alphabet == "...") { + label("$\cdots$", (xbase+blockwidth/2, ybase-blockheight/2), (0,0)); + + } else { + if (alphabet != "EOF" && alphabet != "0") + draw((xbase,ybase)--(xbase+blockwidth, ybase)--(xbase+blockwidth, ybase-blockheight)--(xbase, ybase-blockheight)--cycle); + label("$"+alphabet+"$", (xbase+blockwidth/2, ybase-blockheight/2), (0,0)); + } +} + +void blocks(int row ... string alphabets[]) { + for (int i = 0; i < alphabets.length; ++i) { + block(row, i, alphabets[i]); + } +} + +void thruarrows(int row, int col, int cnt) { + for (int i = 0; i < cnt; ++i) + thruarrow(row, col+i); +} + +void mixarrows(int row, int col, int cnt) { + for (int i = 0; i < cnt; i += 2) + mixarrow(row, col+i); +} + +void passlabel(int row, string lbl) { + label("{\it " + lbl + "}", (-1, -row*rowheight-blockheight - arrowheight/2), W); +} + +void mixer(real x, real y, real r=0.5) { + draw((x-r,y-r) -- (x+r,y-r)--(x+r, y+r) -- (x-r, y+r) -- cycle, halfthick); + draw( (x,y+r) {S} .. 
{E} (x+r,y), 0.5*white); + draw( (x,y+r) -- (x,y-r), 0.5*white); + draw( (x-r,y) {E} .. {S} (x,y-r), 0.5*white); +} + +real PENTAMIXER_R = 1; + +pair pm_dir(pair c, int angle, real r=PENTAMIXER_R) { + return shift(c)*scale(r)*dir(angle); +} +int A_IN = 180, A_OUT=0, A_COUT=90, A_CIN1=240, A_CIN2=300; +void pentamixer(pair c, real r=PENTAMIXER_R) { + path unitcircle=E..N..W..S..cycle; + pair d(int angle) { + return pm_dir(c, angle, r); + } + draw(shift(c)*scale(r)*unitcircle); + draw( d(A_IN)--d(A_OUT), 0.5*white); + draw( d(A_IN){E}..{N}d(A_COUT), 0.5*white); + draw( d(A_CIN1){dir(60)}..{E}d(A_OUT), 0.5*white); + draw( d(A_CIN2){dir(120)}..{E}d(A_OUT), 0.5*white); +} + + +void pm_arrow(pair c, int angle, real length, bool out=false, real r=PENTAMIXER_R, string lbl="") { + pair p1 = pm_dir(c, angle, r); + pair p2 = pm_dir(c, angle, r+length); + if (out) + draw(p1--p2, e_arrow); + else + draw(p2--p1, e_arrow); + if (lbl != "") + label(lbl, p2, dir(angle)); +} + +void carry_arrow(pair mix1, pair mix2, string alphabet) { + draw(mix1 + (0.5,0) -- mix2 - (0.5, 0), e_arrow); + label((mix1+mix2)/2, "$"+alphabet+"$", N); +} diff --git a/fs-succinct/tree_shapes.asy b/fs-succinct/tree_shapes.asy new file mode 100644 index 0000000000000000000000000000000000000000..d50059bead808bf81fede9d50b29040aece32a0f --- /dev/null +++ b/fs-succinct/tree_shapes.asy @@ -0,0 +1,21 @@ +import succinct_common; + + +draw((-3,-2) -- (3,-2), 0.3*white); +draw((-3.2,-4)--(-3.2,-2), Arrows); +label((-3.2, -3), "$h$", W); +draw((3.2,-3.5)--(3.2,-2), Arrows); +label((3.2, -2.75), "$h-1$", E); + +void subtree(path p) { + filldraw(p, 0.65*white); +} + +subtree((-1.75, -4)--(-0.75,-4)--(-1.25,-2)--cycle); +subtree((-0.5, -4)--(0,-4)--(0,-3.5)--(0.5,-3.5)--(0,-2)--cycle); +subtree((1.75, -3.5)--(0.75,-3.5)--(1.25,-2)--cycle); +label((-1.25, -3), "A"); +label((-0, -3), "B"); +label((1.25, -3), "C"); + +draw((-3, -4) -- (0,-4) -- (0,-3.5) -- (3,-3.5) -- (0, 0) -- cycle, halfthick);