From 1b1eada3f6231b37bb5e76b68916c1263c9bff7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Mare=C4=8Dek?= <marecek@ufal.mff.cuni.cz>
Date: Tue, 9 May 2023 12:58:21 +0200
Subject: [PATCH] kgrams

---
 11-kgrams/cpp/Makefile          | 13 ++++++
 11-kgrams/cpp/kgrams.h          | 59 ++++++++++++++++++++++++
 11-kgrams/cpp/kgrams_test.cpp   | 79 +++++++++++++++++++++++++++++++++
 11-kgrams/cpp/test_main.cpp     | 43 ++++++++++++++++++
 11-kgrams/python/kgrams.py      | 43 ++++++++++++++++++
 11-kgrams/python/kgrams_test.py | 60 +++++++++++++++++++++++++
 11-kgrams/task.md               | 10 +++++
 7 files changed, 307 insertions(+)
 create mode 100644 11-kgrams/cpp/Makefile
 create mode 100644 11-kgrams/cpp/kgrams.h
 create mode 100644 11-kgrams/cpp/kgrams_test.cpp
 create mode 100644 11-kgrams/cpp/test_main.cpp
 create mode 100644 11-kgrams/python/kgrams.py
 create mode 100644 11-kgrams/python/kgrams_test.py
 create mode 100644 11-kgrams/task.md

diff --git a/11-kgrams/cpp/Makefile b/11-kgrams/cpp/Makefile
new file mode 100644
index 0000000..7046027
--- /dev/null
+++ b/11-kgrams/cpp/Makefile
@@ -0,0 +1,13 @@
+test: kgrams_test
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+kgrams_test: kgrams_test.cpp kgrams.h test_main.cpp
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+	rm -f kgrams_test
+
+.PHONY: clean test
diff --git a/11-kgrams/cpp/kgrams.h b/11-kgrams/cpp/kgrams.h
new file mode 100644
index 0000000..bdcd4e8
--- /dev/null
+++ b/11-kgrams/cpp/kgrams.h
@@ -0,0 +1,59 @@
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <functional>
+
+using namespace std;
+
+class SuffixArray {
+  public:
+    string text;                // The indexed string
+    int n;                      // Length of text
+    vector<int> S;              // Permutation which sorts suffixes (index n = empty suffix)
+    vector<int> R;              // Ranking array (an inverse of S)
+
+    // Construct suffix array and ranking array for the given string
+    // using the doubling algorithm.
+    SuffixArray(const string &orig_text)
+    {
+        text = orig_text;
+        n = text.size();
+        S.resize(n+1);
+        R.resize(n+1);
+
+        // Round 0: order by first byte (as unsigned). The empty suffix gets -1, so
+        // it is strictly the smallest even if text has '\0' or bytes >= 0x80.
+        auto first = [this](int a) { return (a < n) ? int((unsigned char) text[a]) : -1; };
+        sort_and_rank([first](int a, int b) -> bool { return first(a) < first(b); });
+
+        // Doubling: ranks by the first k characters yield ranks by the first 2k.
+        for (int k=1; k<n; k*=2) {
+            auto key = [this,k](int a) { return make_pair(R[a], (a+k < n) ? R[a+k] : -1); };
+            sort_and_rank([key](int a, int b) -> bool { return key(a) < key(b); });
+        }
+    }
+
+    // An auxiliary function used in the doubling algorithm: sorts S by comp and
+    // ranks each element by the position of the first element equal to it.
+    void sort_and_rank(function<bool(int a, int b)> comp)
+    {
+        for (size_t i=0; i<S.size(); i++)
+            S[i] = i;
+        sort(S.begin(), S.end(), comp);
+
+        vector<int> R2(S.size());       // Built aside: comp may still read R
+        for (size_t i=0; i<S.size(); i++) {
+            bool differs = !i || comp(S[i-1], S[i]) || comp(S[i], S[i-1]);
+            R2[S[i]] = differs ? int(i) : R2[S[i-1]];
+        }
+        R.swap(R2);
+    }
+
+    // Return the number of distinct k-grams in the string.
+    int num_kgrams(int k)
+    {
+        // TODO: Implement (placeholder: flowing off the end here would be undefined)
+        return -1;
+    }
+};
diff --git a/11-kgrams/cpp/kgrams_test.cpp b/11-kgrams/cpp/kgrams_test.cpp
new file mode 100644
index 0000000..bb90a32
--- /dev/null
+++ b/11-kgrams/cpp/kgrams_test.cpp
@@ -0,0 +1,79 @@
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <vector>
+#include <cstdint>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+void expect_failed(const string& message);
+
+#include "kgrams.h"
+
+// Build the suffix array of text and check its number of distinct k-grams.
+void test_generic(const string& text, int k, int expected_kg)
+{
+    SuffixArray index(text);
+    const int found = index.num_kgrams(k);
+    EXPECT(found == expected_kg, "Expected " + to_string(expected_kg) + " " + to_string(k) + "-grams, found " + to_string(found) + ".");
+}
+
+// Test on a fixed string (14 characters, so it has exactly one 14-gram).
+void test_explicit(int k, int expected_kg)
+{
+    test_generic(string("annbansbananas"), k, expected_kg);
+}
+
+// Test on a very non-uniform random string (letter table: 12 'a', 3 'b', 1 'c').
+void test_random(int n, int k, int expected_kg)
+{
+    string s(n, ' ');
+    uint32_t state = n;
+
+    for (int i=0; i<n; i++) {
+        // The 'u' suffix keeps the product in 32-bit unsigned arithmetic, which
+        // wraps mod 2^32; unsuffixed it is a signed 64-bit product that can overflow.
+        state = state*2654289733u + 7;
+        s[i] = "aaaaaaaaaaaabbbc"[(state >> 28) % 16];
+    }
+
+    test_generic(s, k, expected_kg);
+}
+
+// Test on an almost-constant string: n characters, all 'a' except for a
+// single 'b' at index n/2.
+void test_trivial(int n, int k, int expected_kg)
+{
+    // Start from the constant string and overwrite the middle character.
+    string almost_constant(n, 'a');
+
+    // The single 'b' goes to index n/2; an empty string has no such index,
+    // so it is left untouched.
+    if (n > 0)
+        almost_constant[n/2] = 'b';
+
+    test_generic(almost_constant, k, expected_kg);
+}
+
+vector<pair<string, function<void()>>> tests = {   // (name, test) pairs; last argument = expected count
+    {"basic-1",     [] { test_explicit(1, 4); }},
+    {"basic-2",     [] { test_explicit(2, 8); }},
+    {"basic-3",     [] { test_explicit(3, 10); }},
+    {"basic-4",     [] { test_explicit(4, 11); }},
+    {"basic-14",    [] { test_explicit(14, 1); }},
+    // Generated strings of length 1000: test_random(n, k, expected).
+    {"short-5",     [] { test_random(1000, 5, 107); }},
+    {"short-33",    [] { test_random(1000, 33, 968); }},
+    {"short-500",   [] { test_random(1000, 500, 501); }},
+    // Generated strings of length 100000.
+    {"long-5",      [] { test_random(100000, 5, 230); }},
+    {"long-33",     [] { test_random(100000, 33, 99767); }},
+    {"long-5000",   [] { test_random(100000, 5000, 95001); }},
+    // Almost-constant strings of length 1000000: test_trivial(n, k, expected).
+    {"triv-1",      [] { test_trivial(1000000, 1, 2); }},
+    {"triv-5",      [] { test_trivial(1000000, 5, 6); }},
+    {"triv-3333",   [] { test_trivial(1000000, 3333, 3334); }},
+    {"triv-500000", [] { test_trivial(1000000, 500000, 500001); }},
+};
diff --git a/11-kgrams/cpp/test_main.cpp b/11-kgrams/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/11-kgrams/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {  // Report a failed expectation and halt.
+    cerr << ("Test error: " + message) << endl;
+    exit(1);
+}
+
+// Run the tests named on the command line, or every registered test when
+// no names are given. Returns 1 as soon as a requested name is unknown.
+int main(int argc, char* argv[]) {
+    // Names of the tests to run, in execution order.
+    vector<string> wanted;
+    if (argc > 1)
+        wanted.assign(argv + 1, argv + argc);
+    else
+        for (const auto& entry : tests)
+            wanted.push_back(entry.first);
+
+    for (const string& name : wanted) {
+        // Linear search for the first registered test with this name.
+        auto match = tests.begin();
+        while (match != tests.end() && match->first != name)
+            ++match;
+
+        if (match == tests.end()) {
+            cerr << "Unknown test " << name << endl;
+            return 1;
+        }
+        cerr << "Running test " << name << endl;
+        match->second();
+    }
+
+    return 0;
+}
diff --git a/11-kgrams/python/kgrams.py b/11-kgrams/python/kgrams.py
new file mode 100644
index 0000000..0eefda3
--- /dev/null
+++ b/11-kgrams/python/kgrams.py
@@ -0,0 +1,43 @@
+class SuffixArray:
+    def __init__(self, text):
+        self.text = text
+        # S is the suffix array: S[i] is the start of the i-th smallest suffix.
+        # R is the ranking array (the inverse of S): R[S[i]] == i.
+        self.R, self.S = self._build_suffix_array(text)
+
+    def _build_suffix_array(self, text):
+        """
+        Build the ranking array and the suffix array of `text` with the
+        doubling algorithm and return them as a pair of tuples `(R, S)`.
+        Both have `len(text) + 1` entries: index `len(text)` stands for the
+        empty suffix, which sorts before every other suffix.
+        """
+        n = len(text)
+        S = list(range(n + 1))
+
+        # Round 0: rank suffixes by their first character (-1 for the empty one).
+        R = self._sort_and_rank(S, lambda a: ord(text[a]) if a < n else -1)
+
+        # Ranks by the first k characters give ranks by the first 2k characters.
+        k = 1
+        while k < n:
+            R = self._sort_and_rank(S, lambda a: (R[a], R[a + k] if a + k < n else -1))
+            k *= 2
+
+        return (tuple(R), tuple(S))
+
+    def _sort_and_rank(self, S, key):
+        """Reset S to the identity, sort it by `key`; return ranks where ties share the first position."""
+        S[:] = range(len(S))
+        S.sort(key=key)
+
+        R = [None] * len(S)
+        for i, s in enumerate(S):
+            # An entry whose key equals its predecessor's inherits that rank.
+            R[s] = R[S[i - 1]] if i > 0 and key(S[i - 1]) == key(s) else i
+        return R
+
+    def num_kgrams(self, k):
+        """Return the number of distinct k-grams in the string."""
+        raise NotImplementedError
+
diff --git a/11-kgrams/python/kgrams_test.py b/11-kgrams/python/kgrams_test.py
new file mode 100644
index 0000000..18646f4
--- /dev/null
+++ b/11-kgrams/python/kgrams_test.py
@@ -0,0 +1,60 @@
+#!/usr/bin/python
+
+import sys
+from kgrams import SuffixArray
+
+def test_generic(text, k, expected_count):
+    """Check that `text` has exactly `expected_count` distinct k-grams."""
+    found = SuffixArray(text).num_kgrams(k)
+    assert found == expected_count, (
+        "Expected %i %i-grams, found %i." % (expected_count, k, found))
+
+def test_explicit(k, expected_count):  # test on a fixed 14-character string
+    test_generic(text="annbansbananas", k=k, expected_count=expected_count)
+
+def test_random(n, k, expected_count):
+    """Test on a very non-uniform random string (letter table: 12 'a', 3 'b', 1 'c')."""
+    letters = "aaaaaaaaaaaabbbc"
+    buf = bytearray(n)
+    state = n
+    for pos in range(n):
+        # 32-bit linear congruential generator; its top four bits select the letter.
+        state = (state * 2654289733 + 7) & 0xFFFFFFFF
+        buf[pos] = ord(letters[state >> 28])
+    test_generic(buf.decode(), k, expected_count)
+
+def test_trivial(n, k, expected_count):  # almost-constant string: a single "b" among "a"s
+    s = ("a" * (n // 2) + "b" + "a" * (n - n // 2 - 1)) if n > 0 else ""
+    test_generic(s, k, expected_count)
+
+tests = [  # (name, test) pairs; the last argument of each call is the expected count
+    ("basic-1",     lambda: test_explicit(1, 4)),
+    ("basic-2",     lambda: test_explicit(2, 8)),
+    ("basic-3",     lambda: test_explicit(3, 10)),
+    ("basic-4",     lambda: test_explicit(4, 11)),
+    ("basic-14",    lambda: test_explicit(14, 1)),
+    # Generated strings of length 1000: test_random(n, k, expected_count).
+    ("short-5",     lambda: test_random(1000, 5, 107)),
+    ("short-33",    lambda: test_random(1000, 33, 968)),
+    ("short-500",   lambda: test_random(1000, 500, 501)),
+    # Generated strings of length 100000.
+    ("long-5",      lambda: test_random(100000, 5, 230)),
+    ("long-33",     lambda: test_random(100000, 33, 99767)),
+    ("long-5000",   lambda: test_random(100000, 5000, 95001)),
+    # Almost-constant strings of length 1000000: test_trivial(n, k, expected_count).
+    ("triv-1",      lambda: test_trivial(1000000, 1, 2)),
+    ("triv-5",      lambda: test_trivial(1000000, 5, 6)),
+    ("triv-3333",   lambda: test_trivial(1000000, 3333, 3334)),
+    ("triv-500000", lambda: test_trivial(1000000, 500000, 500001)),
+]
+
+if __name__ == "__main__":  # run the tests named on the command line, or all of them
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:  # no registered test has the requested name
+            raise ValueError("Unknown test {}".format(required_test))
+
diff --git a/11-kgrams/task.md b/11-kgrams/task.md
new file mode 100644
index 0000000..22800b4
--- /dev/null
+++ b/11-kgrams/task.md
@@ -0,0 +1,10 @@
+Your task is to write a function which takes a string and an integer K
+and reports how many different K-grams (K-character substrings) the
+string has.
+
+You are given an algorithm for construction of the suffix array. For
+simplicity, this algorithm has time complexity $O(n \log^2 n)$. Except
+for constructing the suffix array, your algorithm should run in linear
+time.
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
-- 
GitLab