diff --git a/11-kgrams/cpp/Makefile b/11-kgrams/cpp/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..7046027a2bf4c6dcd8902493bb9e24568db1efd0
--- /dev/null
+++ b/11-kgrams/cpp/Makefile
@@ -0,0 +1,13 @@
+test: kgrams_test
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+kgrams_test: kgrams_test.cpp kgrams.h test_main.cpp
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+	rm -f kgrams_test
+
+.PHONY: clean test
diff --git a/11-kgrams/cpp/kgrams.h b/11-kgrams/cpp/kgrams.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdcd4e8e7f744663a1210ea2a0be4c9a5f4d3894
--- /dev/null
+++ b/11-kgrams/cpp/kgrams.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <functional>
+#include <utility>
+
+using namespace std;
+
+// Suffix array of a string, together with its inverse (the ranking array).
+// Both arrays have n+1 entries: position n stands for the empty suffix.
+class SuffixArray {
+  public:
+    string text;                // Private copy of the indexed string
+    int n;                      // Length of text
+    vector<int> S;              // Permutation which sorts suffixes
+    vector<int> R;              // Ranking array (an inverse of S)
+
+    // Construct suffix array and ranking array for the given string
+    // using the doubling algorithm.
+    SuffixArray(const string &orig_text)
+    {
+        text = orig_text;
+        n = text.size();
+        S.resize(n+1);
+        R.resize(n+1);
+
+        // Round 0: order suffixes by their first character only.
+        // text[n] reads the string's terminating '\0' (empty suffix).
+        sort_and_rank([this](int a, int b) -> bool { return text[a] < text[b]; });
+
+        // Round k: R ranks suffixes by their first k characters, so the
+        // pair (R[a], R[a+k]) orders them by their first 2k characters.
+        // A second half starting at or past the end of text counts as -1.
+        for (int k=1; k<n; k*=2) {
+            sort_and_rank([this,k](int a, int b) -> bool {
+                    pair<int,int> pa(R[a], (a+k < n) ? R[a+k] : -1);
+                    pair<int,int> pb(R[b], (b+k < n) ? R[b+k] : -1);
+                    return (pa < pb);
+                    });
+        }
+    }
+
+    // An auxiliary function used in the doubling algorithm: sort all
+    // positions 0..n by comp and store the order in S, then set R so that
+    // positions which comp cannot tell apart share the rank of the first
+    // of them in S.
+    void sort_and_rank(function<bool(int a, int b)> comp)
+    {
+        for (size_t i=0; i<S.size(); i++)
+            S[i] = i;
+
+        sort(S.begin(), S.end(), comp);
+
+        vector<int> R2(S.size());
+        for (size_t i=0; i<S.size(); i++) {
+            if (!i || comp(S[i-1], S[i]) || comp(S[i], S[i-1]))
+                R2[S[i]] = i;
+            else
+                R2[S[i]] = R2[S[i-1]];
+        }
+        R.swap(R2);
+    }
+
+    // Return the number of distinct k-grams in the string.
+    //
+    // Left unimplemented on purpose: this is the part of the assignment to
+    // be filled in. Until then it throws logic_error, because flowing off
+    // the end of a non-void function is undefined behaviour.
+    int num_kgrams(int k)
+    {
+        (void) k;   // Not used until the function is implemented
+        // TODO: Implement
+        throw logic_error("SuffixArray::num_kgrams is not implemented");
+    }
+};
diff --git a/11-kgrams/cpp/kgrams_test.cpp b/11-kgrams/cpp/kgrams_test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb90a324bbbb399eecec76b9a385e4809efdacff
--- /dev/null
+++ b/11-kgrams/cpp/kgrams_test.cpp
@@ -0,0 +1,79 @@
+#include <algorithm>
+#include <functional>
+#include <string>
+#include <vector>
+#include <cstdint>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+void expect_failed(const string& message);
+
+#include "kgrams.h"
+
+void test_generic(const string& text, int k, int expected_kg)
+{
+    SuffixArray sa(text);
+    int num_kg = sa.num_kgrams(k);
+    EXPECT(num_kg == expected_kg, "Expected " + to_string(expected_kg) + " " + to_string(k) + "-grams, found " + to_string(num_kg) + ".");
+
+}
+
+// Test on a fixed string.
+void test_explicit(int k, int expected_kg)
+{
+    test_generic("annbansbananas", k, expected_kg);
+}
+
+// Test on a very non-uniform random string.
+void test_random(int n, int k, int expected_kg)
+{
+    string s(n, ' ');
+    uint32_t state = n;
+
+    for (int i=0; i<n; i++) {
+        state = state*2654289733 + 7;
+        unsigned x = (state >> 28) % 16;
+        char next = "aaaaaaaaaaaabbbc"[x];
+        s[i] = next;
+    }
+
+    test_generic(s, k, expected_kg);
+}
+
+// Test on an almost-constant string.
+void test_trivial(int n, int k, int expected_kg)
+{
+    string s(n, ' ');
+
+    for (int i=0; i<n; i++) {
+        if (i == n/2)
+            s[i] = 'b';
+        else
+            s[i] = 'a';
+    }
+
+    test_generic(s, k, expected_kg);
+}
+
+vector<pair<string, function<void()>>> tests = {
+    {"basic-1", [] { test_explicit(1, 4); }},
+    {"basic-2", [] { test_explicit(2, 8); }},
+    {"basic-3", [] { test_explicit(3, 10); }},
+    {"basic-4", [] { test_explicit(4, 11); }},
+    {"basic-14", [] { test_explicit(14, 1); }},
+
+    {"short-5", [] { test_random(1000, 5, 107); }},
+    {"short-33", [] { test_random(1000, 33, 968); }},
+    {"short-500", [] { test_random(1000, 500, 501); }},
+
+    {"long-5", [] { test_random(100000, 5, 230); }},
+    {"long-33", [] { test_random(100000, 33, 99767); }},
+    {"long-5000", [] { test_random(100000, 5000, 95001); }},
+
+    {"triv-1", [] { test_trivial(1000000, 1, 2); }},
+    {"triv-5", [] { test_trivial(1000000, 5, 6); }},
+    {"triv-3333", [] { test_trivial(1000000, 3333, 3334); }},
+    {"triv-500000", [] { test_trivial(1000000, 500000, 500001); }},
+};
diff --git a/11-kgrams/cpp/test_main.cpp b/11-kgrams/cpp/test_main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3f4aff0785f636b7fd0ea1a15aa69dafe06f290f
--- /dev/null
+++ b/11-kgrams/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+int main(int argc, char* argv[]) {
+    vector<string> required_tests;
+
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/11-kgrams/python/kgrams.py b/11-kgrams/python/kgrams.py
new file mode 100644
index 0000000000000000000000000000000000000000..0eefda33b44dbd2690c511c7a8e8c3d59bea8489
--- /dev/null
+++ b/11-kgrams/python/kgrams.py
@@ -0,0 +1,43 @@
+class SuffixArray:
+    def __init__(self, text):
+        self.text = text
+        # S is the suffix array (a permutation which sorts suffixes)
+        # R is the ranking array (the inverse of S)
+        self.R, self.S = self._build_suffix_array(text)
+
+    def _build_suffix_array(self, text):
+        """
+        Construct the suffix array and ranking array for the given string
+        using the doubling algorithm.
+        """
+
+        n = len(text)
+        R = [None] * (n+1)
+        S = [None] * (n+1)
+
+        R = self._sort_and_rank(S, lambda a: ord(text[a]) if a < len(text) else -1)
+
+        k = 1
+        while k < n:
+            R = self._sort_and_rank(S, lambda a: (R[a], (R[a+k] if a+k < n else -1)))
+            k *= 2
+
+        return (tuple(R), tuple(S))
+
+    # An auxiliary function used in the doubling algorithm.
+    def _sort_and_rank(self, S, key):
+        for i in range(len(S)): S[i] = i
+        S.sort(key = key)
+
+        R = [None] * len(S)
+        for i, s in enumerate(S):
+            prev_s = S[i-1]
+            if i == 0 or key(prev_s) != key(s): R[s] = i
+            else: R[s] = R[prev_s]
+        return R
+
+    def num_kgrams(self, k):
+        """Return the number of distinct k-grams in the string."""
+
+        raise NotImplementedError
+
diff --git a/11-kgrams/python/kgrams_test.py b/11-kgrams/python/kgrams_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..18646f44c845e9c2f07f1d5b81dda99bc6eacec9
--- /dev/null
+++ b/11-kgrams/python/kgrams_test.py
@@ -0,0 +1,61 @@
+#!/usr/bin/env python3
+
+import sys
+from kgrams import SuffixArray
+
+def test_generic(text, k, expected_count):
+    sa = SuffixArray(text)
+    count = sa.num_kgrams(k)
+    assert count == expected_count, \
+        "Expected %i %i-grams, found %i." % (expected_count, k, count)
+
+def test_explicit(k, expected_count):
+    test_generic("annbansbananas", k, expected_count)
+
+def test_random(n, k, expected_count):
+    b = bytearray(n)
+    state = n
+    for i in range(n):
+        state = (state*2654289733 + 7) % (1 << 32)
+        x = (state >> 28) % 16
+        next = "aaaaaaaaaaaabbbc"[x]
+        b[i] = ord(next)
+
+    test_generic(b.decode(), k, expected_count)
+
+def test_trivial(n, k, expected_count):
+    s = "".join( "b" if i == n // 2 else "a" for i in range (n) )
+    test_generic(s, k, expected_count)
+
+tests = [
+    ("basic-1", lambda: test_explicit(1, 4)),
+    ("basic-2", lambda: test_explicit(2, 8)),
+    ("basic-3", lambda: test_explicit(3, 10)),
+    ("basic-4", lambda: test_explicit(4, 11)),
+    ("basic-14", lambda: test_explicit(14, 1)),
+
+    ("short-5", lambda: test_random(1000, 5, 107)),
+    ("short-33", lambda: test_random(1000, 33, 968)),
+    ("short-500", lambda: test_random(1000, 500, 501)),
+
+    ("long-5", lambda: test_random(100000, 5, 230)),
+    ("long-33", lambda: test_random(100000, 33, 99767)),
+    ("long-5000", lambda: test_random(100000, 5000, 95001)),
+
+    ("triv-1", lambda: test_trivial(1000000, 1, 2)),
+    ("triv-5", lambda: test_trivial(1000000, 5, 6)),
+    ("triv-3333", lambda: test_trivial(1000000, 3333, 3334)),
+    ("triv-500000", lambda: test_trivial(1000000, 500000, 500001)),
+]
+
+if __name__ == "__main__":
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:
+            # for-else: runs only if the inner loop found no matching test
+            raise ValueError("Unknown test {}".format(required_test))
+
diff --git a/11-kgrams/task.md b/11-kgrams/task.md
new file mode 100644
index 0000000000000000000000000000000000000000..22800b488093a2364481fa33ccb374576f025c4b
--- /dev/null
+++ b/11-kgrams/task.md
@@ -0,0 +1,10 @@
+Your task is to write a function which takes a string and an integer K
+and reports how many different K-grams (K-character substrings) the
+string has.
+
+You are given an algorithm for construction of the suffix array. For
+simplicity, this algorithm has time complexity $O(n \log^2 n)$. Except
+for constructing the suffix array, your algorithm should run in linear
+time.
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).