Commit b568c869 by Ondřej Mička

### k-grams

parent 21949468
 test: kgrams_test
	./$<

# Directory searched for shared headers; may be overridden from the environment
# or the command line.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)

# kgrams.h is listed as a prerequisite so that editing it triggers a rebuild,
# but only the .cpp prerequisites are handed to the compiler.
kgrams_test: kgrams_test.cpp kgrams.h test_main.cpp
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@

clean:
	rm -f kgrams_test

.PHONY: clean test
 #include #include #include #include #include using namespace std; class SuffixArray { public: string text; int n; // Length of text vector S; // Permutation which sorts suffixes vector R; // Ranking array (an inverse of S) // Construct suffix array and ranking array for the given string // using the doubling algorithm. SuffixArray(const string &orig_text) { text = orig_text; n = text.size(); S.resize(n+1); R.resize(n+1); sort_and_rank([this](int a, int b) -> bool { return text[a] < text[b]; }); for (int k=1; k bool { pair pa(R[a], (a+k < n) ? R[a+k] : -1); pair pb(R[b], (b+k < n) ? R[b+k] : -1); return (pa < pb); }); } } // An auxiliary function used in the doubling algorithm. void sort_and_rank(function comp) { for (size_t i=0; i R2(S.size()); for (size_t i=0; i
 #include #include #include #include #include using namespace std; // If the condition is not true, report an error and halt. #define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0) void expect_failed(const string& message); #include "kgrams.h" void test_generic(const string& text, int k, int expected_kg) { SuffixArray sa(text); int num_kg = sa.num_kgrams(k); EXPECT(num_kg == expected_kg, "Expected " + to_string(expected_kg) + " " + to_string(k) + "-grams, found " + to_string(num_kg) + "."); } // Test on a fixed string. void test_explicit(int k, int expected_kg) { test_generic("annbansbananas", k, expected_kg); } // Test on a very non-uniform random string. void test_random(int n, int k, int expected_kg) { string s(n, ' '); uint32_t state = n; for (int i=0; i> 28) % 16; char next = "aaaaaaaaaaaabbbc"[x]; s[i] = next; } test_generic(s, k, expected_kg); } // Test on an almost-constant string. void test_trivial(int n, int k, int expected_kg) { string s(n, ' '); for (int i=0; i>> tests = { {"basic-1", [] { test_explicit(1, 4); }}, {"basic-2", [] { test_explicit(2, 8); }}, {"basic-3", [] { test_explicit(3, 10); }}, {"basic-4", [] { test_explicit(4, 11); }}, {"basic-14", [] { test_explicit(14, 1); }}, {"short-5", [] { test_random(1000, 5, 107); }}, {"short-33", [] { test_random(1000, 33, 968); }}, {"short-500", [] { test_random(1000, 500, 501); }}, {"long-5", [] { test_random(100000, 5, 230); }}, {"long-33", [] { test_random(100000, 33, 99767); }}, {"long-5000", [] { test_random(100000, 5000, 95001); }}, {"triv-1", [] { test_trivial(1000000, 1, 2); }}, {"triv-5", [] { test_trivial(1000000, 5, 6); }}, {"triv-3333", [] { test_trivial(1000000, 3333, 3334); }}, {"triv-500000", [] { test_trivial(1000000, 500000, 500001); }}, };
 #include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using namespace std;

// Table of (name, test function) pairs; defined in another translation unit.
extern vector<pair<string, function<void()>>> tests;

// Report a failed expectation on stderr and terminate with exit status 1.
void expect_failed(const string& message)
{
    cerr << "Test error: " << message << endl;
    exit(1);
}

// Run the tests named on the command line, or every test in `tests` when no
// names are given. Returns 1 as soon as a requested name matches no test,
// 0 once all requested tests have been run.
int main(int argc, char* argv[])
{
    vector<string> required_tests;

    if (argc > 1) {
        required_tests.assign(argv + 1, argv + argc);
    } else {
        for (const auto& test : tests)
            required_tests.push_back(test.first);
    }

    for (const auto& required_test : required_tests) {
        bool found = false;
        for (const auto& test : tests)
            if (required_test == test.first) {
                cerr << "Running test " << required_test << endl;
                test.second();
                found = true;
                break;
            }
        if (!found) {
            cerr << "Unknown test " << required_test << endl;
            return 1;
        }
    }

    return 0;
}
 class SuffixArray:
     """Suffix array of a string, built with the doubling algorithm.

     Attributes:
         text: the indexed string.
         S: the suffix array, a tuple of length len(text)+1 holding the
            starting positions of all suffixes in sorted order. Position
            len(text) stands for the empty suffix, which sorts first.
         R: the ranking array (the inverse of S), also a tuple.
     """

     def __init__(self, text):
         self.text = text
         # S is the suffix array (a permutation which sorts suffixes)
         # R is the ranking array (the inverse of S)
         self.R, self.S = self._build_suffix_array(text)

     def _build_suffix_array(self, text):
         """
         Construct the suffix array and ranking array for the given string
         using the doubling algorithm.

         Returns the pair (R, S) as tuples.
         """
         n = len(text)
         S = [None] * (n+1)

         # Round 0: order suffixes by their first character. The empty suffix
         # (position n) gets the key -1 so that it precedes every character.
         R = self._sort_and_rank(S, lambda a: ord(text[a]) if a < len(text) else -1)

         # Each round doubles the compared prefix length: the key pairs the
         # rank of the first k characters with the rank of the next k. The
         # lambda reads the previous round's R, because the name R is only
         # rebound after _sort_and_rank returns.
         k = 1
         while k < n:
             R = self._sort_and_rank(S, lambda a: (R[a], (R[a+k] if a+k < n else -1)))
             k *= 2

         return (tuple(R), tuple(S))

     # An auxiliary function used in the doubling algorithm.
     def _sort_and_rank(self, S, key):
         """Sort the positions 0..len(S)-1 by `key` and rank them.

         S is overwritten in place with the sorted positions. The returned list
         R maps each position to the index in S of the first position having an
         equal key, so positions with equal keys share one rank.
         """
         S[:] = range(len(S))
         S.sort(key=key)

         R = [None] * len(S)
         prev_key = None
         for i, s in enumerate(S):
             cur_key = key(s)
             if i == 0 or cur_key != prev_key:
                 R[s] = i
             else:
                 R[s] = R[S[i-1]]
             prev_key = cur_key
         return R

     def num_kgrams(self, k):
         """Return the number of distinct k-grams in the string.

         A k-gram is a substring of length exactly k. Raises ValueError when k
         is not positive. Runs in time linear in the length of the text.
         """
         if k <= 0:
             raise ValueError("k must be positive")

         text, S, R = self.text, self.S, self.R
         n = len(text)
         if k > n:
             return 0

         # Suffixes sharing the same first k characters are adjacent in sorted
         # order, so each distinct k-gram is counted once: at the suffix of
         # length >= k whose longest common prefix with its predecessor in S is
         # shorter than k.
         #
         # The common-prefix lengths are computed in text order: moving from
         # suffix a to suffix a+1 shortens the common prefix by at most one
         # character, so `lcp` is carried over instead of restarting at zero.
         count = 0
         lcp = 0
         for a in range(n):
             # R[a] >= 1 here because rank 0 belongs to the empty suffix.
             b = S[R[a] - 1]
             while a + lcp < n and b + lcp < n and text[a + lcp] == text[b + lcp]:
                 lcp += 1
             if n - a >= k and lcp < k:
                 count += 1
             if lcp > 0:
                 lcp -= 1
         return count
 #!/usr/bin/env python3
 import sys

 from kgrams import SuffixArray


 def test_generic(text, k, expected_count):
     """Check that `text` contains exactly `expected_count` distinct k-grams."""
     sa = SuffixArray(text)
     count = sa.num_kgrams(k)
     assert count == expected_count, \
         "Expected %i %i-grams, found %i." % (expected_count, k, count)


 def test_explicit(k, expected_count):
     """Test on a fixed string."""
     test_generic("annbansbananas", k, expected_count)


 def test_random(n, k, expected_count):
     """Test on a very non-uniform pseudo-random string of length n."""
     alphabet = "aaaaaaaaaaaabbbc"
     letters = []
     state = n
     for _ in range(n):
         # 32-bit linear congruential step; the top four bits pick the letter.
         state = (state*2654289733 + 7) % (1 << 32)
         letters.append(alphabet[(state >> 28) % 16])
     test_generic("".join(letters), k, expected_count)


 def test_trivial(n, k, expected_count):
     """Test on an almost-constant string: all 'a' with one 'b' in the middle."""
     s = "".join("b" if i == n // 2 else "a" for i in range(n))
     test_generic(s, k, expected_count)


 # Each entry pairs a test name with a zero-argument callable that runs it.
 tests = [
     ("basic-1", lambda: test_explicit(1, 4)),
     ("basic-2", lambda: test_explicit(2, 8)),
     ("basic-3", lambda: test_explicit(3, 10)),
     ("basic-4", lambda: test_explicit(4, 11)),
     ("basic-14", lambda: test_explicit(14, 1)),

     ("short-5", lambda: test_random(1000, 5, 107)),
     ("short-33", lambda: test_random(1000, 33, 968)),
     ("short-500", lambda: test_random(1000, 500, 501)),

     ("long-5", lambda: test_random(100000, 5, 230)),
     ("long-33", lambda: test_random(100000, 33, 99767)),
     ("long-5000", lambda: test_random(100000, 5000, 95001)),

     ("triv-1", lambda: test_trivial(1000000, 1, 2)),
     ("triv-5", lambda: test_trivial(1000000, 5, 6)),
     ("triv-3333", lambda: test_trivial(1000000, 3333, 3334)),
     ("triv-500000", lambda: test_trivial(1000000, 500000, 500001)),
 ]

 if __name__ == "__main__":
     # Run the tests named on the command line, or all of them by default.
     for required_test in sys.argv[1:] or [name for name, _ in tests]:
         for name, test in tests:
             if name == required_test:
                 print("Running test {}".format(name), file=sys.stderr)
                 test()
                 break
         else:
             # Report the name that was asked for; after the inner loop, `name`
             # holds the last registered test, not the unknown one.
             raise ValueError("Unknown test {}".format(required_test))