Skip to content
Snippets Groups Projects
Commit 1dafd64d authored by Petr Chmel's avatar Petr Chmel
Browse files

Publish kgrams

parent bef042bc
No related branches found
No related tags found
No related merge requests found
# Build the test binary if it is out of date, then run it.
test: kgrams_test
./$<
# Directory searched for headers; only assigned here when not already set by the caller.
INCLUDE ?= .
# Compiler flags: C++23, optimisation, warnings (sign-compare silenced), debug info.
CXXFLAGS=-std=c++23 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# kgrams.h is a prerequisite so that editing it triggers a rebuild; the filter
# keeps it out of the compile command, which only receives the .cpp files.
kgrams_test: kgrams_test.cpp kgrams.h test_main.cpp
$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
# Remove the built test binary.
clean:
rm -f kgrams_test
# These targets name actions, not files.
.PHONY: clean test
#include <vector>
#include <iostream>
#include <algorithm>
#include <string>
#include <functional>
using namespace std;
// Suffix array of a string together with its inverse (ranking) array.
//
// The empty suffix starting at index n is treated as a suffix too, so S and R
// both have n+1 entries. It compares smaller than every other suffix, hence
// S[0] == n and R[n] == 0. Characters are ordered by their unsigned byte value.
class SuffixArray {
public:
    std::string text;
    int n;              // Length of text
    std::vector<int> S; // Permutation which sorts suffixes
    std::vector<int> R; // Ranking array (an inverse of S)

    // Construct suffix array and ranking array for the given string
    // using the doubling algorithm.
    SuffixArray(const std::string &orig_text)
    {
        text = orig_text;
        n = static_cast<int>(text.size());
        S.resize(n+1);
        R.resize(n+1);

        // Key of the first character of the suffix starting at a. The empty
        // suffix gets -1 so that it can never tie with a real character (not
        // even '\0'), and real characters are widened through unsigned char
        // so that bytes >= 0x80 do not become negative.
        auto first_char = [this](int a) -> int {
            return (a < n) ? static_cast<unsigned char>(text[a]) : -1;
        };
        sort_and_rank([first_char](int a, int b) -> bool {
            return first_char(a) < first_char(b);
        });

        // Invariant: on entry to an iteration R ranks suffixes by their first
        // k characters; on exit it ranks them by their first 2k characters.
        for (int k=1; k<n; k*=2) {
            sort_and_rank([this,k](int a, int b) -> bool {
                std::pair<int,int> pa(R[a], (a+k < n) ? R[a+k] : -1);
                std::pair<int,int> pb(R[b], (b+k < n) ? R[b+k] : -1);
                return (pa < pb);
            });
        }
    }

    // An auxiliary function used in the doubling algorithm.
    //
    // Sorts all n+1 suffix start positions by comp and stores the result in S.
    // R is then replaced by the rank of each position, where positions that
    // comp cannot tell apart share the rank of the first of them.
    void sort_and_rank(std::function<bool(int a, int b)> comp)
    {
        for (size_t i=0; i<S.size(); i++)
            S[i] = static_cast<int>(i);
        std::sort(S.begin(), S.end(), comp);

        // comp reads the current R, so the new ranks are built on the side
        // and swapped in only once they are complete.
        std::vector<int> R2(S.size());
        for (size_t i=0; i<S.size(); i++) {
            if (!i || comp(S[i-1], S[i]) || comp(S[i], S[i-1]))
                R2[S[i]] = static_cast<int>(i);
            else
                R2[S[i]] = R2[S[i-1]];
        }
        R.swap(R2);
    }

    // Return the number of distinct k-grams in the string.
    //
    // Returns 0 when k is negative or larger than the text, and 1 for k == 0
    // (the empty string). Runs in time linear in n.
    int num_kgrams(int k) const
    {
        if (k < 0 || k > n)
            return 0;

        // lcp[r] = length of the longest common prefix of the suffixes with
        // ranks r-1 and r (Kasai's algorithm). Walking the suffixes in text
        // order lets h drop by at most one per step, which bounds the total
        // number of character comparisons by O(n).
        std::vector<int> lcp(n+1, 0);
        int h = 0;
        for (int i=0; i<n; i++) {
            const int r = R[i];
            if (r == 0) {
                // No predecessor in sorted order; nothing to compare with.
                h = 0;
                continue;
            }
            const int j = S[r-1];
            while (i+h < n && j+h < n && text[i+h] == text[j+h])
                h++;
            lcp[r] = h;
            if (h > 0)
                h--;
        }

        // Suffixes sharing a k-character prefix are adjacent in S. Count the
        // first suffix of every such run: it is long enough to contain a
        // k-gram and shares fewer than k characters with its predecessor.
        int count = 0;
        for (int i=0; i<=n; i++) {
            if (S[i] + k <= n && (i == 0 || lcp[i] < k))
                count++;
        }
        return count;
    }
};
#include <algorithm>
#include <functional>
#include <string>
#include <vector>
#include <cstdint>
using namespace std;
// If the condition is not true, report an error and halt.
// The message expression is evaluated only when the condition fails. The
// do { ... } while (0) wrapper makes the macro behave as a single statement.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
// Failure handler invoked by EXPECT; only declared here, defined elsewhere.
void expect_failed(const string& message);
#include "kgrams.h"
// Build a suffix array over `text` and check that it reports exactly
// `expected_kg` distinct k-grams.
void test_generic(const string& text, int k, int expected_kg)
{
    SuffixArray sa(text);
    const int found = sa.num_kgrams(k);

    // Built lazily: the message is only needed when the check fails.
    const auto describe = [&]() -> string {
        return "Expected " + to_string(expected_kg) + " " + to_string(k) + "-grams, found " + to_string(found) + ".";
    };
    EXPECT(found == expected_kg, describe());
}
// Run the k-gram check against one hard-coded sample string.
void test_explicit(int k, int expected_kg)
{
    const string sample = "annbansbananas";
    test_generic(sample, k, expected_kg);
}
// Test on a very non-uniform random string.
//
// The text of length n is produced by a 32-bit linear congruential generator
// seeded with n, so the same n always yields the same string. The top four
// bits of the state pick a letter from a table that is heavily biased
// towards 'a'.
void test_random(int n, int k, int expected_kg)
{
    string s(n, ' ');
    uint32_t state = n;
    for (int i=0; i<n; i++) {
        // The `u` suffix keeps the multiplication in unsigned arithmetic,
        // which wraps modulo 2^32. Without it the literal is a signed 64-bit
        // value and the product can overflow, which is undefined behaviour.
        state = state*2654289733u + 7;
        unsigned x = (state >> 28) % 16;
        s[i] = "aaaaaaaaaaaabbbc"[x];
    }
    test_generic(s, k, expected_kg);
}
// Test on a string of 'a's with a single 'b' at index n/2.
void test_trivial(int n, int k, int expected_kg)
{
    string s(n, 'a');
    if (n > 0)
        s[n/2] = 'b';
    test_generic(s, k, expected_kg);
}
// Test on a string made of n copies of 'a'.
void test_uniform(int n, int k, int expected_kg)
{
    test_generic(string(n, 'a'), k, expected_kg);
}
// Registry of all test cases as (name, callable) pairs, in the order listed.
// The last argument of every call is the expected number of distinct k-grams.
vector<pair<string, function<void()>>> tests = {
// Fixed sample string "annbansbananas"; arguments are (k, expected).
{"basic-1", [] { test_explicit(1, 4); }},
{"basic-2", [] { test_explicit(2, 8); }},
{"basic-3", [] { test_explicit(3, 10); }},
{"basic-4", [] { test_explicit(4, 11); }},
{"basic-14", [] { test_explicit(14, 1); }},
// Pseudo-random strings; arguments are (n, k, expected).
{"short-5", [] { test_random(1000, 5, 107); }},
{"short-33", [] { test_random(1000, 33, 968); }},
{"short-500", [] { test_random(1000, 500, 501); }},
{"long-5", [] { test_random(100000, 5, 230); }},
{"long-33", [] { test_random(100000, 33, 99767); }},
{"long-5000", [] { test_random(100000, 5000, 95001); }},
// All 'a' except one 'b' in the middle: k all-'a'-free windows plus the all-'a' one.
{"triv-1", [] { test_trivial(1000000, 1, 2); }},
{"triv-5", [] { test_trivial(1000000, 5, 6); }},
{"triv-3333", [] { test_trivial(1000000, 3333, 3334); }},
{"triv-500000", [] { test_trivial(1000000, 500000, 500001); }},
// Constant strings: exactly one distinct k-gram.
{"unif-10", [] { test_uniform(1000000, 10, 1); }},
{"unif-500", [] { test_uniform(1000000, 500, 1); }},
{"unif-500000", [] { test_uniform(1000000, 500000, 1); }},
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
// Print the failure message to standard error and terminate the process
// with exit status 1.
void expect_failed(const string& message) {
    const string report = "Test error: " + message;
    cerr << report << endl;
    exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
class SuffixArray:
    """
    Suffix array of a string together with its inverse (ranking) array.

    The empty suffix starting at index len(text) is included, so both
    arrays have len(text) + 1 entries and the empty suffix has rank 0.
    """

    def __init__(self, text):
        self.text = text

        # S is the suffix array (a permutation which sorts suffixes)
        # R is the ranking array (the inverse of S)
        self.R, self.S = self._build_suffix_array(text)

    def _build_suffix_array(self, text):
        """
        Construct the suffix array and ranking array for the given string
        using the doubling algorithm.

        Returns the pair (R, S), both as tuples.
        """
        n = len(text)
        S = [None] * (n+1)

        # First pass: rank by the first character; the empty suffix gets -1
        # so that it sorts before every real character.
        R = self._sort_and_rank(S, lambda a: ord(text[a]) if a < n else -1)

        # Each pass doubles the length of the prefix the ranks account for.
        # The lambda reads R and k at call time, i.e. the ranks from the
        # previous pass, because R is only rebound after the call returns.
        k = 1
        while k < n:
            R = self._sort_and_rank(S, lambda a: (R[a], (R[a+k] if a+k < n else -1)))
            k *= 2

        return (tuple(R), tuple(S))

    # An auxiliary function used in the doubling algorithm.
    def _sort_and_rank(self, S, key):
        """
        Sort the positions 0..len(S)-1 by `key`, in place in S, and return
        the list of ranks. Positions with equal keys share the rank of the
        first of them.
        """
        for i in range(len(S)): S[i] = i
        S.sort(key = key)

        R = [None] * len(S)
        prev_s = prev_key = None
        for i, s in enumerate(S):
            # Evaluate the key once per position instead of twice.
            cur_key = key(s)
            if i == 0 or prev_key != cur_key: R[s] = i
            else: R[s] = R[prev_s]
            prev_s, prev_key = s, cur_key
        return R

    def num_kgrams(self, k):
        """
        Return the number of distinct k-grams in the string.

        Returns 0 when k is negative or larger than the text, and 1 for
        k == 0 (the empty string). Runs in time linear in the text length.
        """
        text, S, R = self.text, self.S, self.R
        n = len(text)
        if k < 0 or k > n:
            return 0

        # lcp[r] = length of the longest common prefix of the suffixes with
        # ranks r-1 and r (Kasai's algorithm). Visiting suffixes in text
        # order lets h drop by at most one per step, so the total number of
        # character comparisons is O(n).
        lcp = [0] * (n+1)
        h = 0
        for i in range(n):
            r = R[i]
            if r == 0:
                # No predecessor in sorted order; nothing to compare with.
                h = 0
                continue
            j = S[r-1]
            while i+h < n and j+h < n and text[i+h] == text[j+h]:
                h += 1
            lcp[r] = h
            if h > 0:
                h -= 1

        # Suffixes sharing a k-character prefix are adjacent in S. Count the
        # first suffix of every such run: it is long enough to contain a
        # k-gram and shares fewer than k characters with its predecessor.
        count = 0
        for i in range(n+1):
            if S[i] + k <= n and (i == 0 or lcp[i] < k):
                count += 1
        return count
#!/usr/bin/python
import sys
from kgrams import SuffixArray
def test_generic(text, k, expected_count):
    """Check that the suffix array of `text` reports `expected_count` distinct k-grams."""
    found = SuffixArray(text).num_kgrams(k)
    assert found == expected_count, \
        "Expected %i %i-grams, found %i." % (expected_count, k, found)
def test_explicit(k, expected_count):
    """Run the k-gram check against one hard-coded sample string."""
    sample = "annbansbananas"
    test_generic(sample, k, expected_count)
def test_random(n, k, expected_count):
    """
    Run the k-gram check on a pseudo-random string of length n.

    The string comes from a 32-bit linear congruential generator seeded
    with n; the top four bits of the state pick a letter from a table that
    is heavily biased towards "a".
    """
    alphabet = "aaaaaaaaaaaabbbc"
    buf = bytearray(n)
    state = n

    for pos in range(n):
        state = (state*2654289733 + 7) & 0xFFFFFFFF
        letter = alphabet[(state >> 28) & 15]
        buf[pos] = ord(letter)

    test_generic(buf.decode(), k, expected_count)
def test_trivial(n, k, expected_count):
    """Run the k-gram check on n characters that are all "a" except a "b" at index n // 2."""
    if n > 0:
        middle = n // 2
        s = "a" * middle + "b" + "a" * (n - middle - 1)
    else:
        s = ""
    test_generic(s, k, expected_count)
def test_uniform(n, k, expected_count):
    """Run the k-gram check on a string made of n copies of "a"."""
    test_generic("a" * n, k, expected_count)
# Registry of all test cases as (name, callable) pairs, in the order listed.
# The last argument of every call is the expected number of distinct k-grams.
tests = [
# Fixed sample string "annbansbananas"; arguments are (k, expected).
("basic-1", lambda: test_explicit(1, 4)),
("basic-2", lambda: test_explicit(2, 8)),
("basic-3", lambda: test_explicit(3, 10)),
("basic-4", lambda: test_explicit(4, 11)),
("basic-14", lambda: test_explicit(14, 1)),
# Pseudo-random strings; arguments are (n, k, expected).
("short-5", lambda: test_random(1000, 5, 107)),
("short-33", lambda: test_random(1000, 33, 968)),
("short-500", lambda: test_random(1000, 500, 501)),
("long-5", lambda: test_random(100000, 5, 230)),
("long-33", lambda: test_random(100000, 33, 99767)),
("long-5000", lambda: test_random(100000, 5000, 95001)),
# All "a" except one "b" in the middle.
("triv-1", lambda: test_trivial(1000000, 1, 2)),
("triv-5", lambda: test_trivial(1000000, 5, 6)),
("triv-3333", lambda: test_trivial(1000000, 3333, 3334)),
("triv-500000", lambda: test_trivial(1000000, 500000, 500001)),
# Constant strings: exactly one distinct k-gram.
("unif-10", lambda: test_uniform(1000000, 10, 1)),
("unif-500", lambda: test_uniform(1000000, 500, 1)),
("unif-500000", lambda: test_uniform(1000000, 500000, 1)),
]
if __name__ == "__main__":
    # Run the tests named on the command line, or every registered test when
    # no names are given.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # The inner loop found no match. Report the requested name: at
            # this point `name` holds the last registered test, not the one
            # the user asked for.
            raise ValueError("Unknown test {}".format(required_test))
Your task is to write a function which takes a string and an integer K
and reports how many different K-grams (K-character substrings) the
string has.
You are given an algorithm for construction of the suffix array. For
simplicity, this algorithm has time complexity $O(n \log^2 n)$. Except
for constructing the suffix array, your algorithm should run in linear
time.
Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment