Skip to content
Snippets Groups Projects
Commit d2c41f15 authored by Lukáš Ondráček's avatar Lukáš Ondráček
Browse files

K-grams

parent edecd693
No related branches found
No related tags found
No related merge requests found
# Default target: build the test binary (if out of date) and run it.
# $< expands to the first prerequisite, i.e. kgrams_test.
test: kgrams_test
./$<
# Directory added to the header search path; ?= only assigns when INCLUDE
# is not already set, so it can be overridden by the caller.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# kgrams.h is listed as a prerequisite so that editing it triggers a rebuild;
# $(filter %.cpp,$^) keeps only the .cpp prerequisites on the compiler command line.
kgrams_test: kgrams_test.cpp kgrams.h test_main.cpp
$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
# Remove the built test binary.
clean:
rm -f kgrams_test
# clean and test do not produce files of the same name.
.PHONY: clean test
#include <vector>
#include <iostream>
#include <algorithm>
#include <string>
#include <functional>
using namespace std;
// Suffix array of a string together with its inverse (ranking) array.
//
// Both arrays have n+1 entries because the empty suffix starting at
// position n is included; it always sorts first, so S[0] == n and R[n] == 0.
class SuffixArray {
public:
    std::string text;
    int n;              // Length of text
    std::vector<int> S; // Permutation which sorts suffixes
    std::vector<int> R; // Ranking array (an inverse of S)

    // Construct suffix array and ranking array for the given string
    // using the doubling algorithm.
    SuffixArray(const std::string &orig_text)
    {
        text = orig_text;
        n = static_cast<int>(text.size());
        S.resize(n+1);
        R.resize(n+1);

        // Key of the first character of suffix a. Characters are compared as
        // unsigned bytes and the end of the text is -1, i.e. smaller than
        // every real character. This keeps the empty suffix strictly first
        // even when the text contains '\0' or bytes >= 0x80.
        auto first_key = [this](int a) -> int {
            return (a < n) ? static_cast<int>(static_cast<unsigned char>(text[a])) : -1;
        };
        sort_and_rank([first_key](int a, int b) -> bool { return first_key(a) < first_key(b); });

        // Invariant: before the iteration with a given k, R ranks suffixes by
        // their first k characters. Comparing (R[a], R[a+k]) therefore ranks
        // them by their first 2k characters.
        for (int k=1; k<n; k*=2) {
            sort_and_rank([this,k](int a, int b) -> bool {
                std::pair<int,int> pa(R[a], (a+k < n) ? R[a+k] : -1);
                std::pair<int,int> pb(R[b], (b+k < n) ? R[b+k] : -1);
                return (pa < pb);
            });
        }
    }

    // An auxiliary function used in the doubling algorithm.
    //
    // Sorts all positions 0..n by comp into S and stores in R the rank of
    // each position, where positions that compare equivalent share the rank
    // of the first of them.
    void sort_and_rank(std::function<bool(int a, int b)> comp)
    {
        for (size_t i=0; i<S.size(); i++)
            S[i] = static_cast<int>(i);

        std::sort(S.begin(), S.end(), comp);

        // Ranks are built in a scratch array because comp may still read R.
        std::vector<int> R2(S.size());
        for (size_t i=0; i<S.size(); i++) {
            if (!i || comp(S[i-1], S[i]) || comp(S[i], S[i-1]))
                R2[S[i]] = static_cast<int>(i);
            else
                R2[S[i]] = R2[S[i-1]];
        }
        R.swap(R2);
    }

    // Return the number of distinct k-grams in the string.
    //
    // Runs in O(n) time and O(1) extra space on top of the suffix array.
    // Returns 0 for k < 0 or k > n, and 1 for k == 0 (the empty string).
    //
    // All suffixes starting with the same k-gram are adjacent in sorted
    // order, so a suffix of length >= k introduces a new k-gram exactly when
    // its longest common prefix with its predecessor in S is shorter than k.
    // The LCP values are produced in text order by Kasai's algorithm and are
    // consumed immediately, so no LCP array is stored.
    int num_kgrams(int k)
    {
        if (k < 0 || k > n)
            return 0;
        if (k == 0)
            return 1;

        int count = 0;
        int h = 0; // LCP of suffix i with its predecessor in sorted order
        for (int i=0; i<n; i++) {
            // R[i] >= 1 here because the empty suffix always has rank 0.
            int j = S[R[i]-1];
            while (i+h < n && j+h < n && text[i+h] == text[j+h])
                h++;
            if (h < k && n-i >= k)
                count++;
            // Dropping the first character shortens the LCP by at most one.
            if (h > 0)
                h--;
        }
        return count;
    }
};
#include <algorithm>
#include <functional>
#include <string>
#include <vector>
#include <cstdint>
using namespace std;
// If the condition is not true, report an error and halt.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it is safe inside an unbraced if/else. The message
// expression is evaluated only when the condition fails.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
// Failure handler called by EXPECT; only declared here, the definition is
// supplied by another translation unit.
void expect_failed(const string& message);
#include "kgrams.h"
void test_generic(const string& text, int k, int expected_kg)
{
SuffixArray sa(text);
int num_kg = sa.num_kgrams(k);
EXPECT(num_kg == expected_kg, "Expected " + to_string(expected_kg) + " " + to_string(k) + "-grams, found " + to_string(num_kg) + ".");
}
// Check the k-gram count of one fixed, hand-picked sample string.
void test_explicit(int k, int expected_kg)
{
    const std::string sample = "annbansbananas";
    test_generic(sample, k, expected_kg);
}
// Test on a very non-uniform random string.
//
// The string of length n is produced by a 32-bit linear congruential
// generator seeded with n, so it is fully deterministic. The top four bits
// of each state select a letter from a 16-entry table in which 'a' is far
// more frequent than 'b' and 'c'.
void test_random(int n, int k, int expected_kg)
{
    static const char letters[] = "aaaaaaaaaaaabbbc";

    std::string s(n, ' ');
    uint32_t state = n;
    for (int i=0; i<n; i++) {
        // The multiplier must be an unsigned literal: an unsuffixed
        // 2654289733 has a signed 64-bit type, and state * 2654289733 can
        // exceed its maximum, which is undefined behaviour. Unsigned 32-bit
        // arithmetic wraps modulo 2^32 and yields the intended sequence.
        state = state*2654289733u + 7;
        unsigned x = (state >> 28) % 16;
        s[i] = letters[x];
    }
    test_generic(s, k, expected_kg);
}
// Check the k-gram count of a string of n 'a' characters whose character at
// index n/2 is replaced by a single 'b'.
void test_trivial(int n, int k, int expected_kg)
{
    std::string s(n, 'a');
    // For n > 0 the index n/2 is always inside the string; an empty string
    // has no position to replace.
    if (n > 0)
        s[n/2] = 'b';
    test_generic(s, k, expected_kg);
}
// Table of all test cases as (name, callable) pairs.
// Each callable runs one of the helpers above with a k-gram length and the
// expected number of distinct k-grams:
//   basic-* use the fixed sample string (test_explicit),
//   short-* and long-* use generated strings of length 1000 and 100000 (test_random),
//   triv-*  use an almost-constant string of length 1000000 (test_trivial).
vector<pair<string, function<void()>>> tests = {
{"basic-1", [] { test_explicit(1, 4); }},
{"basic-2", [] { test_explicit(2, 8); }},
{"basic-3", [] { test_explicit(3, 10); }},
{"basic-4", [] { test_explicit(4, 11); }},
{"basic-14", [] { test_explicit(14, 1); }},
{"short-5", [] { test_random(1000, 5, 107); }},
{"short-33", [] { test_random(1000, 33, 968); }},
{"short-500", [] { test_random(1000, 500, 501); }},
{"long-5", [] { test_random(100000, 5, 230); }},
{"long-33", [] { test_random(100000, 33, 99767); }},
{"long-5000", [] { test_random(100000, 5000, 95001); }},
{"triv-1", [] { test_trivial(1000000, 1, 2); }},
{"triv-5", [] { test_trivial(1000000, 5, 6); }},
{"triv-3333", [] { test_trivial(1000000, 3333, 3334); }},
{"triv-500000", [] { test_trivial(1000000, 500000, 500001); }},
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
// Print the failure message to standard error and terminate the process
// with exit status 1.
void expect_failed(const std::string& message) {
    std::cerr << "Test error: ";
    std::cerr << message << std::endl;
    std::exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
class SuffixArray:
    """
    Suffix array of a string together with its inverse (ranking) array.

    Both arrays have len(text)+1 entries because the empty suffix starting
    at position len(text) is included; it always sorts first.
    """

    def __init__(self, text):
        self.text = text

        # S is the suffix array (a permutation which sorts suffixes)
        # R is the ranking array (the inverse of S)
        self.R, self.S = self._build_suffix_array(text)

    def _build_suffix_array(self, text):
        """
        Construct the suffix array and ranking array for the given string
        using the doubling algorithm.

        Returns the pair (R, S) as tuples.
        """
        n = len(text)
        S = [None] * (n+1)

        # The end of the text gets key -1, smaller than any character code,
        # so the empty suffix is always ranked first.
        R = self._sort_and_rank(S, lambda a: ord(text[a]) if a < len(text) else -1)

        # Invariant: before the iteration with a given k, R ranks suffixes by
        # their first k characters; comparing (R[a], R[a+k]) ranks them by
        # their first 2k characters.
        k = 1
        while k < n:
            R = self._sort_and_rank(S, lambda a: (R[a], (R[a+k] if a+k < n else -1)))
            k *= 2

        return (tuple(R), tuple(S))

    # An auxiliary function used in the doubling algorithm.
    def _sort_and_rank(self, S, key):
        """
        Sort the positions 0..len(S)-1 by `key` into S (in place) and return
        the list of ranks, where positions with equal keys share the rank of
        the first of them.
        """
        # Evaluate every key exactly once; it is needed both for sorting and
        # for detecting ties between neighbours.
        keys = [key(a) for a in range(len(S))]

        S[:] = range(len(S))
        S.sort(key=keys.__getitem__)

        R = [None] * len(S)
        for i, s in enumerate(S):
            if i == 0 or keys[S[i-1]] != keys[s]:
                R[s] = i
            else:
                R[s] = R[S[i-1]]
        return R

    def num_kgrams(self, k):
        """
        Return the number of distinct k-grams in the string.

        Runs in time linear in the length of the text. Returns 0 for k < 0
        or k > len(text), and 1 for k == 0 (the empty string).

        All suffixes starting with the same k-gram are adjacent in sorted
        order, so a suffix of length >= k introduces a new k-gram exactly
        when its longest common prefix with its predecessor in S is shorter
        than k. The LCP values are produced in text order by Kasai's
        algorithm and consumed immediately.
        """
        text, S, R = self.text, self.S, self.R
        n = len(text)

        if k < 0 or k > n:
            return 0
        if k == 0:
            return 1

        count = 0
        h = 0  # LCP of suffix i with its predecessor in sorted order
        for i in range(n):
            # R[i] >= 1 here because the empty suffix always has rank 0.
            j = S[R[i] - 1]
            while i + h < n and j + h < n and text[i + h] == text[j + h]:
                h += 1
            if h < k and n - i >= k:
                count += 1
            # Dropping the first character shortens the LCP by at most one.
            if h > 0:
                h -= 1
        return count
#!/usr/bin/python
import sys
from kgrams import SuffixArray
def test_generic(text, k, expected_count):
    """Assert that `text` contains exactly `expected_count` distinct k-grams."""
    found = SuffixArray(text).num_kgrams(k)
    assert found == expected_count, \
        "Expected %i %i-grams, found %i." % (expected_count, k, found)
def test_explicit(k, expected_count):
    """Check the k-gram count of one fixed sample string."""
    sample = "annbansbananas"
    test_generic(sample, k, expected_count)
def test_random(n, k, expected_count):
    """
    Check the k-gram count of a deterministic, very non-uniform string.

    The string of length n comes from a 32-bit linear congruential generator
    seeded with n; the top four bits of each state pick a letter from a
    16-entry table in which 'a' dominates.
    """
    letters = "aaaaaaaaaaaabbbc"
    buf = bytearray(n)
    state = n
    for pos in range(n):
        # Masking with 2**32 - 1 is the same as reducing modulo 2**32.
        state = (state * 2654289733 + 7) & 0xFFFFFFFF
        buf[pos] = ord(letters[(state >> 28) % 16])
    test_generic(buf.decode(), k, expected_count)
def test_trivial(n, k, expected_count):
    """
    Check the k-gram count of a string of n 'a' characters whose character
    at index n // 2 is replaced by a single 'b'.
    """
    chars = ["a"] * n
    # An empty list has no position to replace.
    if chars:
        chars[n // 2] = "b"
    test_generic("".join(chars), k, expected_count)
# Table of all test cases as (name, callable) pairs.
# Each callable runs one of the helpers above with a k-gram length and the
# expected number of distinct k-grams:
#   basic-* use the fixed sample string (test_explicit),
#   short-* and long-* use generated strings of length 1000 and 100000 (test_random),
#   triv-*  use an almost-constant string of length 1000000 (test_trivial).
tests = [
    ("basic-1", lambda: test_explicit(1, 4)),
    ("basic-2", lambda: test_explicit(2, 8)),
    ("basic-3", lambda: test_explicit(3, 10)),
    ("basic-4", lambda: test_explicit(4, 11)),
    ("basic-14", lambda: test_explicit(14, 1)),
    ("short-5", lambda: test_random(1000, 5, 107)),
    ("short-33", lambda: test_random(1000, 33, 968)),
    ("short-500", lambda: test_random(1000, 500, 501)),
    ("long-5", lambda: test_random(100000, 5, 230)),
    ("long-33", lambda: test_random(100000, 33, 99767)),
    ("long-5000", lambda: test_random(100000, 5000, 95001)),
    ("triv-1", lambda: test_trivial(1000000, 1, 2)),
    ("triv-5", lambda: test_trivial(1000000, 5, 6)),
    ("triv-3333", lambda: test_trivial(1000000, 3333, 3334)),
    ("triv-500000", lambda: test_trivial(1000000, 500000, 500001)),
]
if __name__ == "__main__":
    # Run the tests named on the command line, in the given order, or every
    # registered test when no names are given.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # The for/else branch runs only when no registered name matched.
            # Report the name that was requested; `name` would be the last
            # registered test, not the unknown one.
            raise ValueError("Unknown test {}".format(required_test))
Your task is to write a function which takes a string and an integer K
and reports how many different K-grams (K-character substrings) the
string contains.
You are given an algorithm for construction of the suffix array. For
simplicity, this algorithm has time complexity $O(n \log^2 n)$. Except
for constructing the suffix array, your algorithm should run in linear
time.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment