Commit 12d7291f authored by Martin Mareš

Find duplicates

parent 82e31d7f
# `test` and `clean` name actions, not files.
.PHONY: test clean

# Default goal: build the test binary (if out of date) and run it.
test: find_duplicates_test
	./find_duplicates_test

# Directory searched for headers; may be overridden from the environment
# or the command line.
INCLUDE ?= .
CXXFLAGS = -std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)

# The header is listed as a prerequisite so that editing it triggers a
# rebuild, but only the .cpp prerequisites are passed to the compiler.
find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@

# Remove the built test binary.
clean:
	rm -f find_duplicates_test
#include <unordered_map>
/*
 * Find duplicates in the given data.
 *
 * The `generator` provides a forward iterator over strings
 * for traversing the data, so it can be iterated for example
 * using a `for` cycle:
 *
 *   for (const string& item : generator) {...}
 *
 * The `generator` can be traversed multiple times.
 *
 * The goal is to return a vector of duplicated entries,
 * reporting each duplicated entry only once.
 *
 * This is the assignment template: the body below is a placeholder that
 * reports no duplicates at all.
 */
vector<string> find_duplicates(DataGenerator& generator) {
    return {};
}
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <type_traits>
#include <vector>
using namespace std;
// If the condition is not true, report an error and halt.
// `message` is evaluated only when the condition fails. The do { } while (0)
// wrapper makes the macro expand to a single statement, so it can be used
// safely as the body of an unbraced `if` or `for`.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
// Failure handler called by EXPECT. Only declared here; the definition must
// be supplied by another part of the program.
void expect_failed(const string& message);
/*
 * Adapts a "pull"-style generator `Impl` into a single-pass input iterator.
 *
 * `Impl` must provide:
 *   - a nested type `T`, the type of the produced values,
 *   - `bool next()`, which advances to the next value and returns false
 *     once the sequence is exhausted,
 *   - `T get()`, which returns the current value.
 *
 * A default-constructed helper acts as the past-the-end sentinel. Two
 * helpers compare equal only when both are exhausted, which is all that a
 * `for (it = begin; it != end; ++it)` loop needs.
 */
template < typename Impl >
class IteratorHelper {
  public:
    // The iterator traits are spelled out as public aliases. Inheriting them
    // privately from std::iterator (as was done before) hid them from
    // std::iterator_traits, and std::iterator itself is deprecated.
    using iterator_category = std::input_iterator_tag;
    using value_type = typename Impl::T;
    using difference_type = std::ptrdiff_t;
    using pointer = const value_type*;
    using reference = value_type;  // operator* returns by value

    // Past-the-end sentinel: `finished` keeps its default value of true.
    IteratorHelper() {}

    // Forwards the arguments to Impl's constructor and fetches the first
    // value. The arguments are deliberately taken by value: a forwarding
    // reference pack would out-compete the copy constructor when copying a
    // non-const IteratorHelper (as operator++(int) does).
    template < typename ... Args >
    IteratorHelper(Args... args) : impl(args...) { finished = !impl.next(); }

    // Pre-increment: advance the underlying generator by one value.
    IteratorHelper& operator++() {
        finished = !impl.next();
        return *this;
    }

    // Post-increment: returns a copy taken before advancing.
    IteratorHelper operator++(int) {
        IteratorHelper tmp(*this);
        operator++();
        return tmp;
    }

    // Only "both exhausted" counts as equal; positions are never compared.
    bool operator==(const IteratorHelper& other) const { return other.finished && finished; }
    bool operator!=(const IteratorHelper& other) const { return !(*this == other); }

    // Returns the current value by value (Impl::get() is not required to be const).
    auto operator*() -> typename Impl::T { return impl.get(); }

  private:
    bool finished = true;
    Impl impl;
};
/*
 * Deterministic source of test strings with planted duplicates.
 *
 * Three multiplicative congruential sequences modulo `prime` drive it:
 *   - `fw_gen` walks forward from `seed`, multiplying by `step`;
 *   - `bw_gen` starts at seed * step^(length-1) and multiplies by the
 *     modular inverse of `step`, i.e. it replays the forward values in
 *     reverse order;
 *   - `rng` decides at each position whether to emit the next forward
 *     value or to skip ahead in the backward sequence and emit a value
 *     from there.
 *
 * begin()/end() iterate over the whole stream and may be used repeatedly.
 * dups() is private (available to the test via the friend declaration) and
 * iterates only over the values taken from the backward sequence.
 */
class DataGenerator {
  public:
    // One multiplicative congruential sequence. Kept as a plain aggregate
    // (no default member initialisers) so that it can be brace-initialised
    // under -std=c++11.
    struct Gen {
        uint64_t state;
        uint64_t mul;
        uint64_t mod;

        // Returns the current state and then advances the sequence.
        uint64_t next() {
            uint64_t ret = state;
            state = (state * mul) % mod;
            return ret;
        }
    };

    // Generator behind the iterators; see IteratorHelper for the contract
    // (nested type T, next(), get()).
    struct IteratorImpl {
        // Every member has an initialiser so that the default-constructed
        // end() sentinel can be copied without reading indeterminate values.
        DataGenerator* dg = nullptr;
        bool only_dups = false;
        Gen rng{}, fw_gen{}, bw_gen{};
        int fw_steps = 0;
        int bw_steps = 0;
        uint64_t val = 0;
        string ret;

        using T = string;

        IteratorImpl() {}
        IteratorImpl(DataGenerator *dg, bool only_dups)
            : dg(dg), only_dups(only_dups),
              rng{ (dg->seed * 311) % dg->prime, 78403, dg->prime },
              fw_gen{ dg->seed, dg->step, dg->prime },
              bw_gen{ dg->rev_seed, dg->rev_step, dg->prime } {}

        // Selects the next value into `val`; returns false when the stream
        // has ended.
        bool next() {
            while (fw_steps < dg->length) {
                if (rng.next() < dg->prime * (dg->repeat_prob / (dg->repeat_prob + 1))) {
                    // Emit from the backward sequence, first skipping a
                    // random number of its values.
                    while (rng.next() < dg->prime * (1 - dg->repeat_prob)) {
                        bw_gen.next();
                        bw_steps++;
                    }
                    // In duplicates-only mode, stop once the backward walk
                    // has used up all `length` positions.
                    if (only_dups && bw_steps >= dg->length) return false;
                    bw_steps++;
                    val = bw_gen.next();
                    return true;
                }
                fw_steps++;
                if (!only_dups) {
                    val = fw_gen.next();
                    return true;
                }
                // Duplicates-only mode: the forward value is not emitted
                // (fw_gen is not advanced either); try the next position.
            }
            return false;
        }

        // Expands `val` into a string of `str_len` characters: each state
        // supplies five 6-bit symbols, then the state is scrambled.
        string get() {
            constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
            constexpr uint64_t p = (1 << 21) - 19;
            // +1 due to '\0' at the end. The message keeps this valid in
            // C++11, where static_assert requires two arguments.
            static_assert(sizeof(alphabet) == 65, "alphabet must hold exactly 64 symbols");

            ret.assign(dg->str_len, ' ');
            uint64_t state = val;
            int i = 0;
            while (i < dg->str_len) {
                for (int j = 0; j < 5 && i < dg->str_len; j++)
                    ret[i++] = alphabet[(state >> (6*j)) & 0x3F];
                state = state * p + 11;
            }
            return ret;
        }
    };

    using Iterator = IteratorHelper<IteratorImpl>;

    // Iteration over the complete stream.
    Iterator begin() { return Iterator(this, false); }
    Iterator end() { return Iterator(); }

    DataGenerator(int _seed, int _length, double _repeat_prob, int _str_len) {
        prime = (1ULL << 30) - 101;
        seed = _seed + 101 + _length;
        // Scramble the seed.
        for (int i = 0; i < 100; i++) seed = (seed * 54321) % prime;

        repeat_prob = _repeat_prob;
        length = _length;
        step = 23987;

        // The backward sequence starts at seed * step^(length-1) and uses
        // the inverse multiplier.
        uint64_t x = pow_mod(step, length - 1, prime);
        rev_seed = (x * seed) % prime;
        rev_step = mult_inverse(step, prime);

        str_len = _str_len;
    }

  private:
    uint64_t seed, rev_seed, step, rev_step, prime;
    int length, str_len;
    double repeat_prob;

    // Iteration restricted to the values taken from the backward sequence.
    Iterator dups() { return Iterator(this, true); }

    // Computes x^n modulo mod by square-and-multiply. Returns 1 for n == 0.
    uint64_t pow_mod(uint64_t x, uint64_t n, uint64_t mod) {
        uint64_t result = 1;
        uint64_t base = x % mod;
        while (n > 0) {
            if (n % 2 == 1) result = (result * base) % mod;
            base = (base * base) % mod;
            n /= 2;
        }
        return result;
    }

    uint64_t mult_inverse(uint64_t x, uint64_t mod) {
        // works only for prime mod
        return pow_mod(x, mod - 2, mod);
    }

    friend void test_duplicates(int, int, double, int);
};
#include "find_duplicates.h"
#ifdef __linux__
#include <sys/time.h>
#include <sys/resource.h>
#endif
// Runs find_duplicates() on a freshly built DataGenerator and compares the
// result, as a sorted list, against the generator's own list of duplicates.
// Any mismatch is reported through EXPECT.
void test_duplicates(int seed, int length, double repeat_prob, int str_len) {
#ifdef __linux__
    // Limit the data segment to 64 MiB for the rest of the process.
    rlimit data_limit;
    data_limit.rlim_max = 64 << 20;
    data_limit.rlim_cur = data_limit.rlim_max;
    setrlimit(RLIMIT_DATA, &data_limit);
#endif

    DataGenerator generator(seed, length, repeat_prob, str_len);
    auto results = find_duplicates(generator);

    // Reference answer, obtained from the generator's private dups() view.
    vector<string> correct;
    const auto dups_end = generator.end();
    for (auto dup = generator.dups(); dup != dups_end; ++dup)
        correct.push_back(*dup);

    EXPECT(results.size() == correct.size(),
           "Wrong number of generated duplicates, got " + to_string(results.size()) +
           " and expected " + to_string(correct.size()));

    // The order of the reported duplicates is irrelevant, so compare sorted.
    sort(correct.begin(), correct.end());
    sort(results.begin(), results.end());
    for (size_t idx = 0; idx < results.size(); ++idx)
        EXPECT(results[idx] == correct[idx],
               "Wrong generated duplicate, got " + results[idx] + " and expected " + correct[idx]);
}
// Registry of named test cases. Each entry pairs a test name with a closure
// that calls test_duplicates(seed, length, repeat_prob, str_len); the entries
// grow in both item count and string length.
vector<pair<string, function<void()>>> tests = {
{"10k", [] { test_duplicates(43, 10*1000, 0.01, 13); }},
{"100k", [] { test_duplicates(43, 100*1000, 0.01, 20); }},
{"1M", [] { test_duplicates(43, 1000*1000, 0.001, 40); }},
{"10M", [] { test_duplicates(43, 10*1000*1000, 0.0001, 160); }},
{"16M", [] { test_duplicates(43, 16*1000*1000, 0.0001, 360); }},
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
// Prints the failure message to standard error and terminates the process
// with exit status 1. Never returns.
void expect_failed(const string& message) {
    std::cerr << "Test error: ";
    std::cerr << message;
    std::cerr << std::endl;
    std::exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
#!/usr/bin/env python3
import sys
def find_duplicates(data_generator):
    """Find duplicates in the given data.

    The `data_generator` is an iterable over strings, so it can be
    iterated for example using a `for` cycle:

        for item in data_generator: ...

    It can be iterated multiple times.

    The goal is to return a list of duplicated entries, reporting each
    duplicated entry only once.

    This is the assignment template: until a solution is filled in, calling
    it raises NotImplementedError.
    """
    raise NotImplementedError
#!/usr/bin/env python3
import gc
import itertools
import sys
import string
from find_duplicates import find_duplicates
class DataGenerator:
    """Deterministic source of test strings with planted duplicates.

    Three multiplicative congruential sequences modulo `prime` drive it:

    * a forward sequence starting at `seed` and multiplying by `step`;
    * a backward sequence starting at ``seed * step**(length - 1)`` and
      multiplying by the modular inverse of `step`, i.e. replaying the
      forward values in reverse order;
    * a decision sequence that chooses, at each position, between emitting
      the next forward value and emitting a value from the backward sequence.

    Iterating the object yields the whole stream and can be done repeatedly.
    `_generator(only_dups=True)` yields only the values taken from the
    backward sequence.
    """

    # 64 symbols, so that 6 bits of state select one character.
    _ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
    assert len(_ALPHABET) == 64

    # Fixed 30-character chunks used to pad strings to the requested length.
    _LONG_STRINGS = [
        "hn7fHKPgyw6GiGu3dRx8NpDPIK1eB2",
        "YPBhODY2UU7KTntxAI9YbK4JNPCPJj",
        "5qh0uhJW3ZheD65ZnNThGeeB6ds7pI",
        "wW8jgWM7cEkEmNWOsyEmOQezHGOGnf",
        "JAL6lzo1W3viaHhBrAPC992YIBdQHS",
        "Y7OtykNRwyNaZvHsLtFBYoVSJac9xM",
        "xIHUKmJFH663fuzs37PXSC8AwL9inq",
    ]

    def __init__(self, seed, length, repeat_prob, str_len):
        """Set up the sequences.

        seed        -- integer mixed with `length` and scrambled into the
                       starting state
        length      -- number of values taken from the forward sequence
        repeat_prob -- controls how often a backward value is emitted
        str_len     -- minimum length of the generated strings
        """
        self.prime = 2**30 - 101
        self.seed = seed + 101 + length
        for _ in range(100):
            self.seed = (self.seed * 54321) % self.prime
        self.repeat_prob = float(repeat_prob)
        self.length = length
        self.step = 23987
        # The backward sequence starts at seed * step^(length-1) and uses the
        # inverse multiplier.
        x = self._pow_mod(self.step, self.length - 1, self.prime)
        self.rev_seed = (x * self.seed) % self.prime
        self.rev_step = self._mult_inverse(self.step, self.prime)
        self.str_len = str_len

    def _generator(self, only_dups=False):
        """Yield the stream of strings.

        With `only_dups` set, forward values are skipped and the stream also
        ends once the backward walk has used up `length` positions.
        """
        def gen(seed, step):
            state = seed
            while True:
                yield state
                state = (state * step) % self.prime

        rng = gen((self.seed * 311) % self.prime, 78403)
        fw_gen = gen(self.seed, self.step)
        bw_gen = gen(self.rev_seed, self.rev_step)

        # Loop-invariant thresholds for the two random decisions.
        emit_backward_below = self.prime * (self.repeat_prob / (self.repeat_prob + 1))
        skip_backward_below = self.prime * (1 - self.repeat_prob)

        fw_steps = 0
        bw_steps = 0
        while fw_steps < self.length:
            if next(rng) < emit_backward_below:
                # Skip a random number of backward values, then emit one.
                while next(rng) < skip_backward_below:
                    next(bw_gen)
                    bw_steps += 1
                if only_dups and bw_steps >= self.length:
                    return
                bw_steps += 1
                yield self._make_string(next(bw_gen))
            else:
                fw_steps += 1
                if not only_dups:
                    yield self._make_string(next(fw_gen))

    def _make_string(self, x):
        """Expand the integer `x` into a string of at least `str_len` characters.

        The first (up to) five characters encode the low 30 bits of `x`, six
        bits per character. The rest is padded with whole 30-character chunks
        selected by a scrambled state, so the result may be longer than
        `str_len`.
        """
        p = 2**21 - 19
        ret = []
        state = x
        i = 0
        for j in range(0, 30, 6):
            if i >= self.str_len:
                break
            ret.append(self._ALPHABET[(state >> j) & 0x3F])
            i += 1
        state = state * p + 11
        while i < self.str_len:
            ret.append(self._LONG_STRINGS[state % len(self._LONG_STRINGS)])
            state = state * p + 11
            i += len(ret[-1])
        # A further character-by-character loop guarded by the same
        # `i < self.str_len` condition used to follow here; it could never
        # run after the loop above, so it was removed.
        return "".join(ret)

    def __iter__(self):
        return self._generator()

    def _pow_mod(self, x, n, mod):
        """Return x**n modulo mod (n >= 0), using the built-in three-argument pow."""
        return pow(x, n, mod)

    def _mult_inverse(self, x, mod):
        # works only for prime mod
        return self._pow_mod(x, mod - 2, mod)
def test_duplicates(seed, length, repeat_prob, str_len):
    """Check find_duplicates() against the generator's own list of duplicates.

    Builds a DataGenerator from the given parameters, runs find_duplicates()
    on it and asserts that the result has the expected size and, compared as
    a sorted list, the expected content.
    """
    generator = DataGenerator(seed, length, repeat_prob, str_len)
    results = find_duplicates(generator)
    # Collect garbage left over from the solution before building the
    # reference list.
    gc.collect()
    correct = list(generator._generator(only_dups=True))

    got_count, expected_count = len(results), len(correct)
    assert got_count == expected_count, \
        "Wrong number of generated duplicates, got %i and expected %i" % (got_count, expected_count)

    # The order of the reported duplicates is irrelevant.
    same_content = sorted(results) == sorted(correct)
    assert same_content, \
        "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
# Registry of (name, callable) test cases, looked up by name by the driver.
# The arguments of test_duplicates are (seed, length, repeat_prob, str_len).
# The "10M" and "16M" entries are placeholders that do nothing in the Python
# version; they only keep the test names accepted.
tests = [
("10k", lambda: test_duplicates(42, 10**4, 0.01, 14)),
("100k", lambda: test_duplicates(10, 10**5, 0.01, 20)),
("1M", lambda: test_duplicates(10, 10**6, 0.001, 340)),
("10M", lambda: True),
("16M", lambda: True),
]
if __name__ == "__main__":
    # Best effort: limit the data segment to 12 MiB. The `resource` module or
    # the limit itself may be unavailable on some platforms, in which case
    # the tests simply run unrestricted. Only the failures this step can
    # produce are caught, so that Ctrl-C and SystemExit are not swallowed.
    try:
        import resource
        resource.setrlimit(resource.RLIMIT_DATA, (12<<20, 12<<20))
    except (ImportError, AttributeError, ValueError, OSError):
        pass

    # Run the tests named on the command line, or all of them if none given.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # Report the name that was asked for, not the last name the
            # inner loop happened to visit.
            raise ValueError("Unknown test {}".format(required_test))
In this assignment, you are given a large file on input. Your goal is to find
duplicated lines and return every duplicated line once.
The challenging part of this assignment is that your program has to
run in limited memory, using at most `64MB` for C++ and `12MB` for Python
(and Python itself requires about 5MB), while the input file can be considerably
larger than this memory limit. However, you can rely on the fact that the
number of duplicated lines is considerably smaller (so that all duplicated
lines fit in memory at the same time).
Instead of handling a real file, you are given a data generator (an `iterator`
in C++ and a `generator` in Python). Note that limiting memory during the
tests works only on Linux (and not on Windows), and of course also in ReCodEx.
You can use the full standard library of Python and C++ in this assignment,
including data structure implementations (also, `bytearray` might come in handy).
Your solution must also work on other input data of the same size with a similar
number of duplicates. Hence, solutions depending on the fact that each string is
uniquely determined by some substring of it, or on similar properties of the input,
will not be accepted.
As usual, you should submit only the `find_duplicates.{h,py}` file.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment