From 2194946825b3c71a4d097846f5b9aea6dc9b23f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ondra=20Mi=C4=8Dka=20=40=20miles-teg?=
<mitch.ondra@gmail.com>
Date: Mon, 11 May 2020 13:39:35 +0200
Subject: [PATCH] Find duplicates
---
11-find_duplicates/cpp/Makefile | 13 ++
11-find_duplicates/cpp/find_duplicates.h | 19 ++
.../cpp/find_duplicates_test.cpp | 212 ++++++++++++++++++
11-find_duplicates/cpp/test_main.cpp | 43 ++++
11-find_duplicates/python/find_duplicates.py | 18 ++
.../python/find_duplicates_test.py | 144 ++++++++++++
11-find_duplicates/task.md | 22 ++
7 files changed, 471 insertions(+)
create mode 100644 11-find_duplicates/cpp/Makefile
create mode 100644 11-find_duplicates/cpp/find_duplicates.h
create mode 100644 11-find_duplicates/cpp/find_duplicates_test.cpp
create mode 100644 11-find_duplicates/cpp/test_main.cpp
create mode 100644 11-find_duplicates/python/find_duplicates.py
create mode 100644 11-find_duplicates/python/find_duplicates_test.py
create mode 100644 11-find_duplicates/task.md
diff --git a/11-find_duplicates/cpp/Makefile b/11-find_duplicates/cpp/Makefile
new file mode 100644
index 0000000..015ffbf
--- /dev/null
+++ b/11-find_duplicates/cpp/Makefile
@@ -0,0 +1,13 @@
+test: find_duplicates_test
+ ./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
+ $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+ rm -f find_duplicates_test
+
+.PHONY: clean test
diff --git a/11-find_duplicates/cpp/find_duplicates.h b/11-find_duplicates/cpp/find_duplicates.h
new file mode 100644
index 0000000..ead255c
--- /dev/null
+++ b/11-find_duplicates/cpp/find_duplicates.h
@@ -0,0 +1,19 @@
+#include <unordered_map>
+
+vector<string> find_duplicates(DataGenerator& generator) {
+    /*
+     * Report the entries that occur more than once in the data.
+     *
+     * `generator` exposes begin()/end() iterators yielding strings,
+     * so a range-based `for` loop works:
+     *
+     *     for (const string& item : generator) {...}
+     *
+     * Calling begin() again restarts the traversal, so the data can be
+     * scanned several times.
+     *
+     * Returns a vector containing each duplicated entry exactly once.
+     * This stub reports no duplicates.
+     */
+    return {};
+}
diff --git a/11-find_duplicates/cpp/find_duplicates_test.cpp b/11-find_duplicates/cpp/find_duplicates_test.cpp
new file mode 100644
index 0000000..32b0001
--- /dev/null
+++ b/11-find_duplicates/cpp/find_duplicates_test.cpp
@@ -0,0 +1,212 @@
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <type_traits>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+// Adapts an Impl offering `T`, `bool next()` and `T get()` to an input iterator.
+// Public inheritance keeps the std::iterator typedefs visible to std::iterator_traits.
+template < typename Impl >
+class IteratorHelper : public std::iterator<std::input_iterator_tag, typename Impl::T> {
+  public:
+    IteratorHelper() {}  // finished from the start; used as the end iterator
+    // Forwards the arguments to Impl and advances to the first element.
+    template < typename ... Args >
+    IteratorHelper(Args... args) : impl(args...) { finished = !impl.next(); }
+
+    IteratorHelper& operator++() {
+        finished = !impl.next();
+        return *this;
+    }
+    IteratorHelper operator++(int) {
+        IteratorHelper tmp(*this);
+        operator++();
+        return tmp;
+    }
+    // Two iterators are equal only when both are finished.
+    bool operator==(const IteratorHelper& other) const { return other.finished && finished; }
+    bool operator!=(const IteratorHelper& other) const { return !(*this == other); }
+    auto operator*() -> typename Impl::T { return impl.get(); }
+  private:
+    bool finished = true;
+    Impl impl;
+};
+
+
+class DataGenerator {
+  public:
+    // Multiplicative sequence: next() returns the current state and then
+    // advances it to (state * mul) % mod.
+    struct Gen {
+        uint64_t state;
+        uint64_t mul;
+        uint64_t mod;
+
+        uint64_t next() {
+            uint64_t ret = state;
+            state = (state * mul) % mod;
+            return ret;
+        }
+    };
+
+    // Traversal state driven through IteratorHelper (supplies T, next() and get()).
+    struct IteratorImpl {
+        DataGenerator* dg = nullptr;
+        bool only_dups = false;         // true: report only values of the backward sequence
+        Gen rng{}, fw_gen{}, bw_gen{};  // zeroed so that the end iterator holds no garbage
+        int fw_steps = 0;
+        int bw_steps = 0;
+        uint64_t val = 0;               // value chosen by the last successful next()
+        string ret;
+
+        using T = string;
+
+        IteratorImpl() {}
+
+        IteratorImpl(DataGenerator *dg, bool only_dups) : dg(dg), only_dups(only_dups) {
+            rng = { (dg->seed * 311) % dg->prime, 78403, dg->prime };
+            fw_gen = { dg->seed, dg->step, dg->prime };
+            bw_gen = { dg->rev_seed, dg->rev_step, dg->prime };
+        }
+
+        // Chooses the next value; returns false when the traversal is over.
+        bool next() {
+            while (fw_steps < dg->length) {
+                if (rng.next() < dg->prime * (dg->repeat_prob / (dg->repeat_prob + 1))) {
+                    // Skip a random number of backward values, then report one.
+                    while (rng.next() < dg->prime * (1 - dg->repeat_prob)) {
+                        bw_gen.next();
+                        bw_steps++;
+                    }
+                    if (only_dups && bw_steps >= dg->length) return false;
+                    bw_steps++;
+                    val = bw_gen.next();
+                    return true;
+                }
+                // Otherwise consume one forward value; only_dups mode does not report it.
+                fw_steps++;
+                if (!only_dups) {
+                    val = fw_gen.next();
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        // Expands `val` into dg->str_len characters of a 64-symbol alphabet.
+        string get() {
+            constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
+            constexpr uint64_t p = (1 << 21) - 19;
+            // The message-less form needs C++17, but the Makefile builds with -std=c++11.
+            static_assert(sizeof(alphabet) == 65, "expected 64 symbols plus the terminating NUL");
+
+            ret = string(dg->str_len, ' ');
+            uint64_t state = val;
+            int i = 0;
+            while (i < dg->str_len) {
+                // Five 6-bit groups of `state` give up to five characters per round.
+                for (int j = 0; j < 5 && i < dg->str_len; j++)
+                    ret[i++] = alphabet[(state >> (6*j)) & 0x3F];
+                state = state * p + 11;
+            }
+            return ret;
+        }
+    };
+
+    using Iterator = IteratorHelper<IteratorImpl>;
+
+    Iterator begin() { return Iterator(this, false); }
+    Iterator end() { return Iterator(); }
+
+    DataGenerator(int _seed, int _length, double _repeat_prob, int _str_len) {
+        prime = (1ULL << 30) - 101;
+        seed = _seed + 101 + _length;
+        for (int i = 0; i < 100; i++) seed = (seed * 54321) % prime;
+        repeat_prob = _repeat_prob;
+        length = _length;
+        step = 23987;
+
+        // rev_seed is seed * step^(length-1) mod prime; multiplying by the inverse of
+        // step then walks the forward sequence in reverse (assuming `prime` is prime).
+        uint64_t x = pow_mod(step, length - 1, prime);
+        rev_seed = (x * seed) % prime;
+        rev_step = mult_inverse(step, prime);
+
+        str_len = _str_len;
+    }
+
+  private:
+    uint64_t seed, rev_seed, step, rev_step, prime;
+    int length, str_len;
+    double repeat_prob;
+
+    Iterator dups() { return Iterator(this, true); }
+
+    // Computes x^n modulo `mod` by repeated squaring.
+    uint64_t pow_mod(uint64_t x, uint64_t n, uint64_t mod) {
+        if (n == 0) return 1;
+        if (n == 1) return x % mod;
+        uint64_t rec = pow_mod(x, n / 2, mod);
+        rec = (rec * rec) % mod;
+        if (n % 2 == 1) return (rec * x) % mod;
+        return rec;
+    }
+
+    uint64_t mult_inverse(uint64_t x, uint64_t mod) {
+        // works only for prime mod
+        return pow_mod(x, mod - 2, mod);
+    }
+
+    friend void test_duplicates(int, int, double, int);
+};
+
+#include "find_duplicates.h"
+
+#ifdef __linux__
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+void test_duplicates(int seed, int length, double repeat_prob, int str_len) {
+    // Compares find_duplicates() with the repeated entries reported by generator.dups().
+#ifdef __linux__
+    rlimit mem_cap;  // soft and hard RLIMIT_DATA limits are both set to 64 MiB
+    mem_cap.rlim_max = 64 << 20;
+    mem_cap.rlim_cur = mem_cap.rlim_max;
+    setrlimit(RLIMIT_DATA, &mem_cap);
+#endif
+
+    DataGenerator generator(seed, length, repeat_prob, str_len);
+    auto results = find_duplicates(generator);
+
+    vector<string> expected;
+    for (auto it = generator.dups(); it != generator.end(); ++it) expected.push_back(*it);
+
+    EXPECT(results.size() == expected.size(),
+           "Wrong number of generated duplicates, got " + to_string(results.size()) +
+           " and expected " + to_string(expected.size()));
+
+    sort(results.begin(), results.end());
+    sort(expected.begin(), expected.end());
+    for (size_t idx = 0; idx < results.size(); ++idx)
+        EXPECT(results[idx] == expected[idx],
+               "Wrong generated duplicate, got " + results[idx] + " and expected " + expected[idx]);
+}
+
+vector<pair<string, function<void()>>> tests = {  // name -> test_duplicates(seed, length, repeat_prob, str_len)
+    {"10k", [] { test_duplicates(43, 10*1000, 0.01, 13); }},
+    {"100k", [] { test_duplicates(43, 100*1000, 0.01, 20); }},
+    {"1M", [] { test_duplicates(43, 1000*1000, 0.001, 40); }},
+    {"10M", [] { test_duplicates(43, 10*1000*1000, 0.0001, 160); }},
+    {"16M", [] { test_duplicates(43, 16*1000*1000, 0.0001, 360); }},
+};
diff --git a/11-find_duplicates/cpp/test_main.cpp b/11-find_duplicates/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/11-find_duplicates/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {  // reports the failure, then ends the process
+    std::cerr << "Test error: " << message << std::endl;
+    std::exit(1);
+}
+
+int main(int argc, char* argv[]) {
+    // Collect the test names to run: the command-line arguments, or all
+    // registered tests when none are given.
+    vector<string> wanted;
+    if (argc > 1) {
+        wanted.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& entry : tests) wanted.push_back(entry.first);
+    }
+
+    for (const auto& name : wanted) {
+        // Linear search for the registered test with this name.
+        auto match = tests.begin();
+        while (match != tests.end() && match->first != name) ++match;
+
+        // Stop with a failing status as soon as a name is not recognised.
+        if (match == tests.end()) {
+            cerr << "Unknown test " << name << endl;
+            return 1;
+        }
+
+        cerr << "Running test " << name << endl;
+        match->second();
+    }
+
+    return 0;
+}
diff --git a/11-find_duplicates/python/find_duplicates.py b/11-find_duplicates/python/find_duplicates.py
new file mode 100644
index 0000000..424ed3d
--- /dev/null
+++ b/11-find_duplicates/python/find_duplicates.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import sys
+
+def find_duplicates(data_generator):
+ """Find duplicates in the given data.
+
+ The `data_generator` is an iterable over strings, so it can be
+ iterated for example using a `for` cycle:
+
+ for item in data_generator: ...
+
+ It can be iterated multiple times.
+
+ The goal is to return a list of duplicated entries, reporting each duplicated
+ entry only once.
+ """
+
+ raise NotImplementedError()
diff --git a/11-find_duplicates/python/find_duplicates_test.py b/11-find_duplicates/python/find_duplicates_test.py
new file mode 100644
index 0000000..f2233cc
--- /dev/null
+++ b/11-find_duplicates/python/find_duplicates_test.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+import gc
+import itertools
+import sys
+import string
+
+from find_duplicates import find_duplicates
+
+class DataGenerator():
+    """Deterministic stream of test strings in which some entries repeat."""
+    def __init__(self, seed, length, repeat_prob, str_len):
+        self.prime = 2**30 - 101
+
+        self.seed = seed + 101 + length
+        for _ in range(100): self.seed = (self.seed * 54321) % self.prime
+
+        self.repeat_prob = float(repeat_prob)
+        self.length = length
+        self.step = 23987
+        # rev_seed = seed * step**(length - 1) mod prime; rev_step = step**(prime - 2) mod prime.
+        x = self._pow_mod(self.step, self.length - 1, self.prime)
+        self.rev_seed = (x * self.seed) % self.prime
+        self.rev_step = self._mult_inverse(self.step, self.prime)
+
+        self.str_len = str_len
+
+    def _generator(self, only_dups=False):
+        """Yield the strings; with only_dups=True only those drawn from bw_gen."""
+        def gen(seed, step):
+            state = seed
+            while True:
+                yield state
+                state = (state * step) % self.prime
+
+        rng = gen((self.seed * 311) % self.prime, 78403)
+        fw_gen = gen(self.seed, self.step)
+        bw_gen = gen(self.rev_seed, self.rev_step)
+        fw_steps = 0
+        bw_steps = 0
+        while fw_steps < self.length:
+            if next(rng) < self.prime * (self.repeat_prob / (self.repeat_prob + 1)):
+                # Skip a random number of bw_gen values before emitting one.
+                while next(rng) < self.prime * (1 - self.repeat_prob):
+                    next(bw_gen)
+                    bw_steps += 1
+                if only_dups and bw_steps >= self.length: return
+
+                bw_steps += 1
+                yield self._make_string(next(bw_gen))
+            else:
+                fw_steps += 1
+                if not only_dups:
+                    yield self._make_string(next(fw_gen))
+
+    def _make_string(self, x):
+        """Build the string for value x (may come out longer than str_len)."""
+        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+        assert(len(alphabet) == 64)
+        long_strings = [
+            "hn7fHKPgyw6GiGu3dRx8NpDPIK1eB2",
+            "YPBhODY2UU7KTntxAI9YbK4JNPCPJj",
+            "5qh0uhJW3ZheD65ZnNThGeeB6ds7pI",
+            "wW8jgWM7cEkEmNWOsyEmOQezHGOGnf",
+            "JAL6lzo1W3viaHhBrAPC992YIBdQHS",
+            "Y7OtykNRwyNaZvHsLtFBYoVSJac9xM",
+            "xIHUKmJFH663fuzs37PXSC8AwL9inq",
+        ]
+        p = 2**21 - 19
+        ret = []
+        state = x
+        i = 0
+        # Up to five characters taken from the 6-bit groups of x.
+        for j in range(0, 30, 6):
+            if i >= self.str_len: break
+            ret.append(alphabet[(state >> j) & 0x3F])
+            i += 1
+
+        state = state * p + 11;
+        # Append whole entries of long_strings until i reaches str_len.
+        while i < self.str_len:
+            ret.append(long_strings[state % len(long_strings)])
+            state = state * p + 11;
+            i += len(ret[-1])
+        # NOTE(review): the loop above only exits once i >= str_len, so this loop looks unreachable.
+        while i < self.str_len:
+            for j in range(0, 30, 6):
+                if i >= self.str_len: break
+                ret.append(alphabet[(state >> j) & 0x3F])
+                i += 1
+            state = state * p + 11;
+
+        return "".join(ret)
+
+    def __iter__(self):
+        return self._generator()  # a fresh generator per call, so the data can be traversed repeatedly
+
+    def _pow_mod(self, x, n, mod):  # x**n % mod by repeated squaring
+        if n == 0: return 1
+        if n == 1: return x % mod
+        rec = self._pow_mod(x, n // 2, mod)
+        rec = (rec * rec) % mod
+        if n % 2 == 1:
+            return (rec * x) % mod
+        else:
+            return rec
+
+    def _mult_inverse(self, x, mod):
+        # works only for prime mod
+        return self._pow_mod(x, mod - 2, mod)
+
+def test_duplicates(seed, length, repeat_prob, str_len):
+ generator = DataGenerator(seed, length, repeat_prob, str_len)
+ results = find_duplicates(generator)
+ gc.collect()
+
+ correct = list(generator._generator(only_dups=True))
+ assert len(results) == len(correct), \
+ "Wrong number of generated duplicates, got %i and expected %i" % (len(results), len(correct))
+ assert sorted(results) == sorted(correct), \
+ "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
+
+tests = [  # (name, callable); test_duplicates takes (seed, length, repeat_prob, str_len)
+    ("10k", lambda: test_duplicates(42, 10**4, 0.01, 14)),
+    ("100k", lambda: test_duplicates(10, 10**5, 0.01, 20)),
+    ("1M", lambda: test_duplicates(10, 10**6, 0.001, 340)),
+    ("10M", lambda: True),  # no-op entry: runs no check
+    ("16M", lambda: True),  # no-op entry: runs no check
+]
+
+if __name__ == "__main__":
+ try:
+ import resource
+ resource.setrlimit(resource.RLIMIT_DATA, (12<<20, 12<<20))
+ except:
+ pass
+
+ for required_test in sys.argv[1:] or [name for name, _ in tests]:
+ for name, test in tests:
+ if name == required_test:
+ print("Running test {}".format(name), file=sys.stderr)
+ test()
+ break
+ else:
+ raise ValueError("Unknown test {}".format(name))
diff --git a/11-find_duplicates/task.md b/11-find_duplicates/task.md
new file mode 100644
index 0000000..154ac2e
--- /dev/null
+++ b/11-find_duplicates/task.md
@@ -0,0 +1,22 @@
+In this assignment, you are given a large file as input. Your goal is to find
+duplicated lines and return every duplicated line once.
+
+The challenging part of this assignment is the fact that your program has to
+run in limited memory, using at most `64MB` for C++ and `12MB` for Python
+(and Python itself requires about 5MB), and the input file can be considerably
+larger than this memory limit. However, you can rely on the fact that the
+number of duplicated lines is considerably smaller (so that all duplicated
+lines fit in the memory at the same time).
+
+Instead of handling a real file, you are given a data generator (an `iterator`
+in C++ and a `generator` in Python). Note that limiting memory during the
+tests works only on Linux (and not on Windows), and of course also in ReCodEx.
+
+You can use the full standard library of Python and C++ in this assignment,
+including data structure implementations (also, `bytearray` might come in handy).
+Your solution must also work on other input data of the same size with similar
+number of duplicates. Hence, solutions depending on the fact that each string is
+uniquely determined by some of its substrings, or on similar properties of the
+input, will not be accepted.
+
+As usual, you should submit only the `find_duplicates.{h,py}` file.
--
GitLab