From dac98e6a953b6d0a37590be57a62a593b74843ad Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Wed, 5 May 2021 10:54:20 +0200
Subject: [PATCH] Find duplicates

---
 09-find_duplicates/cpp/Makefile               |  13 ++
 09-find_duplicates/cpp/find_duplicates.h      |  19 ++
 .../cpp/find_duplicates_test.cpp              | 212 ++++++++++++++++++
 09-find_duplicates/cpp/test_main.cpp          |  43 ++++
 09-find_duplicates/python/find_duplicates.py  |  18 ++
 .../python/find_duplicates_test.py            | 144 ++++++++++++
 09-find_duplicates/task.md                    |  22 ++
 7 files changed, 471 insertions(+)
 create mode 100644 09-find_duplicates/cpp/Makefile
 create mode 100644 09-find_duplicates/cpp/find_duplicates.h
 create mode 100644 09-find_duplicates/cpp/find_duplicates_test.cpp
 create mode 100644 09-find_duplicates/cpp/test_main.cpp
 create mode 100644 09-find_duplicates/python/find_duplicates.py
 create mode 100644 09-find_duplicates/python/find_duplicates_test.py
 create mode 100644 09-find_duplicates/task.md

diff --git a/09-find_duplicates/cpp/Makefile b/09-find_duplicates/cpp/Makefile
new file mode 100644
index 0000000..015ffbf
--- /dev/null
+++ b/09-find_duplicates/cpp/Makefile
@@ -0,0 +1,13 @@
+test: find_duplicates_test  # build the test binary if needed, then run it
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp  # only the .cpp prerequisites are compiled
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:  # delete the built test binary
+	rm -f find_duplicates_test
+
+.PHONY: clean test
diff --git a/09-find_duplicates/cpp/find_duplicates.h b/09-find_duplicates/cpp/find_duplicates.h
new file mode 100644
index 0000000..ead255c
--- /dev/null
+++ b/09-find_duplicates/cpp/find_duplicates.h
@@ -0,0 +1,19 @@
+#include <unordered_map>
+
+// Find duplicates in the given data.
+//
+// The `generator` exposes `begin()`/`end()` iterators over strings,
+// so its items can be visited with a range-based `for` loop:
+//
+//   for (const string& item : generator) {...}
+//
+// The `generator` can be traversed multiple times.
+//
+// Returns a vector of the duplicated entries, with each duplicated
+// entry reported only once.
+//
+// This is the assignment template: it currently reports no duplicates.
+vector<string> find_duplicates(DataGenerator& generator) {
+    vector<string> duplicates;
+    return duplicates;
+}
diff --git a/09-find_duplicates/cpp/find_duplicates_test.cpp b/09-find_duplicates/cpp/find_duplicates_test.cpp
new file mode 100644
index 0000000..32b0001
--- /dev/null
+++ b/09-find_duplicates/cpp/find_duplicates_test.cpp
@@ -0,0 +1,212 @@
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <type_traits>
+
+using namespace std;
+
+// If the condition is not true, call expect_failed(message) to report the error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+// Adapts an `Impl` offering `bool next()` and `T get()` into an input iterator.
+template < typename Impl >
+class IteratorHelper : public iterator<input_iterator_tag, typename Impl::T> {
+  public:
+    IteratorHelper() {}  // finished from the start, so it serves as the end iterator
+    // Forwards `args` to Impl's constructor and advances to the first item.
+    template < typename ... Args >
+    IteratorHelper(Args... args) : impl(args...) { finished = !impl.next(); }
+
+    IteratorHelper& operator++() {
+        finished = !impl.next();
+        return *this;
+    }
+    IteratorHelper operator++(int) {
+        IteratorHelper tmp(*this);
+        operator++();
+        return tmp;
+    }
+
+    // Iterators compare equal only when both are finished.
+    bool operator==(const IteratorHelper& other) const { return other.finished && finished; }
+    bool operator!=(const IteratorHelper& other) const { return !(*this == other); }
+    auto operator*() -> typename Impl::T { return impl.get(); }
+  private:
+    bool finished = true;
+    Impl impl;
+};
+
+
+// Deterministic generator of the test strings; traversable through begin()/end().
+class DataGenerator {
+  public:
+    // Multiplicative congruential sequence: next() returns `state`, then advances it.
+    struct Gen {
+      uint64_t state;
+      uint64_t mul;
+      uint64_t mod;
+      uint64_t next() {
+        uint64_t ret = state;
+        state = (state * mul) % mod;
+        return ret;
+      }
+    };
+
+    // Traversal state that IteratorHelper drives through next() and get().
+    struct IteratorImpl {
+        DataGenerator* dg = nullptr;
+        bool only_dups = false;         // true: report only items taken from the backward sequence
+        Gen rng{}, fw_gen{}, bw_gen{};  // zeroed so that a default-constructed (end) iterator is safe to copy
+        int fw_steps = 0;               // forward steps taken so far
+        int bw_steps = 0;               // values consumed from bw_gen
+        uint64_t val = 0;               // number behind the current item; get() renders it
+        string ret;
+
+        using T = string;
+
+        IteratorImpl() {}
+
+        IteratorImpl(DataGenerator *dg, bool only_dups) : dg(dg), only_dups(only_dups) {
+            rng = { (dg->seed * 311) % dg->prime, 78403, dg->prime };
+            fw_gen = { dg->seed, dg->step, dg->prime };
+            bw_gen = { dg->rev_seed, dg->rev_step, dg->prime };
+        }
+
+        // Advance to the next item; returns false once the data is exhausted.
+        bool next() {
+            while (fw_steps < dg->length) {
+                if (rng.next() < dg->prime * (dg->repeat_prob / (dg->repeat_prob + 1))) {
+                    // Repeat branch: skip a random number of backward values, then report one.
+                    while (rng.next() < dg->prime * (1 - dg->repeat_prob)) {
+                        bw_gen.next();
+                        bw_steps++;
+                    }
+                    if (only_dups && bw_steps >= dg->length) return false;
+                    bw_steps++;
+                    val = bw_gen.next();
+                    return true;
+                }
+
+                // Regular branch: count a forward step; its value is produced only when !only_dups.
+                fw_steps++;
+                if (!only_dups) {
+                    val = fw_gen.next();
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        // Render `val` as dg->str_len characters drawn from a 64-symbol alphabet.
+        string get() {
+            constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
+            constexpr uint64_t p = (1 << 21) - 19;
+            static_assert(sizeof(alphabet) == 65, "alphabet must hold 64 symbols"); // +1 due to '\0' at the end
+
+            ret = string(dg->str_len, ' ');
+            uint64_t state = val;
+            int i = 0;
+
+            while (i < dg->str_len) {
+                for (int j = 0; j < 5 && i < dg->str_len; j++)
+                    ret[i++] = alphabet[(state >> (6*j)) & 0x3F];
+                state = state * p + 11;
+            }
+
+            return ret;
+        }
+    };
+
+    using Iterator = IteratorHelper<IteratorImpl>;
+
+    Iterator begin() { return Iterator(this, false); }
+    Iterator end() { return Iterator(); }
+
+    DataGenerator(int _seed, int _length, double _repeat_prob, int _str_len) {
+        prime = (1ULL << 30) - 101;
+
+        seed = _seed + 101 + _length;
+        for (int i = 0; i < 100; i++) seed = (seed * 54321) % prime;
+
+        repeat_prob = _repeat_prob;
+        length = _length;
+        step = 23987;
+
+        // rev_seed = seed * step^(length-1) is the last forward value; the backward sequence starts there.
+        uint64_t x = pow_mod(step, length - 1, prime);
+        rev_seed = (x * seed) % prime;
+        rev_step = mult_inverse(step, prime);
+
+        str_len = _str_len;
+    }
+
+  private:
+    uint64_t seed, rev_seed, step, rev_step, prime;
+    int length, str_len;
+    double repeat_prob;
+
+    Iterator dups() { return Iterator(this, true); }
+
+    uint64_t pow_mod(uint64_t x, uint64_t n, uint64_t mod) {
+        if (n == 0) return 1;
+        if (n == 1) return x % mod;
+        uint64_t rec = pow_mod(x, n / 2, mod);
+        rec = (rec * rec) % mod;
+        if (n % 2 == 1) return (rec * x) % mod;
+        return rec;
+    }
+
+    uint64_t mult_inverse(uint64_t x, uint64_t mod) {
+        // works only for prime mod
+        return pow_mod(x, mod - 2, mod);
+    }
+
+    friend void test_duplicates(int, int, double, int);
+};
+
+#include "find_duplicates.h"
+
+#ifdef __linux__
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+// Check find_duplicates() against the duplicates the generator itself reports.
+void test_duplicates(int seed, int length, double repeat_prob, int str_len) {
+#ifdef __linux__
+    rlimit data_limit;  // cap the data segment at 64 MiB (soft and hard limit)
+    data_limit.rlim_max = 64 << 20;
+    data_limit.rlim_cur = data_limit.rlim_max;
+    setrlimit(RLIMIT_DATA, &data_limit);
+#endif
+    DataGenerator generator(seed, length, repeat_prob, str_len);
+    vector<string> results = find_duplicates(generator);
+
+    vector<string> expected;
+    const auto stop = generator.end();
+    for (auto dup = generator.dups(); dup != stop; ++dup)
+        expected.push_back(*dup);
+    const size_t got = results.size(), want = expected.size();
+    EXPECT(got == want, "Wrong number of generated duplicates, got " + to_string(got) +
+           " and expected " + to_string(want));
+
+    sort(results.begin(), results.end());
+    sort(expected.begin(), expected.end());
+    for (size_t pos = 0; pos < got; pos++)
+        EXPECT(results[pos] == expected[pos],
+               "Wrong generated duplicate, got " + results[pos] + " and expected " + expected[pos]);
+}
+
+vector<pair<string, function<void()>>> tests = {  // name -> test_duplicates(seed, length, repeat_prob, str_len)
+    {"10k", [] { test_duplicates(43, 10*1000, 0.01, 13); }},
+    {"100k", [] { test_duplicates(43, 100*1000, 0.01, 20); }},
+    {"1M", [] { test_duplicates(43, 1000*1000, 0.001, 40); }},
+    {"10M", [] { test_duplicates(43, 10*1000*1000, 0.0001, 160); }},
+    {"16M", [] { test_duplicates(43, 16*1000*1000, 0.0001, 360); }},
+};
diff --git a/09-find_duplicates/cpp/test_main.cpp b/09-find_duplicates/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/09-find_duplicates/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {  // print the failure to stderr, then exit with status 1
+    cerr << "Test error: " + message << endl;
+    exit(1);
+}
+
+// Run the tests named on the command line, or every registered test when none is named.
+int main(int argc, char* argv[]) {
+    // Collect the requested names; fall back to all registered names.
+    vector<string> selected;
+    for (int arg = 1; arg < argc; arg++)
+        selected.push_back(argv[arg]);
+    if (selected.empty())
+        for (const auto& entry : tests)
+            selected.push_back(entry.first);
+
+    for (const string& wanted : selected) {
+        // Linear search for the first registered test with this name.
+        auto match = tests.begin();
+        while (match != tests.end() && match->first != wanted)
+            ++match;
+
+        if (match == tests.end()) {
+            cerr << "Unknown test " << wanted << endl;
+            return 1;
+        }
+
+        cerr << "Running test " << wanted << endl;
+        match->second();
+    }
+
+    return 0;
+}
diff --git a/09-find_duplicates/python/find_duplicates.py b/09-find_duplicates/python/find_duplicates.py
new file mode 100644
index 0000000..424ed3d
--- /dev/null
+++ b/09-find_duplicates/python/find_duplicates.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import sys
+
+def find_duplicates(data_generator):
+    """Find duplicates in the given data.
+
+    The `data_generator` is an iterable over strings, so it can be
+    iterated for example using a `for` cycle:
+
+      for item in data_generator: ...
+
+    It can be iterated multiple times.
+
+    The goal is to return a list of duplicated entries, reporting each duplicated
+    entry only once.
+    """
+
+    raise NotImplementedError()
diff --git a/09-find_duplicates/python/find_duplicates_test.py b/09-find_duplicates/python/find_duplicates_test.py
new file mode 100644
index 0000000..f2233cc
--- /dev/null
+++ b/09-find_duplicates/python/find_duplicates_test.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+import gc
+import itertools
+import sys
+import string
+
+from find_duplicates import find_duplicates
+
+class DataGenerator():
+    def __init__(self, seed, length, repeat_prob, str_len):
+        self.prime = 2**30 - 101
+
+        self.seed = seed + 101 + length
+        for _ in range(100): self.seed = (self.seed * 54321) % self.prime
+
+        self.repeat_prob = float(repeat_prob)
+        self.length = length
+
+        self.step = 23987
+
+        x = self._pow_mod(self.step, self.length - 1, self.prime)
+        self.rev_seed = (x * self.seed) % self.prime
+        self.rev_step = self._mult_inverse(self.step, self.prime)
+
+        self.str_len = str_len
+
+    def _generator(self, only_dups=False):
+        def gen(seed, step):
+            state = seed
+            while True:
+                yield state
+                state = (state * step) % self.prime
+
+        rng = gen((self.seed * 311) % self.prime, 78403)
+        fw_gen = gen(self.seed, self.step)
+        bw_gen = gen(self.rev_seed, self.rev_step)
+
+        fw_steps = 0
+        bw_steps = 0
+        while fw_steps < self.length:
+            if next(rng) < self.prime * (self.repeat_prob / (self.repeat_prob + 1)):
+                while next(rng) < self.prime * (1 - self.repeat_prob):
+                    next(bw_gen)
+                    bw_steps += 1
+
+                if only_dups and bw_steps >= self.length: return
+
+                bw_steps += 1
+                yield self._make_string(next(bw_gen))
+            else:
+                fw_steps += 1
+                if not only_dups:
+                    yield self._make_string(next(fw_gen))
+
+    def _make_string(self, x):
+        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+        assert(len(alphabet) == 64)
+        long_strings = [
+            "hn7fHKPgyw6GiGu3dRx8NpDPIK1eB2",
+            "YPBhODY2UU7KTntxAI9YbK4JNPCPJj",
+            "5qh0uhJW3ZheD65ZnNThGeeB6ds7pI",
+            "wW8jgWM7cEkEmNWOsyEmOQezHGOGnf",
+            "JAL6lzo1W3viaHhBrAPC992YIBdQHS",
+            "Y7OtykNRwyNaZvHsLtFBYoVSJac9xM",
+            "xIHUKmJFH663fuzs37PXSC8AwL9inq",
+        ]
+        p = 2**21 - 19
+
+        ret = []
+        state = x
+        i = 0
+
+        for j in range(0, 30, 6):
+            if i >= self.str_len: break
+            ret.append(alphabet[(state >> j) & 0x3F])
+            i += 1
+
+        state = state * p + 11;
+
+        while i < self.str_len:
+            ret.append(long_strings[state % len(long_strings)])
+            state = state * p + 11;
+            i += len(ret[-1])
+
+        while i < self.str_len:
+            for j in range(0, 30, 6):
+                if i >= self.str_len: break
+                ret.append(alphabet[(state >> j) & 0x3F])
+                i += 1
+            state = state * p + 11;
+
+        return "".join(ret)
+
+    def __iter__(self):
+        return self._generator()
+
+    def _pow_mod(self, x, n, mod):
+        if n == 0: return 1
+        if n == 1: return x % mod
+        rec = self._pow_mod(x, n // 2, mod)
+        rec = (rec * rec) % mod
+        if n % 2 == 1:
+            return (rec * x) % mod
+        else:
+            return rec
+
+    def _mult_inverse(self, x, mod):
+        # works only for prime mod
+        return self._pow_mod(x, mod - 2, mod)
+
+def test_duplicates(seed, length, repeat_prob, str_len):
+    generator = DataGenerator(seed, length, repeat_prob, str_len)
+    results = find_duplicates(generator)
+    gc.collect()
+
+    correct = list(generator._generator(only_dups=True))
+    assert len(results) == len(correct), \
+        "Wrong number of generated duplicates, got %i and expected %i" % (len(results), len(correct))
+    assert sorted(results) == sorted(correct), \
+        "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
+
+tests = [  # (name, callable) pairs; test_duplicates takes seed, length, repeat_prob, str_len
+    ("10k", lambda: test_duplicates(42, 10**4, 0.01, 14)),
+    ("100k", lambda: test_duplicates(10, 10**5, 0.01, 20)),
+    ("1M", lambda: test_duplicates(10, 10**6, 0.001, 340)),
+    ("10M", lambda: True),  # placeholder: does no work
+    ("16M", lambda: True),  # placeholder: does no work
+]
+
+if __name__ == "__main__":
+    try:
+        import resource
+        resource.setrlimit(resource.RLIMIT_DATA, (12<<20, 12<<20))
+    except:
+        pass
+
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:
+            raise ValueError("Unknown test {}".format(name))
diff --git a/09-find_duplicates/task.md b/09-find_duplicates/task.md
new file mode 100644
index 0000000..154ac2e
--- /dev/null
+++ b/09-find_duplicates/task.md
@@ -0,0 +1,22 @@
+In this assignment, you are given a large file as input. Your goal is to find
+duplicated lines and return every duplicated line exactly once.
+
+The challenging part of this assignment is that your program has to run in
+limited memory, using at most `64MB` for C++ and `12MB` for Python (and
+Python itself requires about 5MB), while the input file can be considerably
+larger than this memory limit. However, you can rely on the fact that the
+number of duplicated lines is considerably smaller (so that all duplicated
+lines fit in memory at the same time).
+
+Instead of handling a real file, you are given a data generator (an `iterator`
+in C++ and a `generator` in Python). Note that limiting memory during the
+tests works only on Linux (and not on Windows), and of course also in ReCodEx.
+
+You can use the full standard library of Python and C++ in this assignment,
+including data structure implementations (also, `bytearray` might come in handy).
+Your solution must also work on other input data of the same size with a similar
+number of duplicates. Hence, solutions depending on the fact that each string is
+uniquely determined by some substring of it, or on similar properties of the
+input, will not be accepted.
+
+As usual, you should submit only the `find_duplicates.{h,py}` file.
-- 
GitLab