From 87bbc07193834634fe50dd287d80a042de91b3a6 Mon Sep 17 00:00:00 2001
From: Milan Straka <milan@strakovi.com>
Date: Wed, 8 May 2019 22:57:22 +0200
Subject: [PATCH] The find_duplicates assignment.

---
 11-find_duplicates/cpp/Makefile               |  13 +++
 11-find_duplicates/cpp/find_duplicates.h      |  16 +++
 .../cpp/find_duplicates_test.cpp              | 100 ++++++++++++++++++
 11-find_duplicates/cpp/test_main.cpp          |  43 ++++++++
 11-find_duplicates/python/find_duplicates.py  |  15 +++
 .../python/find_duplicates_test.py            |  54 ++++++++++
 11-find_duplicates/task.md                    |  18 ++++
 7 files changed, 259 insertions(+)
 create mode 100644 11-find_duplicates/cpp/Makefile
 create mode 100644 11-find_duplicates/cpp/find_duplicates.h
 create mode 100644 11-find_duplicates/cpp/find_duplicates_test.cpp
 create mode 100644 11-find_duplicates/cpp/test_main.cpp
 create mode 100644 11-find_duplicates/python/find_duplicates.py
 create mode 100644 11-find_duplicates/python/find_duplicates_test.py
 create mode 100644 11-find_duplicates/task.md

diff --git a/11-find_duplicates/cpp/Makefile b/11-find_duplicates/cpp/Makefile
new file mode 100644
index 0000000..015ffbf
--- /dev/null
+++ b/11-find_duplicates/cpp/Makefile
@@ -0,0 +1,13 @@
+test: find_duplicates_test
+	./$<
+# Header search directory; `?=` only assigns when INCLUDE is not already defined.
+INCLUDE ?= .
+CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+# Only the .cpp prerequisites are passed to the compiler; find_duplicates.h is listed so that editing it triggers a rebuild.
+find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+# Remove the built test binary.
+clean:
+	rm -f find_duplicates_test
+
+.PHONY: clean test
diff --git a/11-find_duplicates/cpp/find_duplicates.h b/11-find_duplicates/cpp/find_duplicates.h
new file mode 100644
index 0000000..bea6bbf
--- /dev/null
+++ b/11-find_duplicates/cpp/find_duplicates.h
@@ -0,0 +1,16 @@
+#include <unordered_map>
+
+vector<string> find_duplicates(DataGenerator& generator) {
+    /*
+     * Find duplicates in the given data.
+     *
+     * The `generator` provides `begin()`/`end()` input iterators over
+     * strings (usable e.g. in a range-based `for` loop) and can be
+     * traversed multiple times. Dereferencing yields a reference to a
+     * buffer inside the iterator; copy the string if you need to keep it.
+     *
+     * The goal is to return a vector of duplicated entries, reporting each
+     * duplicated entry only once, in the order of their first occurrence.
+     */
+    return vector<string>();  // Placeholder result until the solution is written.
+}
diff --git a/11-find_duplicates/cpp/find_duplicates_test.cpp b/11-find_duplicates/cpp/find_duplicates_test.cpp
new file mode 100644
index 0000000..8af7b58
--- /dev/null
+++ b/11-find_duplicates/cpp/find_duplicates_test.cpp
@@ -0,0 +1,100 @@
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <iostream>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+// Computes a deterministic stream of strings on the fly; no entry is stored.
+class DataGenerator {
+ public:
+   // Input iterator; the base is public so iterator_traits can see its typedefs. Copies are compiler-generated.
+   class Iterator : public iterator<input_iterator_tag, string> {
+     public:
+       Iterator(int counter, DataGenerator* generator) : generator(generator), counter(counter) {}
+       Iterator& operator++() { if (counter < generator->total_size) counter++; return *this; }
+       Iterator operator++(int) { Iterator tmp(*this); operator++(); return tmp; }
+       bool operator==(const Iterator& other) const { return counter == other.counter; }
+       bool operator!=(const Iterator& other) const { return counter != other.counter; }
+       // Rebuilds the current string in `data`; the returned reference is overwritten by the next dereference.
+       const string& operator*() {
+           data.clear();
+           int segment = counter / generator->segment_size, index = counter % generator->segment_size;
+           if (index + 1 == generator->segment_size) {
+               // Last entry of a segment: the mirrored digit followed by the fixed suffix.
+               data.push_back('0' + generator->base - 1 - segment);
+               data.append(generator->suffix);
+           } else {
+               // Regular entry: the segment digit, then `index` written in `base`, lowest digit first.
+               data.push_back('0' + segment);
+               for (int length = generator->length - 1; length; length--, index /= generator->base)
+                   data.push_back('0' + (index % generator->base));
+           }
+           return data;
+       }
+
+     private:
+       DataGenerator* generator;
+       string data;
+       int counter;
+   };
+
+   inline Iterator begin() { return Iterator(0, this); }
+   inline Iterator end() { return Iterator(total_size, this); }
+   // Each of the `base` segments holds base^(length-1) regular entries plus one suffix entry.
+   DataGenerator(int base, int length, string suffix) : base(base), length(length), suffix(suffix) {
+       segment_size = lroundl(powl(base, length - 1)) + 1;  // round first: powl may land just below the exact integer
+       total_size = base * segment_size;
+   }
+
+ private:
+   int base, length;
+   int segment_size, total_size;
+   string suffix;
+};
+
+#include "find_duplicates.h"
+
+#ifdef __linux__
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+void test_duplicates(int base, int length, string suffix) {  // Runs find_duplicates on generated data and checks its output.
+#ifdef __linux__
+    rlimit data_limit;
+    data_limit.rlim_cur = data_limit.rlim_max = 64 << 20;  // 64 MiB for both the soft and the hard limit
+    setrlimit(RLIMIT_DATA, &data_limit);  // the return value is not checked
+#endif
+    // Build the lazy input and run the solution under test.
+    DataGenerator generator(base, length, suffix);
+    auto results = find_duplicates(generator);
+    // Expected answer: for each i below base / 2, digit i and then digit base-1-i, each followed by the suffix.
+    vector<string> correct;
+    for (int i = 0; i < base / 2; i++) {
+        correct.push_back(string(1, '0' + i) + suffix);
+        correct.push_back(string(1, '0' + base - 1 - i) + suffix);
+    }
+    // Sizes are compared first; EXPECT is documented to halt on failure, so the element checks see equal sizes.
+    EXPECT(results.size() == correct.size(),
+           "Wrong number of generated duplicates, got " + to_string(results.size()) +
+           " and expected " + to_string(correct.size()));
+    for (int i = 0; i < int(results.size()); i++)
+        EXPECT(results[i] == correct[i],
+               "Wrong generated duplicate, got " + results[i] + " and expected " + correct[i]);
+}
+
+vector<pair<string, function<void()>>> tests = {  // (name, test body) pairs; each name approximates the number of generated strings
+    {"10k", [] { test_duplicates(10, 4, "101"); }},
+    {"100k", [] { test_duplicates(10, 5, "1984"); }},
+    {"1M", [] { test_duplicates(10, 6, "22222"); }},
+    {"10M", [] { test_duplicates(10, 7, "314159"); }},
+    {"16M", [] { test_duplicates(8, 8, "7654321"); }},
+};
diff --git a/11-find_duplicates/cpp/test_main.cpp b/11-find_duplicates/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/11-find_duplicates/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {  // Prints the failure to stderr and terminates with exit status 1.
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+int main(int argc, char* argv[]) {
+    vector<string> required_tests;
+    // Test names come from the command line; without arguments every entry of `tests` is selected.
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+    // Run the requested tests in order; the first name missing from `tests` stops the run with status 1.
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/11-find_duplicates/python/find_duplicates.py b/11-find_duplicates/python/find_duplicates.py
new file mode 100644
index 0000000..62eacce
--- /dev/null
+++ b/11-find_duplicates/python/find_duplicates.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python3
+import sys
+
+def find_duplicates(data_generator):
+    """Find duplicates in the given data.
+
+    The data_generator is an iterable (it can be iterated over in a `for` loop,
+    or passed to the built-in `iter` function to explicitly create an iterator)
+    returning strings. It can be iterated multiple times.
+
+    The goal is to return a list of duplicated entries, reporting each duplicated
+    entry only once, in the order of their first occurrence in the data.
+    """
+
+    raise NotImplementedError()
diff --git a/11-find_duplicates/python/find_duplicates_test.py b/11-find_duplicates/python/find_duplicates_test.py
new file mode 100644
index 0000000..0b08916
--- /dev/null
+++ b/11-find_duplicates/python/find_duplicates_test.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import gc
+import itertools
+import sys
+
+from find_duplicates import find_duplicates
+
+class DataGenerator():  # Iterable producing the test strings lazily.
+    def __init__(self, base, length, suffix):
+        self.digits = "0123456789"[:base]  # the first `base` decimal digits
+        self.length = length
+        self.suffix = suffix
+    # For each leading digit: all strings of `length` digits starting with it, then the mirrored digit + suffix.
+    def generator(self):
+        for first in range(len(self.digits)):
+            for sequence in itertools.product(self.digits[first], *[self.digits] * (self.length - 1)):
+                yield "".join(sequence)
+            yield self.digits[-1 - first] + self.suffix
+    # Returns a new generator on every call, so the data can be traversed repeatedly.
+    def __iter__(self):
+        return self.generator()
+
+def test_duplicates(base, length, suffix):
+    generator = DataGenerator(base, length, suffix)
+    results = find_duplicates(generator)
+    gc.collect()
+
+    prefixes = [generator.digits[i] for o in range(0, base // 2) for i in [o, -1 - o]]
+    correct = [prefix + suffix for prefix in prefixes]
+    assert results == correct, "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
+
+tests = [  # (name, zero-argument callable) pairs; each name approximates the number of generated strings
+    ("10k", lambda: test_duplicates(10, 4, "101")),
+    ("100k", lambda: test_duplicates(10, 5, "1984")),
+    ("1M", lambda: test_duplicates(10, 6, "22222")),
+    ("10M", lambda: test_duplicates(10, 7, "314159")),
+    ("16M", lambda: test_duplicates(8, 8, "7654321")),
+]
+
+if __name__ == "__main__":
+    try:
+        import resource
+        resource.setrlimit(resource.RLIMIT_DATA, (64<<20, 64<<20))
+    except:
+        pass
+
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:
+            raise ValueError("Unknown test {}".format(name))
diff --git a/11-find_duplicates/task.md b/11-find_duplicates/task.md
new file mode 100644
index 0000000..ad076c4
--- /dev/null
+++ b/11-find_duplicates/task.md
@@ -0,0 +1,18 @@
+In this assignment, you are given a large file on input. Your goal is to find
+duplicated lines and return every duplicated line once, in the order of their
+first occurrences in the file.
+
+The challenging part of this assignment is that your program has to run in
+limited memory, using at most `64MB`, while the input file can be
+considerably larger than this memory limit. However, you can rely on the fact
+that the number of duplicated lines is considerably smaller (so that all
+duplicated lines fit in memory at the same time).
+
+Instead of handling a real file, you are given a data generator (an `iterator`
+in C++ and a `generator` in Python). Note that limiting memory during the
+tests works only on Linux (and not on Windows), and of course also in ReCodEx.
+
+You can use the full standard library of Python and C++ in this assignment,
+including data structure implementations (also, `bytearray` might come in handy).
+Note that the largest test in Python can run for several minutes.
+As usual, you should submit only the `find_duplicates.{h,py}` file.
-- 
GitLab