From 87bbc07193834634fe50dd287d80a042de91b3a6 Mon Sep 17 00:00:00 2001 From: Milan Straka <milan@strakovi.com> Date: Wed, 8 May 2019 22:57:22 +0200 Subject: [PATCH] The find_duplicates assignment. --- 11-find_duplicates/cpp/Makefile | 13 +++ 11-find_duplicates/cpp/find_duplicates.h | 16 +++ .../cpp/find_duplicates_test.cpp | 100 ++++++++++++++++++ 11-find_duplicates/cpp/test_main.cpp | 43 ++++++++ 11-find_duplicates/python/find_duplicates.py | 15 +++ .../python/find_duplicates_test.py | 54 ++++++++++ 11-find_duplicates/task.md | 18 ++++ 7 files changed, 259 insertions(+) create mode 100644 11-find_duplicates/cpp/Makefile create mode 100644 11-find_duplicates/cpp/find_duplicates.h create mode 100644 11-find_duplicates/cpp/find_duplicates_test.cpp create mode 100644 11-find_duplicates/cpp/test_main.cpp create mode 100644 11-find_duplicates/python/find_duplicates.py create mode 100644 11-find_duplicates/python/find_duplicates_test.py create mode 100644 11-find_duplicates/task.md diff --git a/11-find_duplicates/cpp/Makefile b/11-find_duplicates/cpp/Makefile new file mode 100644 index 0000000..015ffbf --- /dev/null +++ b/11-find_duplicates/cpp/Makefile @@ -0,0 +1,13 @@ +test: find_duplicates_test + ./$< + +INCLUDE ?= . +CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE) + +find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp + $(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@ + +clean: + rm -f find_duplicates_test + +.PHONY: clean test diff --git a/11-find_duplicates/cpp/find_duplicates.h b/11-find_duplicates/cpp/find_duplicates.h new file mode 100644 index 0000000..bea6bbf --- /dev/null +++ b/11-find_duplicates/cpp/find_duplicates.h @@ -0,0 +1,16 @@ +#include <unordered_map> + +vector<string> find_duplicates(DataGenerator& generator) { + /* + * Find duplicates in the given data. 
#include <cstddef>
#include <iterator>
#include <string>
#include <utility>

// Replayable source of the strings fed to `find_duplicates`.
//
// The sequence is made of `base` segments. Segment `s` consists of
//   - `base^(length-1)` strings: the digit `s` followed by `length - 1`
//     digits enumerating all combinations, least-significant digit first;
//   - one extra entry: the digit `base - 1 - s` followed by `suffix`.
// No string is stored; each one is rebuilt from a position counter when the
// iterator is dereferenced, so the data can be traversed any number of times.
class DataGenerator {
 public:
  // Input iterator yielding the string at one position of the sequence.
  class Iterator {
   public:
    // Public member types so that std::iterator_traits (and therefore the
    // standard algorithms) can inspect the iterator. Inheriting from
    // std::iterator is deprecated, and doing so privately hides these types.
    using iterator_category = std::input_iterator_tag;
    using value_type = std::string;
    using difference_type = std::ptrdiff_t;
    using pointer = const std::string*;
    using reference = const std::string&;

    Iterator(int counter, const DataGenerator* generator)
        : generator_(generator), counter_(counter) {}

    // Advances to the next position; the end position is never passed.
    Iterator& operator++() {
      if (counter_ < generator_->total_size_) ++counter_;
      return *this;
    }

    Iterator operator++(int) {
      Iterator previous(*this);
      ++*this;
      return previous;
    }

    // Only the position is compared, not the generator the iterator refers to.
    bool operator==(const Iterator& other) const { return counter_ == other.counter_; }
    bool operator!=(const Iterator& other) const { return !(*this == other); }

    // Builds the string for the current position. The returned reference
    // points into this iterator and is overwritten by the next dereference.
    reference operator*() const {
      buffer_.clear();

      const int segment = counter_ / generator_->segment_size_;
      int index = counter_ % generator_->segment_size_;
      if (index + 1 == generator_->segment_size_) {
        // Last slot of the segment: the suffix entry with the mirrored digit.
        buffer_.push_back(static_cast<char>('0' + generator_->base_ - 1 - segment));
        buffer_.append(generator_->suffix_);
      } else {
        buffer_.push_back(static_cast<char>('0' + segment));
        // Write `index` in the given base, least-significant digit first.
        for (int remaining = generator_->length_ - 1; remaining > 0;
             --remaining, index /= generator_->base_)
          buffer_.push_back(static_cast<char>('0' + index % generator_->base_));
      }
      return buffer_;
    }

   private:
    const DataGenerator* generator_;
    mutable std::string buffer_;  // Scratch space reused by operator*.
    int counter_;
  };

  Iterator begin() const { return Iterator(0, this); }
  Iterator end() const { return Iterator(total_size_, this); }

  // `base` is the number of distinct digits (and of segments), `length` the
  // length of the enumerated strings, `suffix` the tail of the extra entries.
  DataGenerator(int base, int length, std::string suffix)
      : base_(base), length_(length), suffix_(std::move(suffix)) {
    // Exact integer power base^(length-1); floating-point pow followed by a
    // truncating conversion may be off by one on some platforms.
    // There is nothing to enumerate when `length` is below one.
    int combinations = length_ >= 1 ? 1 : 0;
    for (int digit = 1; digit < length_; ++digit) combinations *= base_;

    segment_size_ = combinations + 1;
    total_size_ = base_ * segment_size_;
  }

 private:
  int base_, length_;
  int segment_size_ = 0, total_size_ = 0;
  std::string suffix_;
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Table of named test cases; defined by the translation unit with the tests.
extern std::vector<std::pair<std::string, std::function<void()>>> tests;

// Prints the failure reason to standard error and terminates with status 1.
void expect_failed(const std::string& message) {
  std::cerr << "Test error: " << message << std::endl;
  std::exit(1);
}

// Runs the tests named on the command line, or every registered test when no
// name is given. Returns 1 as soon as an unknown name is encountered.
int main(int argc, char* argv[]) {
  std::vector<std::string> selected;

  if (argc <= 1) {
    for (const auto& entry : tests)
      selected.push_back(entry.first);
  } else {
    selected.assign(argv + 1, argv + argc);
  }

  for (const auto& wanted : selected) {
    // Locate the first registered test carrying the requested name.
    const std::pair<std::string, std::function<void()>>* match = nullptr;
    for (const auto& candidate : tests)
      if (candidate.first == wanted) {
        match = &candidate;
        break;
      }

    if (match == nullptr) {
      std::cerr << "Unknown test " << wanted << std::endl;
      return 1;
    }

    std::cerr << "Running test " << wanted << std::endl;
    match->second();
  }

  return 0;
}
+ """ + + raise NotImplementedError() diff --git a/11-find_duplicates/python/find_duplicates_test.py b/11-find_duplicates/python/find_duplicates_test.py new file mode 100644 index 0000000..0b08916 --- /dev/null +++ b/11-find_duplicates/python/find_duplicates_test.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +import gc +import itertools +import sys + +from find_duplicates import find_duplicates + +class DataGenerator(): + def __init__(self, base, length, suffix): + self.digits = "0123456789"[:base] + self.length = length + self.suffix = suffix + + def generator(self): + for first in range(len(self.digits)): + for sequence in itertools.product(self.digits[first], *[self.digits] * (self.length - 1)): + yield "".join(sequence) + yield self.digits[-1 - first] + self.suffix + + def __iter__(self): + return self.generator() + +def test_duplicates(base, length, suffix): + generator = DataGenerator(base, length, suffix) + results = find_duplicates(generator) + gc.collect() + + prefixes = [generator.digits[i] for o in range(0, base // 2) for i in [o, -1 - o]] + correct = [prefix + suffix for prefix in prefixes] + assert results == correct, "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct) + +tests = [ + ("10k", lambda: test_duplicates(10, 4, "101")), + ("100k", lambda: test_duplicates(10, 5, "1984")), + ("1M", lambda: test_duplicates(10, 6, "22222")), + ("10M", lambda: test_duplicates(10, 7, "314159")), + ("16M", lambda: test_duplicates(8, 8, "7654321")), +] + +if __name__ == "__main__": + try: + import resource + resource.setrlimit(resource.RLIMIT_DATA, (64<<20, 64<<20)) + except: + pass + + for required_test in sys.argv[1:] or [name for name, _ in tests]: + for name, test in tests: + if name == required_test: + print("Running test {}".format(name), file=sys.stderr) + test() + break + else: + raise ValueError("Unknown test {}".format(name)) diff --git a/11-find_duplicates/task.md b/11-find_duplicates/task.md new file mode 
100644 index 0000000..ad076c4 --- /dev/null +++ b/11-find_duplicates/task.md @@ -0,0 +1,18 @@ +In this assignment, you are given a large file as input. Your goal is to find +duplicated lines and return every duplicated line once, in the order of their +first occurrences in the file. + +The challenging part of this assignment is the fact that your program has to +run in limited memory, using at most `64MB`, and the input file can be +considerably larger than this memory limit. However, you can rely on the fact +that the number of duplicated lines is considerably smaller (so that all +duplicated lines fit in memory at the same time). + +Instead of handling a real file, you are given a data generator (an `iterator` +in C++ and a `generator` in Python). Note that limiting memory during the +tests works only on Linux (and not on Windows), and of course also in ReCodEx. + +You can use the full standard library of Python and C++ in this assignment, +including data structure implementations (also, `bytearray` might come in handy). +Note that the largest test in Python can run for several minutes. +As usual, you should submit only the `find_duplicates.{h,py}` file. -- GitLab