From bef042bc80dee28c055c11c4d78f22236470f449 Mon Sep 17 00:00:00 2001
From: Petr Chmel <petr@chmel.net>
Date: Mon, 28 Apr 2025 07:49:11 +0200
Subject: [PATCH] Publish find duplicates

---
 09-find_duplicates/cpp/Makefile              |  13 ++
 09-find_duplicates/cpp/find_duplicates.h     |  19 ++
 .../cpp/find_duplicates_test.cpp             | 213 ++++++++++++++++++
 09-find_duplicates/cpp/test_main.cpp         |  43 ++++
 09-find_duplicates/python/find_duplicates.py |  18 ++
 .../python/find_duplicates_test.py           | 147 ++++++++++++
 09-find_duplicates/task.md                   |  35 +++
 7 files changed, 488 insertions(+)
 create mode 100644 09-find_duplicates/cpp/Makefile
 create mode 100644 09-find_duplicates/cpp/find_duplicates.h
 create mode 100644 09-find_duplicates/cpp/find_duplicates_test.cpp
 create mode 100644 09-find_duplicates/cpp/test_main.cpp
 create mode 100644 09-find_duplicates/python/find_duplicates.py
 create mode 100644 09-find_duplicates/python/find_duplicates_test.py
 create mode 100644 09-find_duplicates/task.md

diff --git a/09-find_duplicates/cpp/Makefile b/09-find_duplicates/cpp/Makefile
new file mode 100644
index 0000000..049e7f7
--- /dev/null
+++ b/09-find_duplicates/cpp/Makefile
@@ -0,0 +1,13 @@
+test: find_duplicates_test
+	./$<
+
+INCLUDE ?= .
+CXXFLAGS=-std=c++23 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
+
+find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
+	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
+
+clean:
+	rm -f find_duplicates_test
+
+.PHONY: clean test
diff --git a/09-find_duplicates/cpp/find_duplicates.h b/09-find_duplicates/cpp/find_duplicates.h
new file mode 100644
index 0000000..ead255c
--- /dev/null
+++ b/09-find_duplicates/cpp/find_duplicates.h
@@ -0,0 +1,19 @@
+#include <unordered_map>
+
+vector<string> find_duplicates(DataGenerator& generator) {
+    /*
+     * Find duplicates in the given data.
+     *
+     * The `generator` provides a forward iterator over strings
+     * for traversing the data, so it can be iterated for example
+     * using a `for` cycle:
+     *
+     *   for (const string& item : generator) {...}
+     *
+     * The `generator` can be traversed multiple times.
+     *
+     * The goal is to return a vector of duplicated entries,
+     * reporting each duplicated entry only once.
+     */
+    return vector<string>();
+}
diff --git a/09-find_duplicates/cpp/find_duplicates_test.cpp b/09-find_duplicates/cpp/find_duplicates_test.cpp
new file mode 100644
index 0000000..95ced16
--- /dev/null
+++ b/09-find_duplicates/cpp/find_duplicates_test.cpp
@@ -0,0 +1,213 @@
+#include <cmath>
+#include <functional>
+#include <iterator>
+#include <iostream>
+#include <string>
+#include <vector>
+#include <algorithm>
+#include <type_traits>
+#include <cstdint>
+
+using namespace std;
+
+// If the condition is not true, report an error and halt.
+#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
+
+void expect_failed(const string& message);
+
+template < typename Impl >
+class IteratorHelper : iterator<input_iterator_tag, typename Impl::T> {  // wraps an Impl providing next()/get() as an iterator
+  public:
+    IteratorHelper() {}  // end sentinel: `finished` keeps its default of true
+
+    template < typename ... Args >
+    IteratorHelper(Args... args) : impl(args...) { finished = !impl.next(); }  // fetches the first element eagerly
+
+    IteratorHelper& operator++() {
+      finished = !impl.next();
+      return *this;
+    }
+
+    IteratorHelper operator++(int) {
+      IteratorHelper tmp(*this);
+      operator++();
+      return tmp;
+    }
+
+    bool operator==(const IteratorHelper& other) const { return other.finished && finished; }  // only two exhausted iterators compare equal
+    bool operator!=(const IteratorHelper& other) const { return !(*this == other); }
+    auto operator*() -> typename Impl::T { return impl.get(); }
+
+  private:
+    bool finished = true;
+    Impl impl;
+};
+
+
+class DataGenerator {
+  public:
+    struct Gen {  // sequence state, state*mul % mod, ...
+      uint64_t state;
+      uint64_t mul;
+      uint64_t mod;
+
+      uint64_t next() {  // returns the current state, then advances
+        uint64_t ret = state;
+        state = (state * mul) % mod;
+        return ret;
+      }
+    };
+
+    struct IteratorImpl {
+      DataGenerator* dg = nullptr;
+      bool only_dups = false;  // when set, values drawn from fw_gen are not emitted
+      Gen rng{}, fw_gen{}, bw_gen{};  // zero-initialised so a default-constructed (end) iterator is safe to copy
+      int fw_steps = 0;
+      int bw_steps = 0;
+      uint64_t val = 0;  // value selected by the last successful next()
+      string ret;
+
+      using T = string;
+
+      IteratorImpl() {}  // used for the end iterator: never advanced, dg stays nullptr
+
+      IteratorImpl(DataGenerator *dg, bool only_dups) : dg(dg), only_dups(only_dups) {
+        rng = { (dg->seed * 311) % dg->prime, 78403, dg->prime };
+        fw_gen = { dg->seed, dg->step, dg->prime };
+        bw_gen = { dg->rev_seed, dg->rev_step, dg->prime };
+      }
+
+      bool next() {  // selects the next value into val; false once the stream is over
+        repeat:
+        if (fw_steps >= dg->length) return false;
+
+        if (rng.next() < dg->prime * (dg->repeat_prob / (dg->repeat_prob + 1))) {
+          while (rng.next() < dg->prime * (1 - dg->repeat_prob)) {
+            bw_gen.next();
+            bw_steps++;
+          }
+
+          if (only_dups && bw_steps >= dg->length) return false;
+
+          bw_steps++;
+          val = bw_gen.next();
+          return true;
+        } else {
+          fw_steps++;
+          if (!only_dups) {
+            val = fw_gen.next();
+            return true;
+          }
+          goto repeat;
+        }
+      }
+
+      string get() {  // renders val as a string of dg->str_len characters
+        constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
+        constexpr uint64_t p = (1 << 21) - 19;
+        static_assert(sizeof(alphabet) == 65); // +1 due to '\0' at the end
+
+        ret = string(dg->str_len, ' ');
+        uint64_t state = val;
+        int i = 0;
+
+        while (i < dg->str_len) {
+          for (int j = 0; j < 5 && i < dg->str_len; j++)
+            ret[i++] = alphabet[(state >> (6*j)) & 0x3F];
+          state = state * p + 11;
+        }
+
+        return ret;
+      }
+    };
+
+    using Iterator = IteratorHelper<IteratorImpl>;
+
+    Iterator begin() { return Iterator(this, false); }
+    Iterator end() { return Iterator(); }
+
+    DataGenerator(int _seed, int _length, double _repeat_prob, int _str_len) {
+      prime = (1ULL << 30) - 101;
+
+      seed = _seed + 101 + _length;
+      for (int i = 0; i < 100; i++) seed = (seed * 54321) % prime;
+
+      repeat_prob = _repeat_prob;
+      length = _length;
+
+      step = 23987;
+
+      uint64_t x = pow_mod(step, length - 1, prime);
+      rev_seed = (x * seed) % prime;
+      rev_step = mult_inverse(step, prime);
+
+      str_len = _str_len;
+    };
+
+  private:
+    string alphabet;
+    uint64_t seed, rev_seed, step, rev_step, prime;
+    int length, str_len;
+    double repeat_prob;
+
+    Iterator dups() { return Iterator(this, true); }  // same stream restricted to the values taken from bw_gen
+
+    uint64_t pow_mod(uint64_t x, uint64_t n, uint64_t mod) {
+      if (n == 0) return 1;
+      if (n == 1) return x % mod;
+
+      uint64_t rec = pow_mod(x, n / 2, mod);
+      rec = (rec * rec) % mod;
+
+      if (n % 2 == 1) return (rec * x) % mod;
+      return rec;
+    }
+
+    uint64_t mult_inverse(uint64_t x, uint64_t mod) {
+      // works only for prime mod
+      return pow_mod(x, mod - 2, mod);
+    }
+
+    friend void test_duplicates(int, int, double, int);
+};
+
+#include "find_duplicates.h"
+
+#ifdef __linux__
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+
+void test_duplicates(int seed, int length, double repeat_prob, int str_len) {  // checks find_duplicates against generator.dups()
+#ifdef __linux__
+    rlimit data_limit;
+    data_limit.rlim_cur = data_limit.rlim_max = 64 << 20;  // 64 MiB
+    setrlimit(RLIMIT_DATA, &data_limit);
+#endif
+
+    DataGenerator generator(seed, length, repeat_prob, str_len);
+    auto results = find_duplicates(generator);
+
+    vector<string> correct;
+    for (auto it = generator.dups(); it != generator.end(); ++it)
+        correct.push_back(*it);
+
+    EXPECT(results.size() == correct.size(),
+           "Wrong number of generated duplicates, got " + to_string(results.size()) +
+           " and expected " + to_string(correct.size()));
+
+    sort(correct.begin(), correct.end());
+    sort(results.begin(), results.end());
+
+    for (int i = 0; i < int(results.size()); i++)
+        EXPECT(results[i] == correct[i],
+               "Wrong generated duplicate, got " + results[i] + " and expected " + correct[i]);
+}
+
+vector<pair<string, function<void()>>> tests = {
+    {"10k", [] { test_duplicates(43, 10*1000, 0.01, 13); }},
+    {"100k", [] { test_duplicates(43, 100*1000, 0.01, 20); }},
+    {"1M", [] { test_duplicates(43, 1000*1000, 0.001, 40); }},
+    {"10M", [] { test_duplicates(43, 10*1000*1000, 0.0001, 160); }},
+    {"16M", [] { test_duplicates(43, 16*1000*1000, 0.0001, 360); }},
+};
diff --git a/09-find_duplicates/cpp/test_main.cpp b/09-find_duplicates/cpp/test_main.cpp
new file mode 100644
index 0000000..3f4aff0
--- /dev/null
+++ b/09-find_duplicates/cpp/test_main.cpp
@@ -0,0 +1,43 @@
+#include <cstdlib>
+#include <functional>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace std;
+
+extern vector<pair<string, function<void()>>> tests;
+
+void expect_failed(const string& message) {
+    cerr << "Test error: " << message << endl;
+    exit(1);
+}
+
+int main(int argc, char* argv[]) {  // runs the tests named on the command line, or all of them
+    vector<string> required_tests;
+
+    if (argc > 1) {
+        required_tests.assign(argv + 1, argv + argc);
+    } else {
+        for (const auto& test : tests)
+            required_tests.push_back(test.first);
+    }
+
+    for (const auto& required_test : required_tests) {
+        bool found = false;
+        for (const auto& test : tests)
+            if (required_test == test.first) {
+                cerr << "Running test " << required_test << endl;
+                test.second();
+                found = true;
+                break;
+            }
+        if (!found) {
+            cerr << "Unknown test " << required_test << endl;
+            return 1;
+        }
+    }
+
+    return 0;
+}
diff --git a/09-find_duplicates/python/find_duplicates.py b/09-find_duplicates/python/find_duplicates.py
new file mode 100644
index 0000000..424ed3d
--- /dev/null
+++ b/09-find_duplicates/python/find_duplicates.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+import sys
+
+def find_duplicates(data_generator):
+    """Find duplicates in the given data.
+
+    The `data_generator` is an iterable over strings, so it can be
+    iterated for example using a `for` cycle:
+
+      for item in data_generator: ...
+
+    It can be iterated multiple times.
+
+    The goal is to return a list of duplicated entries, reporting each duplicated
+    entry only once.
+    """
+
+    raise NotImplementedError()
diff --git a/09-find_duplicates/python/find_duplicates_test.py b/09-find_duplicates/python/find_duplicates_test.py
new file mode 100644
index 0000000..af317b3
--- /dev/null
+++ b/09-find_duplicates/python/find_duplicates_test.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+import gc
+import itertools
+import sys
+import string
+
+from find_duplicates import find_duplicates
+
+class DataGenerator():  # deterministic stream of strings; every __iter__ call restarts it
+    def __init__(self, seed, length, repeat_prob, str_len):
+        self.prime = 2**30 - 101
+
+        self.seed = seed + 101 + length
+        for _ in range(100): self.seed = (self.seed * 54321) % self.prime
+
+        self.repeat_prob = float(repeat_prob)
+        self.length = length
+
+        self.step = 23987
+
+        x = self._pow_mod(self.step, self.length - 1, self.prime)
+        self.rev_seed = (x * self.seed) % self.prime
+        self.rev_step = self._mult_inverse(self.step, self.prime)
+
+        self.str_len = str_len
+
+
+    def _generator(self, only_dups=False):  # with only_dups, yields just the values taken from bw_gen
+        def gen(seed, step):
+            state = seed
+            while True:
+                yield state
+                state = (state * step) % self.prime
+
+        rng = gen((self.seed * 311) % self.prime, 78403)
+        fw_gen = gen(self.seed, self.step)
+        bw_gen = gen(self.rev_seed, self.rev_step)
+
+        fw_steps = 0
+        bw_steps = 0
+        while fw_steps < self.length:
+            if next(rng) < self.prime * (self.repeat_prob / (self.repeat_prob + 1)):
+                while next(rng) < self.prime * (1 - self.repeat_prob):
+                    next(bw_gen)
+                    bw_steps += 1
+
+                if only_dups and bw_steps >= self.length: return
+
+                bw_steps += 1
+                yield self._make_string(next(bw_gen))
+            else:
+                fw_steps += 1
+                if not only_dups:
+                    yield self._make_string(next(fw_gen))
+
+
+    def _make_string(self, x):  # expands the integer x into a string
+        alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
+        assert(len(alphabet) == 64)
+        long_strings = [
+            "hn7fHKPgyw6GiGu3dRx8NpDPIK1eB2",
+            "YPBhODY2UU7KTntxAI9YbK4JNPCPJj",
+            "5qh0uhJW3ZheD65ZnNThGeeB6ds7pI",
+            "wW8jgWM7cEkEmNWOsyEmOQezHGOGnf",
+            "JAL6lzo1W3viaHhBrAPC992YIBdQHS",
+            "Y7OtykNRwyNaZvHsLtFBYoVSJac9xM",
+            "xIHUKmJFH663fuzs37PXSC8AwL9inq",
+        ]
+        p = 2**21 - 19
+
+        ret = []
+        state = x
+        i = 0
+
+        for j in range(0, 30, 6):
+            if i >= self.str_len: break
+            ret.append(alphabet[(state >> j) & 0x3F])
+            i += 1
+
+        state = state * p + 11;
+
+        while i < self.str_len:
+            ret.append(long_strings[state % len(long_strings)])
+            state = state * p + 11;
+            i += len(ret[-1])
+
+        while i < self.str_len:
+            for j in range(0, 30, 6):
+                if i >= self.str_len: break
+                ret.append(alphabet[(state >> j) & 0x3F])
+                i += 1
+            state = state * p + 11;
+
+        return "".join(ret)
+
+    def __iter__(self):
+        return self._generator()
+
+    def _pow_mod(self, x, n, mod):
+        if n == 0: return 1
+        if n == 1: return x % mod
+        rec = self._pow_mod(x, n // 2, mod)
+        rec = (rec * rec) % mod
+        if n % 2 == 1:
+            return (rec * x) % mod
+        else:
+            return rec
+
+    def _mult_inverse(self, x, mod):
+        # works only for prime mod
+        return self._pow_mod(x, mod - 2, mod)
+
+
+def test_duplicates(seed, length, repeat_prob, str_len):  # compares find_duplicates with _generator(only_dups=True)
+    generator = DataGenerator(seed, length, repeat_prob, str_len)
+    results = find_duplicates(generator)
+    gc.collect()
+
+    correct = list(generator._generator(only_dups=True))
+    assert len(results) == len(correct), \
+            "Wrong number of generated duplicates, got %i and expected %i" % (len(results), len(correct))
+    assert sorted(results) == sorted(correct), \
+            "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
+
+tests = [
+    ("10k", lambda: test_duplicates(42, 10**4, 0.01, 14)),
+    ("100k", lambda: test_duplicates(10, 10**5, 0.01, 20)),
+    ("1M", lambda: test_duplicates(10, 10**6, 0.001, 340)),
+    ("10M", lambda: True),
+    ("16M", lambda: True),
+]
+
+if __name__ == "__main__":
+    try:
+        import resource
+        resource.setrlimit(resource.RLIMIT_DATA, (12<<20, 12<<20))
+    except Exception:  # best effort only; still lets KeyboardInterrupt/SystemExit through
+        pass
+
+    for required_test in sys.argv[1:] or [name for name, _ in tests]:
+        for name, test in tests:
+            if name == required_test:
+                print("Running test {}".format(name), file=sys.stderr)
+                test()
+                break
+        else:  # no registered test matched: report the requested name, not the loop variable
+            raise ValueError("Unknown test {}".format(required_test))
diff --git a/09-find_duplicates/task.md b/09-find_duplicates/task.md
new file mode 100644
index 0000000..fa21e0d
--- /dev/null
+++ b/09-find_duplicates/task.md
@@ -0,0 +1,35 @@
+In this assignment, you are given a large file on input. Your goal is to find
+duplicated lines and return every duplicated line once.
+
+The challenging part of this assignment is the fact that your program has to
+run in limited memory, using at most `64MB` for C++ and `12MB` for Python
+(and Python itself requires about 5MB), and the input file can be considerably
+larger than this memory limit. However, you can rely on the fact that the
+number of duplicated lines is considerably smaller (so that all duplicated
+lines fit in the memory at the same time).
+
+Instead of handling a real file, you are given a data generator (an `iterator`
+in C++ and a `generator` in Python). Note that limiting memory during the
+tests works only on Linux (and not on Windows), and of course also in ReCodEx.
+
+You can use the full standard library of Python and C++ in this assignment,
+including data structure implementations (also, `bytearray` might come in handy).
+Your solution must also work on other input data of the same size with a similar
+number of duplicates. Hence, solutions depending on the fact that each string is
+uniquely determined by some of its substrings or similar properties of the input
+will not be accepted.
+
+As usual, you should submit only the `find_duplicates.{h,py}` file.
+
+Note that due to the space constraints of the Python solutions, tests `10M` and `16M` are
+not used and are always considered successful by ReCodEx.
+
+Source code templates can be found in [git](https://gitlab.kam.mff.cuni.cz/datovky/assignments/-/tree/master).
+
+Hints:
+* The list `[False] * 2**20` requires approximately 8 MB, since Python stores it as an array of pointers to the single value `False`. Use `bytearray` instead.
+* Read the documentation of `bytearray` carefully and distinguish between the terms bit and byte.
+* In Python, do not import numpy or other libraries that need more memory to load than is available.
+* The memory limit prevents storing all keys, so trying trivial solutions which store all keys in a dictionary is a waste of time.
+* Count the number of duplicates and the number of candidates for duplicates. With properly implemented hashing, those two numbers should be very close.
+* Use profilers to trace memory usage; see e.g. https://docs.python.org/3/library/tracemalloc.html or https://valgrind.org/.
-- 
GitLab