Commit 87bbc071 authored by Milan Straka's avatar Milan Straka

The find_duplicates assignment.

parent 6aec72ea
# Build the test binary if it is out of date and then run it
# ($< expands to the first prerequisite, i.e. find_duplicates_test).
test: find_duplicates_test
./$<
# Directory added to the header search path; `?=` keeps any value that was
# already supplied from the environment or the command line.
INCLUDE ?= .
# C++11, optimized, with debug info and extended warnings; signed/unsigned
# comparison warnings are explicitly switched off.
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)
# The header is listed only so that editing it triggers a rebuild; the filter
# passes just the .cpp prerequisites to the compiler.
find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@
# Remove the built test binary.
clean:
rm -f find_duplicates_test
# `clean` and `test` are commands, not files to be produced.
.PHONY: clean test
#include <unordered_map>
/*
 * Report the entries that occur more than once in the given data.
 *
 * `generator` exposes begin()/end() iterators yielding strings, so it
 * can be walked with a range-based `for` loop, and it may be walked
 * as many times as needed.
 *
 * The returned vector has to contain every duplicated entry exactly
 * once, ordered by the position of its first occurrence in the data.
 */
vector<string> find_duplicates(DataGenerator& generator) {
  // Assignment placeholder: reports no duplicates until implemented.
  return {};
}
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>
using namespace std;
// Test assertion helper: when `condition` evaluates to false, `message` is
// passed to expect_failed(), which reports the error and halts the program.
// The message expression is evaluated only on failure, and the
// do { ... } while (0) wrapper lets the macro be used like a single statement.
#define EXPECT(condition, message) do { if (!(condition)) expect_failed(message); } while (0)
// Failure handler invoked by EXPECT; it is only declared at this point.
void expect_failed(const string& message);
// Deterministic, re-iterable stream of strings used as test input.
//
// The stream consists of `base` segments, each holding
// `segment_size = base^(length - 1) + 1` strings. Segment `s` (counted from 0)
// yields:
//   * every string made of digit `s` followed by `length - 1` digits smaller
//     than `base` (the trailing digits are enumerated least significant digit
//     first), and then
//   * one extra string: digit `base - 1 - s` followed by `suffix`.
// When `suffix` consists of `length - 1` digits smaller than `base`, that extra
// string is also one of the regularly enumerated strings, which is how the
// duplicates in the data arise.
//
// Nothing is stored: every string is recomputed from its position on
// dereference, so memory use does not grow with the amount of generated data.
// `length` must be at least 1 and `base` at most 10 (digits are '0'..'9').
class DataGenerator {
public:
  // Input iterator over the generated strings.
  class Iterator {
  public:
    // Iterator traits are declared directly and publicly (instead of being
    // inherited from the deprecated std::iterator), so that
    // std::iterator_traits and algorithms such as std::distance work.
    using iterator_category = std::input_iterator_tag;
    using value_type = std::string;
    using difference_type = std::ptrdiff_t;
    using pointer = const std::string*;
    using reference = const std::string&;

    // `counter` is the 0-based position in the stream; `generator` must
    // outlive the iterator. Copy operations are the implicit ones.
    Iterator(int counter, DataGenerator* generator) : generator(generator), counter(counter) {}

    // Advance by one element; the position never moves past end().
    Iterator& operator++() { if (counter < generator->total_size) counter++; return *this; }
    Iterator operator++(int) { Iterator tmp(*this); operator++(); return tmp; }

    // Iterators compare by position only.
    bool operator==(const Iterator& other) const { return counter == other.counter; }
    bool operator!=(const Iterator& other) const { return counter != other.counter; }

    // Build the string at the current position. The returned reference
    // points into this iterator and is overwritten by the next dereference.
    const std::string& operator*() {
      data.clear();
      int segment = counter / generator->segment_size, index = counter % generator->segment_size;
      if (index + 1 == generator->segment_size) {
        // Last slot of the segment: the extra, duplicate-producing string.
        data.push_back(static_cast<char>('0' + generator->base - 1 - segment));
        data.append(generator->suffix);
      } else {
        // Regular slot: segment digit followed by the base-`base` digits of
        // `index`, least significant digit first.
        data.push_back(static_cast<char>('0' + segment));
        for (int remaining = generator->length - 1; remaining; remaining--, index /= generator->base)
          data.push_back(static_cast<char>('0' + (index % generator->base)));
      }
      return data;
    }

  private:
    DataGenerator* generator;
    std::string data;
    int counter;
  };

  inline Iterator begin() { return Iterator(0, this); }
  inline Iterator end() { return Iterator(total_size, this); }

  // base:   number of distinct digits, also the number of segments.
  // length: length of every regularly enumerated string (>= 1).
  // suffix: tail of the extra string appended to each segment.
  DataGenerator(int base, int length, std::string suffix)
      : base(base), length(length), suffix(std::move(suffix)) {
    // Exact integer exponentiation; avoids a round trip through floating point.
    int power = 1;
    for (int i = 1; i < length; i++)
      power *= base;
    segment_size = power + 1;
    total_size = base * segment_size;
  }

private:
  int base, length;
  int segment_size, total_size;
  std::string suffix;
};
#include "find_duplicates.h"
#ifdef __linux__
#include <sys/time.h>
#include <sys/resource.h>
#endif
void test_duplicates(int base, int length, string suffix) {
#ifdef __linux__
rlimit data_limit;
data_limit.rlim_cur = data_limit.rlim_max = 64 << 20;
setrlimit(RLIMIT_DATA, &data_limit);
#endif
DataGenerator generator(base, length, suffix);
auto results = find_duplicates(generator);
vector<string> correct;
for (int i = 0; i < base / 2; i++) {
correct.push_back(string(1, '0' + i) + suffix);
correct.push_back(string(1, '0' + base - 1 - i) + suffix);
}
EXPECT(results.size() == correct.size(),
"Wrong number of generated duplicates, got " + to_string(results.size()) +
" and expected " + to_string(correct.size()));
for (int i = 0; i < int(results.size()); i++)
EXPECT(results[i] == correct[i],
"Wrong generated duplicate, got " + results[i] + " and expected " + correct[i]);
}
// Registry of the available tests: each entry pairs a test name with a
// callable that runs it. The names approximate the number of strings the
// generator produces, which is base * (base^(length - 1) + 1).
vector<pair<string, function<void()>>> tests = {
{"10k", [] { test_duplicates(10, 4, "101"); }},
{"100k", [] { test_duplicates(10, 5, "1984"); }},
{"1M", [] { test_duplicates(10, 6, "22222"); }},
{"10M", [] { test_duplicates(10, 7, "314159"); }},
{"16M", [] { test_duplicates(8, 8, "7654321"); }},
};
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;
extern vector<pair<string, function<void()>>> tests;
void expect_failed(const string& message) {
cerr << "Test error: " << message << endl;
exit(1);
}
int main(int argc, char* argv[]) {
vector<string> required_tests;
if (argc > 1) {
required_tests.assign(argv + 1, argv + argc);
} else {
for (const auto& test : tests)
required_tests.push_back(test.first);
}
for (const auto& required_test : required_tests) {
bool found = false;
for (const auto& test : tests)
if (required_test == test.first) {
cerr << "Running test " << required_test << endl;
test.second();
found = true;
break;
}
if (!found) {
cerr << "Unknown test " << required_test << endl;
return 1;
}
}
return 0;
}
#!/usr/bin/env python3
import sys
def find_duplicates(data_generator):
    """Return the entries that occur more than once in the data.

    `data_generator` is an iterable of strings: it can be used directly in a
    `for` loop or handed to `iter` to obtain an explicit iterator, and it may
    be iterated as many times as needed.

    The result is a list containing every duplicated entry exactly once,
    ordered by the position of its first occurrence in the data.
    """
    # Assignment placeholder: the solution is to be written here.
    raise NotImplementedError
#!/usr/bin/env python3
import gc
import itertools
import sys
from find_duplicates import find_duplicates
class DataGenerator():
    """Re-iterable source of test strings.

    For every digit position `p` (0-based) among the first `base` decimal
    digits, iteration yields all strings consisting of digit `p` followed by
    `length - 1` further digits, and then one extra string made of the digit
    at position `-1 - p` followed by `suffix`.
    """

    def __init__(self, base, length, suffix):
        self.digits = "0123456789"[:base]
        self.length = length
        self.suffix = suffix

    def generator(self):
        """Yield the whole sequence once, computing each string lazily."""
        for position, leading in enumerate(self.digits):
            trailing_alphabets = [self.digits] * (self.length - 1)
            for combination in itertools.product(leading, *trailing_alphabets):
                yield "".join(combination)
            # Extra entry closing the block of strings for this leading digit.
            yield self.digits[-1 - position] + self.suffix

    def __iter__(self):
        # A fresh generator per call, so the data can be traversed repeatedly.
        return self.generator()
def test_duplicates(base, length, suffix):
    """Run find_duplicates on generated data and check the reported duplicates.

    The expected answer alternates between digits taken from the front and
    from the back of the generator's digit string, each followed by `suffix`.
    """
    generator = DataGenerator(base, length, suffix)
    results = find_duplicates(generator)
    gc.collect()

    correct = []
    for offset in range(0, base // 2):
        correct.append(generator.digits[offset] + suffix)
        correct.append(generator.digits[-1 - offset] + suffix)

    assert results == correct, "The generates list of duplicates is not correct, got {} and expected {}".format(results, correct)
# Registry of the available tests as (name, callable) pairs. The names
# approximate the number of strings the generator yields, which is
# base * (base ** (length - 1) + 1).
tests = [
("10k", lambda: test_duplicates(10, 4, "101")),
("100k", lambda: test_duplicates(10, 5, "1984")),
("1M", lambda: test_duplicates(10, 6, "22222")),
("10M", lambda: test_duplicates(10, 7, "314159")),
("16M", lambda: test_duplicates(8, 8, "7654321")),
]
if __name__ == "__main__":
    # Best-effort 64 MiB cap on the data segment. The `resource` module is
    # missing on some platforms and the limit may be rejected by the system;
    # in both cases the tests simply run without the cap.
    try:
        import resource
        resource.setrlimit(resource.RLIMIT_DATA, (64<<20, 64<<20))
    except (ImportError, OSError, ValueError):
        pass

    # Run the tests named on the command line, or all of them when none is given.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # No registered test matched; report the name that was asked for
            # (`name` would be the last registered test at this point).
            raise ValueError("Unknown test {}".format(required_test))
In this assignment, you are given a large file on input. Your goal is to find
duplicated lines and return every duplicated line once, in the order of their
first occurrences in the file.
The challenging part of this assignment is the fact that your program has to
run in limited memory, using at most `64MB`, and the input file can be
considerably larger than this memory limit. However, you can rely on the fact
that the number of duplicated lines is considerably smaller (so that all
duplicated lines fit in the memory at the same time).
Instead of handling a real file, you are given a data generator (an `iterator`
in C++ and a `generator` in Python). Note that limiting memory during the
tests works only on Linux (and not on Windows), and of course also in ReCodEx.
You can use the full standard library of Python and C++ in this assignment,
including data structure implementations (also, `bytearray` might come in handy).
Note that the largest test in Python can run for several minutes.
As usual, you should submit only the `find_duplicates.{h,py}` file.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment