Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
datovky
assignments
Commits
dac98e6a
Commit
dac98e6a
authored
May 05, 2021
by
Martin Mareš
Browse files
Find duplicates
parent
43914683
Changes
7
Hide whitespace changes
Inline
Side-by-side
09-find_duplicates/cpp/Makefile
0 → 100644
View file @
dac98e6a
# Default goal: build the test binary and run it.
test: find_duplicates_test
	./$<

# Directory searched for headers; may be overridden from the environment
# or the command line.
INCLUDE ?= .
CXXFLAGS=-std=c++11 -O2 -Wall -Wextra -g -Wno-sign-compare -I$(INCLUDE)

# The header is a prerequisite so that editing it triggers a rebuild, but
# only the .cpp files are handed to the compiler.
find_duplicates_test: find_duplicates_test.cpp find_duplicates.h test_main.cpp
	$(CXX) $(CXXFLAGS) $(filter %.cpp,$^) -o $@

# Remove the built test binary.
clean:
	rm -f find_duplicates_test

.PHONY: clean test
09-find_duplicates/cpp/find_duplicates.h
0 → 100644
View file @
dac98e6a
#include <unordered_map>
vector<string> find_duplicates(DataGenerator& generator) {
    /*
     * Report the entries that occur more than once in the data.
     *
     * `generator` exposes a forward iterator over strings, so the data can
     * be walked with a range-based `for`, for example:
     *
     *      for (const string& item : generator) {...}
     *
     * The data may be traversed any number of times.
     *
     * The returned vector must contain every duplicated entry exactly
     * once.
     */
    return {};
}
09-find_duplicates/cpp/find_duplicates_test.cpp
0 → 100644
View file @
dac98e6a
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <type_traits>
#include <vector>
using namespace std;

// Check `condition`; when it is false, hand `message` to expect_failed().
// The do/while(0) wrapper lets the macro be used like a single statement.
#define EXPECT(condition, message)      \
    do {                                \
        if (!(condition))               \
            expect_failed(message);     \
    } while (0)

// Declared here for EXPECT; the definition is in test_main.cpp.
void expect_failed(const string& message);
/*
 * Wraps a "pull-style" implementation class into a single-pass input
 * iterator.
 *
 * `Impl` has to provide:
 *   - a nested type `T`, the type of the produced items,
 *   - `bool next()`, which moves to the next item and returns false when
 *     there is none,
 *   - `T get()`, which returns the current item.
 *
 * A default-constructed IteratorHelper represents the end of the sequence.
 * Two iterators compare equal only if both of them are exhausted.
 */
template <typename Impl>
class IteratorHelper {
  public:
    // The iterator traits are declared as public member types. They used to
    // come from a std::iterator base class, but `class` inheritance is
    // private by default, which hid them from std::iterator_traits (and
    // std::iterator is deprecated since C++17 anyway).
    using iterator_category = std::input_iterator_tag;
    using value_type = typename Impl::T;
    using difference_type = std::ptrdiff_t;
    using pointer = value_type*;
    using reference = value_type;  // operator* returns by value

    // Past-the-end iterator.
    IteratorHelper() {}

    // Forwards `args` to the constructor of Impl and positions the iterator
    // on the first item, or at the end if the sequence is empty.
    template <typename... Args>
    IteratorHelper(Args... args) : impl(args...) {
        finished = !impl.next();
    }

    // Pre-increment: advance to the next item.
    IteratorHelper& operator++() {
        finished = !impl.next();
        return *this;
    }

    // Post-increment: advance, returning a copy of the previous position.
    IteratorHelper operator++(int) {
        IteratorHelper tmp(*this);
        operator++();
        return tmp;
    }

    // Only exhausted iterators are considered equal; this is sufficient for
    // the usual `it != end` loop condition.
    bool operator==(const IteratorHelper& other) const {
        return other.finished && finished;
    }

    bool operator!=(const IteratorHelper& other) const {
        return !(*this == other);
    }

    // Returns the current item by value.
    auto operator*() -> typename Impl::T {
        return impl.get();
    }

  private:
    bool finished = true;
    Impl impl;
};
/*
 * Deterministic source of the input strings for find_duplicates().
 *
 * The data is derived from multiplicative sequences modulo `prime`: a
 * "forward" sequence (seed, step), a "backward" sequence (rev_seed,
 * rev_step) and a third sequence `rng` that decides at every step which of
 * the two is consumed. begin()/end() iterate over all generated strings.
 * The private dups() iterator follows the same decisions but reports only
 * the items taken from the backward sequence; the friend test_duplicates()
 * uses it to build the expected answer.
 *
 * NOTE(review): rev_step is the modular inverse of step, so the backward
 * sequence presumably revisits the forward values in reverse order (which is
 * what would make its items duplicates) -- this relies on `prime` really
 * being prime.
 */
class DataGenerator {
  public:
    // Multiplicative generator: next() returns the current state and then
    // replaces it by (state * mul) % mod.
    struct Gen {
        uint64_t state;
        uint64_t mul;
        uint64_t mod;

        uint64_t next() {
            uint64_t ret = state;
            state = (state * mul) % mod;
            return ret;
        }
    };

    // Implementation object plugged into IteratorHelper. All members carry
    // default initializers so that a default-constructed (end) iterator can
    // be copied without reading indeterminate values.
    struct IteratorImpl {
        DataGenerator* dg = nullptr;
        bool only_dups = false;
        Gen rng{}, fw_gen{}, bw_gen{};
        int fw_steps = 0;
        int bw_steps = 0;
        uint64_t val = 0;
        string ret;

        using T = string;

        IteratorImpl() {}

        // `only_dups` selects whether items taken from the forward sequence
        // are reported (false) or silently skipped (true).
        IteratorImpl(DataGenerator* dg, bool only_dups) : dg(dg), only_dups(only_dups) {
            rng = {(dg->seed * 311) % dg->prime, 78403, dg->prime};
            fw_gen = {dg->seed, dg->step, dg->prime};
            bw_gen = {dg->rev_seed, dg->rev_step, dg->prime};
        }

        // Advances to the next item and stores its numeric value in `val`.
        // Returns false when the sequence is exhausted.
        bool next() {
            // A default-constructed impl has no generator to read from.
            if (dg == nullptr)
                return false;

            // Values of `rng` below this bound select the backward sequence.
            const double dup_bound = dg->prime * (dg->repeat_prob / (dg->repeat_prob + 1));
            // While `rng` stays below this bound, backward items are skipped.
            const double skip_bound = dg->prime * (1 - dg->repeat_prob);

            while (fw_steps < dg->length) {
                if (rng.next() < dup_bound) {
                    while (rng.next() < skip_bound) {
                        bw_gen.next();
                        bw_steps++;
                    }
                    // In only_dups mode stop once the backward sequence has
                    // been advanced `length` times or more.
                    if (only_dups && bw_steps >= dg->length)
                        return false;
                    bw_steps++;
                    val = bw_gen.next();
                    return true;
                }

                fw_steps++;
                if (!only_dups) {
                    val = fw_gen.next();
                    return true;
                }
                // only_dups: the forward item is not reported, keep going.
            }
            return false;
        }

        // Expands `val` into a string of `str_len` characters. Every 64-bit
        // state supplies up to five characters (6 bits each) and is then
        // advanced to state * p + 11.
        string get() {
            constexpr char alphabet[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-";
            constexpr uint64_t p = (1 << 21) - 19;
            // +1 due to '\0' at the end. The message argument is required
            // before C++17, and the Makefile builds with -std=c++11.
            static_assert(sizeof(alphabet) == 65, "alphabet must contain exactly 64 characters");

            ret = string(dg->str_len, ' ');
            uint64_t state = val;
            int i = 0;
            while (i < dg->str_len) {
                for (int j = 0; j < 5 && i < dg->str_len; j++)
                    ret[i++] = alphabet[(state >> (6 * j)) & 0x3F];
                state = state * p + 11;
            }
            return ret;
        }
    };

    using Iterator = IteratorHelper<IteratorImpl>;

    // Iterator over all generated strings.
    Iterator begin() { return Iterator(this, false); }

    // Past-the-end iterator, shared by begin() and dups().
    Iterator end() { return Iterator(); }

    // _seed        -- mixed with _length to form the initial state
    // _length      -- number of items taken from the forward sequence
    // _repeat_prob -- controls how often the backward sequence is used
    // _str_len     -- length of every generated string
    DataGenerator(int _seed, int _length, double _repeat_prob, int _str_len) {
        prime = (1ULL << 30) - 101;
        seed = _seed + 101 + _length;
        for (int i = 0; i < 100; i++)
            seed = (seed * 54321) % prime;
        repeat_prob = _repeat_prob;
        length = _length;
        step = 23987;
        // The backward sequence starts at seed * step^(length-1) and is
        // multiplied by the inverse of `step` in every step.
        uint64_t x = pow_mod(step, length - 1, prime);
        rev_seed = (x * seed) % prime;
        rev_step = mult_inverse(step, prime);
        str_len = _str_len;
    }

  private:
    // Not referenced by any member function of this class (get() uses its
    // own local table); kept so the class layout stays as it was.
    string alphabet;
    uint64_t seed, rev_seed, step, rev_step, prime;
    int length, str_len;
    double repeat_prob;

    // Iterator reporting only the items taken from the backward sequence.
    Iterator dups() { return Iterator(this, true); }

    // Computes x^n modulo mod by repeated squaring.
    uint64_t pow_mod(uint64_t x, uint64_t n, uint64_t mod) {
        if (n == 0)
            return 1;
        if (n == 1)
            return x % mod;
        uint64_t rec = pow_mod(x, n / 2, mod);
        rec = (rec * rec) % mod;
        if (n % 2 == 1)
            return (rec * x) % mod;
        return rec;
    }

    // Multiplicative inverse of x modulo mod, computed as x^(mod-2).
    uint64_t mult_inverse(uint64_t x, uint64_t mod) {
        // works only for prime mod
        return pow_mod(x, mod - 2, mod);
    }

    friend void test_duplicates(int, int, double, int);
};
#include "find_duplicates.h"
#ifdef __linux__
#include <sys/time.h>
#include <sys/resource.h>
#endif
void
test_duplicates
(
int
seed
,
int
length
,
double
repeat_prob
,
int
str_len
)
{
#ifdef __linux__
rlimit
data_limit
;
data_limit
.
rlim_cur
=
data_limit
.
rlim_max
=
64
<<
20
;
setrlimit
(
RLIMIT_DATA
,
&
data_limit
);
#endif
DataGenerator
generator
(
seed
,
length
,
repeat_prob
,
str_len
);
auto
results
=
find_duplicates
(
generator
);
vector
<
string
>
correct
;
for
(
auto
it
=
generator
.
dups
();
it
!=
generator
.
end
();
++
it
)
correct
.
push_back
(
*
it
);
EXPECT
(
results
.
size
()
==
correct
.
size
(),
"Wrong number of generated duplicates, got "
+
to_string
(
results
.
size
())
+
" and expected "
+
to_string
(
correct
.
size
()));
sort
(
correct
.
begin
(),
correct
.
end
());
sort
(
results
.
begin
(),
results
.
end
());
for
(
int
i
=
0
;
i
<
int
(
results
.
size
());
i
++
)
EXPECT
(
results
[
i
]
==
correct
[
i
],
"Wrong generated duplicate, got "
+
results
[
i
]
+
" and expected "
+
correct
[
i
]);
}
// Registry of test cases as (name, runner) pairs. The arguments passed to
// test_duplicates() are, in order: seed, length, repeat_prob, str_len.
vector<pair<string, function<void()>>> tests = {
    {"10k",  []() { test_duplicates(43, 10 * 1000, 0.01, 13); }},
    {"100k", []() { test_duplicates(43, 100 * 1000, 0.01, 20); }},
    {"1M",   []() { test_duplicates(43, 1000 * 1000, 0.001, 40); }},
    {"10M",  []() { test_duplicates(43, 10 * 1000 * 1000, 0.0001, 160); }},
    {"16M",  []() { test_duplicates(43, 16 * 1000 * 1000, 0.0001, 360); }},
};
09-find_duplicates/cpp/test_main.cpp
0 → 100644
View file @
dac98e6a
#include <cstdlib>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>
using namespace std;

// Named test cases as (name, runner) pairs; the definition is in
// find_duplicates_test.cpp.
extern vector<pair<string, function<void()>>> tests;
// Writes the failure message to standard error and terminates the process
// with exit status 1.
void expect_failed(const string& message) {
    cerr << "Test error: ";
    cerr << message;
    cerr << endl;
    exit(1);
}
int
main
(
int
argc
,
char
*
argv
[])
{
vector
<
string
>
required_tests
;
if
(
argc
>
1
)
{
required_tests
.
assign
(
argv
+
1
,
argv
+
argc
);
}
else
{
for
(
const
auto
&
test
:
tests
)
required_tests
.
push_back
(
test
.
first
);
}
for
(
const
auto
&
required_test
:
required_tests
)
{
bool
found
=
false
;
for
(
const
auto
&
test
:
tests
)
if
(
required_test
==
test
.
first
)
{
cerr
<<
"Running test "
<<
required_test
<<
endl
;
test
.
second
();
found
=
true
;
break
;
}
if
(
!
found
)
{
cerr
<<
"Unknown test "
<<
required_test
<<
endl
;
return
1
;
}
}
return
0
;
}
09-find_duplicates/python/find_duplicates.py
0 → 100644
View file @
dac98e6a
#!/usr/bin/env python3
import
sys
def find_duplicates(data_generator):
    """Return the entries that occur more than once in the given data.

    `data_generator` is an iterable of strings; it can be walked with an
    ordinary `for` loop, for example:

        for item in data_generator: ...

    and it may be iterated repeatedly.

    The result is a list of the duplicated entries in which every
    duplicated entry is reported exactly once.
    """
    raise NotImplementedError
09-find_duplicates/python/find_duplicates_test.py
0 → 100644
View file @
dac98e6a
#!/usr/bin/env python3
import
gc
import
itertools
import
sys
import
string
from
find_duplicates
import
find_duplicates
# Deterministic generator of the test data passed to find_duplicates().
# Iterating over an instance (see __iter__) yields the strings produced by
# _generator(); _generator(only_dups=True) yields only the strings taken
# from the "backward" sequence.
# NOTE(review): indentation is not visible in this view of the file, so the
# comments below follow the statement order and hedge wherever the nesting
# of a statement cannot be determined -- confirm against the real file.
class
DataGenerator
():
# Constructor. Stores length, repeat_prob (converted to float) and str_len,
# and derives seed, rev_seed, step and rev_step modulo `prime`.
def
__init__
(
self
,
seed
,
length
,
repeat_prob
,
str_len
):
# Modulus shared by all the multiplicative sequences.
self
.
prime
=
2
**
30
-
101
# Initial state: combine the seed with the length, then multiply it by
# 54321 (mod prime) one hundred times.
self
.
seed
=
seed
+
101
+
length
for
_
in
range
(
100
):
self
.
seed
=
(
self
.
seed
*
54321
)
%
self
.
prime
self
.
repeat_prob
=
float
(
repeat_prob
)
self
.
length
=
length
# Multiplier of the forward sequence.
self
.
step
=
23987
# The backward sequence starts at seed * step^(length-1) mod prime ...
x
=
self
.
_pow_mod
(
self
.
step
,
self
.
length
-
1
,
self
.
prime
)
self
.
rev_seed
=
(
x
*
self
.
seed
)
%
self
.
prime
# ... and is multiplied by the modular inverse of `step` in every step.
self
.
rev_step
=
self
.
_mult_inverse
(
self
.
step
,
self
.
prime
)
self
.
str_len
=
str_len
# Generator of the data. With only_dups=True the strings taken from the
# forward sequence are not yielded.
def
_generator
(
self
,
only_dups
=
False
):
# Endless multiplicative sequence: yields the state, then multiplies it by
# `step` modulo self.prime.
def
gen
(
seed
,
step
):
state
=
seed
while
True
:
yield
state
state
=
(
state
*
step
)
%
self
.
prime
# `rng` drives the decisions below; fw_gen and bw_gen supply the values.
rng
=
gen
((
self
.
seed
*
311
)
%
self
.
prime
,
78403
)
fw_gen
=
gen
(
self
.
seed
,
self
.
step
)
bw_gen
=
gen
(
self
.
rev_seed
,
self
.
rev_step
)
# Number of values consumed so far from each sequence.
fw_steps
=
0
bw_steps
=
0
# Runs until `length` values have been taken from the forward sequence.
while
fw_steps
<
self
.
length
:
# An `rng` value below this bound selects the backward sequence.
if
next
(
rng
)
<
self
.
prime
*
(
self
.
repeat_prob
/
(
self
.
repeat_prob
+
1
)):
# While `rng` stays below this second bound, backward values are skipped.
while
next
(
rng
)
<
self
.
prime
*
(
1
-
self
.
repeat_prob
):
next
(
bw_gen
)
bw_steps
+=
1
# In only_dups mode, stop once the backward sequence has been advanced
# `length` times or more.
if
only_dups
and
bw_steps
>=
self
.
length
:
return
bw_steps
+=
1
yield
self
.
_make_string
(
next
(
bw_gen
))
else
:
# Otherwise consume one forward step; its string is yielded only when
# only_dups is false.
fw_steps
+=
1
if
not
only_dups
:
yield
self
.
_make_string
(
next
(
fw_gen
))
# Converts the number `x` into a string built from characters of `alphabet`
# and entries of `long_strings`.
def
_make_string
(
self
,
x
):
alphabet
=
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-"
assert
(
len
(
alphabet
)
==
64
)
# Fixed 30-character fragments appended by the first `while` loop below.
long_strings
=
[
"hn7fHKPgyw6GiGu3dRx8NpDPIK1eB2"
,
"YPBhODY2UU7KTntxAI9YbK4JNPCPJj"
,
"5qh0uhJW3ZheD65ZnNThGeeB6ds7pI"
,
"wW8jgWM7cEkEmNWOsyEmOQezHGOGnf"
,
"JAL6lzo1W3viaHhBrAPC992YIBdQHS"
,
"Y7OtykNRwyNaZvHsLtFBYoVSJac9xM"
,
"xIHUKmJFH663fuzs37PXSC8AwL9inq"
,
]
# Multiplier used whenever `state` is advanced (state * p + 11).
p
=
2
**
21
-
19
# `ret` collects the pieces of the result; `i` counts the characters
# produced so far.
ret
=
[]
state
=
x
i
=
0
# Up to five characters, each selected by six bits of `state` (shifts 0, 6,
# ..., 24), stopping early once str_len characters exist.
for
j
in
range
(
0
,
30
,
6
):
if
i
>=
self
.
str_len
:
break
ret
.
append
(
alphabet
[(
state
>>
j
)
&
0x3F
])
i
+=
1
# NOTE(review): cannot tell from this view whether this state update sits
# inside the `for` loop above or after it -- confirm.
state
=
state
*
p
+
11
;
# Appends entries of long_strings, picked by state % len(long_strings), and
# advances `i` by the length of the appended entry.
while
i
<
self
.
str_len
:
ret
.
append
(
long_strings
[
state
%
len
(
long_strings
)])
state
=
state
*
p
+
11
;
i
+=
len
(
ret
[
-
1
])
# NOTE(review): this loop has the same condition as the previous one. If it
# is a sibling of that loop it can never run; if it is nested inside it, it
# fills the remaining characters from `alphabet`. The nesting is not
# recoverable from this view -- confirm.
while
i
<
self
.
str_len
:
for
j
in
range
(
0
,
30
,
6
):
if
i
>=
self
.
str_len
:
break
ret
.
append
(
alphabet
[(
state
>>
j
)
&
0x3F
])
i
+=
1
state
=
state
*
p
+
11
;
return
""
.
join
(
ret
)
# Iterating over the object yields the full data (only_dups left at False).
def
__iter__
(
self
):
return
self
.
_generator
()
# Computes x**n modulo `mod` recursively by repeated squaring.
def
_pow_mod
(
self
,
x
,
n
,
mod
):
if
n
==
0
:
return
1
if
n
==
1
:
return
x
%
mod
# Square the result for n // 2, then multiply by x once more if n is odd.
rec
=
self
.
_pow_mod
(
x
,
n
//
2
,
mod
)
rec
=
(
rec
*
rec
)
%
mod
if
n
%
2
==
1
:
return
(
rec
*
x
)
%
mod
else
:
return
rec
# Multiplicative inverse of x modulo `mod`, computed as x**(mod - 2).
def
_mult_inverse
(
self
,
x
,
mod
):
# works only for prime mod
return
self
.
_pow_mod
(
x
,
mod
-
2
,
mod
)
def test_duplicates(seed, length, repeat_prob, str_len):
    """Run find_duplicates() on generated data and verify the result.

    The arguments are passed on to DataGenerator. The returned list is
    compared, regardless of order, with the duplicates the generator itself
    reports.
    """
    generator = DataGenerator(seed, length, repeat_prob, str_len)
    results = find_duplicates(generator)
    gc.collect()

    # Reference answer produced by the generator in duplicates-only mode.
    correct = [item for item in generator._generator(only_dups=True)]

    assert len(results) == len(correct), \
        "Wrong number of generated duplicates, got %i and expected %i" % (
            len(results), len(correct))

    assert sorted(results) == sorted(correct), \
        "The generates list of duplicates is not correct, got {} and expected {}".format(
            results, correct)
# Test cases as (name, callable) pairs. The arguments of test_duplicates()
# are seed, length, repeat_prob and str_len. The "10M" and "16M" entries
# only return True in the Python version.
tests = [
    ("10k", lambda: test_duplicates(42, 10000, 0.01, 14)),
    ("100k", lambda: test_duplicates(10, 100000, 0.01, 20)),
    ("1M", lambda: test_duplicates(10, 1000000, 0.001, 340)),
    ("10M", lambda: True),
    ("16M", lambda: True),
]
if __name__ == "__main__":
    # Best-effort memory cap: limit the data segment to 12 << 20 bytes.
    # Failures are ignored on purpose (the `resource` module or the limit
    # may be unavailable), but KeyboardInterrupt and SystemExit are no
    # longer swallowed.
    try:
        import resource
        resource.setrlimit(resource.RLIMIT_DATA, (12 << 20, 12 << 20))
    except Exception:
        pass

    # Run the tests named on the command line, or all of them by default.
    for required_test in sys.argv[1:] or [name for name, _ in tests]:
        for name, test in tests:
            if name == required_test:
                print("Running test {}".format(name), file=sys.stderr)
                test()
                break
        else:
            # Report the name that was asked for. `name` would hold the last
            # registered test here, not the unknown one.
            raise ValueError("Unknown test {}".format(required_test))
09-find_duplicates/task.md
0 → 100644
View file @
dac98e6a
In this assignment, you are given a large file on input. Your goal is to find
duplicated lines and return every duplicated line once.
The challenging part of this assignment is the fact that your program has to
run in a limited memory, using at most
`64MB`
for C++ and
`12MB`
for Python
(and Python itself requires about 5MB), and the input file can be considerably
larger than this memory limit. However, you can rely on the fact that the
number of duplicated lines is considerably smaller (so that all duplicated
lines fit in the memory at the same time).
Instead of handling a real file, you are given a data generator (an
`iterator`
in C++ and a
`generator`
in Python). Note that limiting memory during the
tests works only on Linux (and not on Windows), and of course also in ReCodEx.
You can use the full standard library of Python and C++ in this assignment,
including data structure implementations (also,
`bytearray`
might come in handy).
Your solution must also work on other input data of the same size with similar
number of duplicates. Hence, solutions that depend on each string being
uniquely determined by one of its substrings, or on similar properties of the
input, will not be accepted.
As usual, you should submit only the
`find_duplicates.{h,py}`
file.
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment