Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Odevzdávací Systém MO
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Model registry
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Martin Mareš
Odevzdávací Systém MO
Commits
621db11d
Commit
621db11d
authored
4 years ago
by
Václav Volhejn
Browse files
Options
Downloads
Patches
Plain Diff
Okomentovat
parent
e1fade4b
No related branches found
No related tags found
1 merge request
!8
Skript na zkracování oficiálních jmen škol
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/shorten-schools
+59
-43
59 additions, 43 deletions
bin/shorten-schools
with
59 additions
and
43 deletions
bin/shorten-schools
+
59
−
43
View file @
621db11d
#!/usr/bin/env python3
#!/usr/bin/env python3
"""
Zkrátí v databázi oficiální dlouhá jména škol na něco čitelnějšího, uloží
do sloupce places.name.

Algoritmus se jména snaží dostat do podoby ZKRÁCENÉ_JMÉNO, kde
    ZKRÁCENÉ_JMÉNO = NÁZEV MÍSTO
    NÁZEV = např. "SŠ", "ZŠ T. G. Masaryka", "SPŠ strojnická a SOŠ profesora Švejcara"
    MÍSTO = MĚSTO [ULICE [Č.P.]]
        např. "Slatinice", "Praha 7", "Olomouc, Svatoplukova"

Může existovat víc možností zkrácení, např.
    ZŠ a MŠ Olomouc, Svatoplukova 11
    ZŠ a MŠ Olomouc, Svatoplukova
    ZŠ a MŠ Olomouc
Algoritmus vytvoří všechny varianty jmen a pak kontroluje, jestli při použití
nejkratší varianty ("ZŠ a MŠ Olomouc") nenastane konflikt jmen s jinou školou.
Pokud ano, zkusí použít pro obě školy delší variantu názvu. Toto se opakuje,
dokud se konflikty nevyřeší.
"""
import
copy
import
copy
import
random
import
random
import
re
import
re
import
sys
from
sqlalchemy.orm
import
aliased
from
sqlalchemy.orm
import
aliased
...
@@ -26,10 +44,11 @@ schools = []
...
@@ -26,10 +44,11 @@ schools = []
for
school
,
place
,
parent_place
in
schools_q
:
for
school
,
place
,
parent_place
in
schools_q
:
# Parent má být škola
assert
parent_place
.
level
==
3
assert
parent_place
.
level
==
3
n
=
place
.
name
on
=
school
.
official_
name
# Toto platí před prvním spuštěním skriptu, pak už ne (změníme place.name)
# assert
n == on
# assert
place.name == school.official_name
schools
.
append
(
schools
.
append
(
{
{
...
@@ -40,7 +59,8 @@ for school, place, parent_place in schools_q:
...
@@ -40,7 +59,8 @@ for school, place, parent_place in schools_q:
}
}
)
)
schools_orig
=
copy
.
deepcopy
(
schools
)
def eprint(*args, **kwargs):
    """Print to standard error instead of standard output.

    Positional and keyword arguments are forwarded to the built-in
    ``print``; the ``file`` keyword is always set to ``sys.stderr`` here,
    so callers must not pass ``file`` themselves.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def
sorted_by_length
(
schools
):
def
sorted_by_length
(
schools
):
...
@@ -52,33 +72,32 @@ def sorted_by_length(schools):
...
@@ -52,33 +72,32 @@ def sorted_by_length(schools):
def
summarize
(
schools
,
k
=
5
):
def
summarize
(
schools
,
k
=
5
):
lens
=
[
len
(
sc
[
"
names
"
][
-
1
])
for
sc
in
schools
]
lens
=
[
len
(
sc
[
"
names
"
][
-
1
])
for
sc
in
schools
]
avg_len
=
sum
(
lens
)
/
len
(
schools
)
avg_len
=
sum
(
lens
)
/
len
(
schools
)
print
(
"
Average length:
"
,
avg_len
)
e
print
(
"
Average length:
"
,
avg_len
)
print
(
"
Maximum length:
"
,
max
(
lens
))
e
print
(
"
Maximum length:
"
,
max
(
lens
))
names_by_lens
=
sorted_by_length
(
schools
)
names_by_lens
=
sorted_by_length
(
schools
)
print
()
e
print
()
print
(
f
"
{
k
}
longest:
"
)
e
print
(
f
"
{
k
}
longest:
"
)
for
sc
in
names_by_lens
[::
-
1
][:
k
]:
for
sc
in
names_by_lens
[::
-
1
][:
k
]:
print
(
f
'
{
sc
[
"
names
"
][
-
1
]
}
(@
{
sc
[
"
city
"
]
}
)
'
)
e
print
(
f
'
{
sc
[
"
names
"
][
-
1
]
}
(@
{
sc
[
"
city
"
]
}
)
'
)
random
.
shuffle
(
names_by_lens
)
random
.
shuffle
(
names_by_lens
)
print
()
e
print
()
print
(
f
"
{
k
}
random:
"
)
e
print
(
f
"
{
k
}
random:
"
)
for
sc
in
names_by_lens
[:
k
]:
for
sc
in
names_by_lens
[:
k
]:
print
(
f
'
Old:
{
sc
[
"
names
"
][
0
]
}
'
)
e
print
(
f
'
Old:
{
sc
[
"
names
"
][
0
]
}
'
)
print
(
f
'
{
sc
[
"
names
"
][
-
1
]
}
'
)
e
print
(
f
'
{
sc
[
"
names
"
][
-
1
]
}
'
)
print
()
e
print
()
# Rewrites that partition() applies to a city name, one per loop iteration,
# while it looks for that city inside a school name. Each real entry is a
# (pattern, replacement) pair passed to re.sub(); the rewrites accumulate,
# because the rewritten city is carried into the next iteration.
city_rules = [
    (r"(\w)-(\w)", r"\1 - \2"),  # Spacing around hyphens is sometimes inconsistent
    ("Praha", "v Praze"),
    ("v Praze 4", "v Praze 12"),
    (r"v Praze [0-9]+", "v Praze"),
    ("v Praze", "Praha"),
    # Dummy: partition() skips the rewrite step when the rule is None.
    # NOTE(review): presumably this gives the city produced by the last real
    # rule one more match attempt - confirm against the full partition() body.
    None,
]
school_kinds
=
[
school_kinds
=
[
...
@@ -119,14 +138,18 @@ def shorten_name(name):
...
@@ -119,14 +138,18 @@ def shorten_name(name):
def
partition
(
name
,
city
):
def
partition
(
name
,
city
):
"""
Rozdělí název školy na část před názvem města a část po názvu města
"""
# Zkouší drobné úpravy názvu města
for
rule
in
city_rules
:
for
rule
in
city_rules
:
#
Eat up rest of the word for cases like
"Tábor
ské
"
#
Pro slova jako "Táborské" chceme odstranit i zbytek slova, nejen
"Tábor"
pat
=
r
"
\b{}\w*\b
"
.
format
(
city
)
pat
=
r
"
\b{}\w*\b
"
.
format
(
city
)
if
re
.
search
(
pat
,
name
)
is
not
None
:
if
re
.
search
(
pat
,
name
)
is
not
None
:
parts
=
re
.
split
(
pat
,
name
)
parts
=
re
.
split
(
pat
,
name
)
if
len
(
parts
)
!=
2
:
if
len
(
parts
)
!=
2
:
#
Multiple occurrences of city - what to do?
#
Název města se vyskytuje víckrát, není jasné, co dělat
return
None
return
None
else
:
else
:
ok
=
True
ok
=
True
...
@@ -135,7 +158,8 @@ def partition(name, city):
...
@@ -135,7 +158,8 @@ def partition(name, city):
ok
=
False
ok
=
False
if
not
ok
:
if
not
ok
:
# Part of the school kind follows after city name
# Názvová část školy pokračuje i po názvu města (např. "Táborské gymnázium"),
# nelze automaticky vyřešit
return
None
return
None
else
:
else
:
return
parts
return
parts
...
@@ -143,19 +167,13 @@ def partition(name, city):
...
@@ -143,19 +167,13 @@ def partition(name, city):
if
rule
is
not
None
:
if
rule
is
not
None
:
city
=
re
.
sub
(
rule
[
0
],
rule
[
1
],
city
)
city
=
re
.
sub
(
rule
[
0
],
rule
[
1
],
city
)
#
Failed to find match
#
Nenašli jsme název města
return
[
name
]
return
[
name
]
def
shorten_in_city
(
city
,
schools
):
for
sc
in
schools
:
name_p
,
place_p
=
sc
[
"
parts
"
]
sc
[
"
name
"
]
=
"
|
"
.
join
([
name_p
,
city
,
place_p
])
def remove_house_number(name):
    """Drop a trailing house number that follows a street name.

    The number is removed only when ``name`` ends with a comma-introduced
    street part (letters, spaces and dots) followed by a space and a token
    made of digits and slashes, optionally ending in one lowercase letter
    a-z.

    Returns a tuple ``(new_name, changed)``; ``changed`` is True when a
    house number was removed and False when ``name`` is returned unchanged.
    """
    house_number_tail = r"(, ([^\W\d_]| |\.)+) [0-9/]+[a-z]?$"
    stripped, hits = re.subn(house_number_tail, r"\1", name)
    was_changed = hits > 0
    return stripped, was_changed
...
@@ -177,7 +195,7 @@ def shorten_all(schools):
...
@@ -177,7 +195,7 @@ def shorten_all(schools):
sc
[
"
names
"
].
append
(
remove_formalities
(
sc
[
"
names
"
][
-
1
]))
sc
[
"
names
"
].
append
(
remove_formalities
(
sc
[
"
names
"
][
-
1
]))
sc
[
"
parts
"
]
=
partition
(
sc
[
"
names
"
][
-
1
],
sc
[
"
city
"
])
sc
[
"
parts
"
]
=
partition
(
sc
[
"
names
"
][
-
1
],
sc
[
"
city
"
])
print
(
"
Total schools: {}
"
.
format
(
len
(
schools
)))
e
print
(
"
Total schools: {}
"
.
format
(
len
(
schools
)))
n_split
=
0
n_split
=
0
...
@@ -185,9 +203,11 @@ def shorten_all(schools):
...
@@ -185,9 +203,11 @@ def shorten_all(schools):
sc
[
"
names
"
].
append
(
shorten_name
(
sc
[
"
names
"
][
-
1
]))
sc
[
"
names
"
].
append
(
shorten_name
(
sc
[
"
names
"
][
-
1
]))
if
sc
[
"
parts
"
]
is
not
None
:
if
sc
[
"
parts
"
]
is
not
None
:
if
len
(
sc
[
"
parts
"
])
==
1
:
if
len
(
sc
[
"
parts
"
])
==
1
:
#
City name not found in school name
#
Název města nenalezen v názvu školy
sc
[
"
names
"
].
append
(
f
"
{
sc
[
'
names
'
][
-
1
]
}
,
{
sc
[
'
city
'
]
}
"
)
sc
[
"
names
"
].
append
(
f
"
{
sc
[
'
names
'
][
-
1
]
}
,
{
sc
[
'
city
'
]
}
"
)
else
:
else
:
# Když máme rozdělení, můžeme zkusit odstranit číslo popisné
# a případně i celý název ulice
n_split
+=
1
n_split
+=
1
assert
len
(
sc
[
"
parts
"
])
==
2
assert
len
(
sc
[
"
parts
"
])
==
2
...
@@ -206,26 +226,21 @@ def shorten_all(schools):
...
@@ -206,26 +226,21 @@ def shorten_all(schools):
sc
[
"
names
"
].
append
(
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
"
)
sc
[
"
names
"
].
append
(
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
"
)
print
(
f
"
Successfully split up
{
n_split
}
schools
"
)
e
print
(
f
"
Successfully split up
{
n_split
}
schools
"
)
return
schools
return
schools
schools
=
copy
.
deepcopy
(
schools_orig
)
shortened
=
shorten_all
(
schools
)
def is_conflict(names1, names2):
    """Return True when the two name collections share at least one name.

    ``names1`` and ``names2`` are the candidate name variants of two
    schools. Membership is tested with ``in``, so ``names1`` may be any
    container; ``names2`` only needs to be iterable.
    """
    # A generator lets any() stop at the first shared name instead of
    # building the full list of membership results first.
    return any(name in names1 for name in names2)
def
remove_conflicts
(
shortened
):
def
remove_conflicts
(
shortened
):
n_conflicts
=
0
"""
Vrátí se k delším variantám jmen, pokud se vyskytly konflikty
"""
again
=
True
again
=
True
while
again
:
while
again
:
shortened
.
sort
(
key
=
lambda
sc
:
sc
[
"
names
"
][
-
1
])
shortened
.
sort
(
key
=
lambda
sc
:
sc
[
"
names
"
][
-
1
])
print
(
"
----------------------------
"
)
e
print
(
"
----------------------------
"
)
n_conflicts
=
0
n_conflicts
=
0
again
=
False
again
=
False
...
@@ -244,15 +259,16 @@ def remove_conflicts(shortened):
...
@@ -244,15 +259,16 @@ def remove_conflicts(shortened):
assert
len
(
sc
[
"
names
"
])
>
1
assert
len
(
sc
[
"
names
"
])
>
1
sc
[
"
names
"
].
pop
()
sc
[
"
names
"
].
pop
()
print
(
f
"
Found
{
n_conflicts
}
conflicts
"
)
e
print
(
f
"
Found
{
n_conflicts
}
conflicts
"
)
# Hack -
fix false positives, we always want to do these changes
# Hack -
tato zkrácení vždy chceme aplikovat, předpokládáme, že nevzniknou konflikty
for
sc
in
shortened
:
for
sc
in
shortened
:
sc
[
"
names
"
].
append
(
remove_formalities
(
shorten_name
(
sc
[
"
names
"
][
-
1
])))
sc
[
"
names
"
].
append
(
remove_formalities
(
shorten_name
(
sc
[
"
names
"
][
-
1
])))
print
(
"
Done (possible unremovable conflicts)
"
)
e
print
(
"
Done (possible unremovable conflicts)
"
)
shortened
=
shorten_all
(
schools
)
remove_conflicts
(
shortened
)
remove_conflicts
(
shortened
)
summarize
(
shortened
,
k
=
10
)
summarize
(
shortened
,
k
=
10
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment