Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Odevzdávací Systém MO
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Model registry
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Martin Mareš
Odevzdávací Systém MO
Commits
e1fade4b
Commit
e1fade4b
authored
4 years ago
by
Václav Volhejn
Browse files
Options
Downloads
Patches
Plain Diff
Skript na zkracování jmen škol
parent
749cecaa
No related branches found
No related tags found
1 merge request
!8
Skript na zkracování oficiálních jmen škol
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/shorten-schools
+278
-0
278 additions, 0 deletions
bin/shorten-schools
with
278 additions
and
0 deletions
bin/shorten-schools
0 → 100755
+
278
−
0
View file @
e1fade4b
#!/usr/bin/env python3
# Zkrátí v databázi oficiální dlouhá jména škol na něco čitelnějšího, uloží
# do sloupce places.name
import
copy
import
random
import
re
from
sqlalchemy.orm
import
aliased
import
mo.db
as
db
# Database session shared by the query below and by the final write-back.
session = db.get_session()

# Two independent aliases of the Place table, so that a single query can
# return a school's own place row together with the row of its parent place.
school_place_t = aliased(db.Place)
parent_place_t = aliased(db.Place)
schools_q = (
    session.query(db.School, school_place_t, parent_place_t)
    .filter(db.School.place_id == school_place_t.place_id)
    .filter(parent_place_t.place_id == school_place_t.parent)
    .all()
)

# One working record per school:
#   "place_id" - key used for the final UPDATE,
#   "names"    - stack of candidate names; starts with the official name and
#                the functions below push progressively shorter variants,
#   "city"     - name of the parent place,
#   "db_place" - the school's own Place row.
schools = []
for school, place, parent_place in schools_q:
    # Sanity check on the place hierarchy.
    # NOTE(review): level 3 is presumably the municipality level - confirm
    # against the mo.db schema.
    assert parent_place.level == 3
    schools.append(
        {
            "place_id": school.place_id,
            "names": [school.official_name],
            "city": parent_place.name,
            "db_place": place,
        }
    )

# Pristine snapshot of the records, taken before any shortening is applied.
schools_orig = copy.deepcopy(schools)
def sorted_by_length(schools):
    """Return the school records ordered by the length of their current name.

    The current name of a record is the last element of its ``"names"`` list.
    The input list itself is not reordered; the result is a new list that
    shares the record dicts with ``schools``.  Records with equally long
    names keep their relative order.
    """

    def current_name_length(record):
        return len(record["names"][-1])

    return sorted(schools, key=current_name_length)
def summarize(schools, k=5):
    """Print length statistics and a few sample entries for the current names.

    The current name of a record is ``sc["names"][-1]``; the original name is
    ``sc["names"][0]``.

    Args:
        schools: list of school records (dicts with "names" and "city").
        k: how many of the longest names and how many random records to show.

    Returns:
        None.  Everything is written to standard output.
    """
    if not schools:
        # The statistics below would divide by zero and call max() on an
        # empty list, so report the empty input and stop.
        print("No schools to summarize")
        return

    lens = [len(sc["names"][-1]) for sc in schools]
    print("Average length:", sum(lens) / len(lens))
    print("Maximum length:", max(lens))

    names_by_lens = sorted_by_length(schools)
    print()
    print(f"{k} longest:")
    # The list is ascending by length, so walk it backwards for the longest.
    for sc in names_by_lens[::-1][:k]:
        print(f'{sc["names"][-1]} (@{sc["city"]})')

    # names_by_lens is a private copy, so shuffling it does not disturb the
    # caller's list.
    random.shuffle(names_by_lens)
    print()
    print(f"{k} random:")
    for sc in names_by_lens[:k]:
        print(f'Old: {sc["names"][0]}')
        print(f'{sc["names"][-1]}')
        print()
# Rewrite rules for the city name, consumed by partition().  Each entry is a
# (pattern, replacement) pair for re.sub().  partition() applies them one
# after another to the same string, so every rule sees the output of the
# previous ones, and it retries the search in the school name before each
# rule is applied.
city_rules = [
    (r"(\w)-(\w)", r"\1 - \2"),
    ("Praha", "v Praze"),
    ("v Praze 4", "v Praze 12"),
    # NOTE(review): exact repeat of the previous rule; after the first
    # application no "v Praze 4" is left, so this appears to be a no-op.
    ("v Praze 4", "v Praze 12"),
    (r"v Praze [0-9]+", "v Praze"),
    ("v Praze", "Praha"),
    None,  # Dummy at the end: gives the last rewritten form one more search
]

# (full phrase, abbreviation) pairs.  shorten_name() applies them in this
# order, so the specific "Střední ... škola" kinds must stay ahead of the
# generic "Střední škola".  Abbreviations are kept unique so that two
# different kinds of school never collapse into the same short name.
school_kinds = [
    ("Gymnázium", "G"),
    ("Vyšší odborná škola", "VOŠ"),
    ("Střední odborná škola", "SOŠ"),
    ("Střední zdravotnická škola", "SZŠ"),
    ("Střední průmyslová škola", "SPŠ"),
    ("Střední pedagogická škola", "SPgŠ"),
    ("Střední odborné učiliště", "SOU"),
    ("Střední škola", "SŠ"),
    ("Základní škola", "ZŠ"),
    ("Základní umělecká škola", "ZUŠ"),
    ("Mateřská škola", "MŠ"),
]

# Regular expressions for legal/administrative phrases that
# remove_formalities() deletes (case-insensitively).  The dots of the
# abbreviations are escaped so that they match literal dots only.
formalities = [
    r",?-? ?příspěvková organizace",
    r",? s\.r\.o\.",
    r",? o\.p\.s\.",
    r"s právem státní jazykové zkoušky",
    r",? ?okres .+$",
]
def remove_formalities(name):
    """Return ``name`` with all phrases listed in ``formalities`` deleted.

    Every entry of the module-level ``formalities`` list is used, in order,
    as a case-insensitive regular expression; each of its matches is replaced
    by the empty string.
    """
    cleaned = name
    for pattern in formalities:
        cleaned = re.compile(pattern, re.IGNORECASE).sub("", cleaned)
    return cleaned
def shorten_name(name):
    """Return ``name`` with school-kind phrases replaced by abbreviations.

    The (phrase, abbreviation) pairs come from the module-level
    ``school_kinds`` list and are applied in list order; each phrase is
    matched case-insensitively as a regular expression.
    """
    result = name
    for long_form, abbreviation in school_kinds:
        result = re.compile(long_form, re.IGNORECASE).sub(abbreviation, result)
    return result
def partition(name, city):
    """Split a school name around the single occurrence of its city.

    The city is searched for at a word boundary, together with the rest of
    the word it starts.  When it is not found, the rules from ``city_rules``
    are applied to the city one by one (cumulatively) and the search is
    repeated after each of them.

    Args:
        name: school name to split.
        city: name of the city the school belongs to.

    Returns:
        ``[before, after]`` when some form of the city occurs exactly once
        and no school-kind phrase follows it; ``None`` when the city occurs
        more than once or a school-kind phrase follows it; ``[name]`` when no
        form of the city occurs in the name at all.
    """
    for rule in city_rules:
        # Eat up rest of the word for cases like "Táborské".  The city is
        # plain text, not a pattern, so it is escaped before being embedded.
        pat = r"\b{}\w*\b".format(re.escape(city))
        if re.search(pat, name) is not None:
            parts = re.split(pat, name)
            if len(parts) != 2:
                # Multiple occurrences of city - what to do?
                return None
            tail = parts[1].lower()
            if any(kind.lower() in tail for kind, _ in school_kinds):
                # Part of the school kind follows after city name
                return None
            return parts

        # Not found in this form: rewrite the city and try again.  The
        # trailing None entry only triggers one last search.
        if rule is not None:
            city = re.sub(rule[0], rule[1], city)

    # Failed to find match
    return [name]
def shorten_in_city(city, schools):
    """Store a ``"name"`` entry in every record, built from its ``"parts"``.

    Each record's ``"parts"`` must unpack into exactly two strings.  They are
    joined with ``city`` between them, using ``"|"`` as the separator.  The
    records are modified in place and nothing is returned.
    """
    for record in schools:
        before, after = record["parts"]
        record["name"] = "|".join((before, city, after))
def remove_house_number(name):
    """Drop a trailing house number from an address at the end of ``name``.

    The address must start with ", " followed by letters, spaces and dots;
    the number itself consists of digits and slashes with an optional
    trailing lowercase ASCII letter, and must end the string.

    Returns:
        A ``(new_name, changed)`` tuple where ``changed`` is True if a house
        number was removed and False if ``name`` was left as it was.
    """
    house_number_re = re.compile(r"(, ([^\W\d_]| |\.)+) [0-9/]+[a-z]?$")
    stripped, hits = house_number_re.subn(r"\1", name)
    return stripped, bool(hits)
def should_have_comma_after_name(p_name):
    """Decide whether a comma belongs after the name part of a school name.

    We want the comma in cases like
        "Základní škola generála Zdeňka Škarvady, Ostrava-Poruba"
    but not for
        "Základní škola Dolní Ředice, okres Pardubice"

    Returns:
        False when ``p_name`` ends with a school kind from ``school_kinds``
        (either the full phrase or its abbreviation), True otherwise.
    """
    # str.endswith() accepts a tuple of suffixes, so every (phrase,
    # abbreviation) pair is checked in a single call.
    return not any(p_name.endswith(kind_pair) for kind_pair in school_kinds)
def shorten_all(schools):
    """Push progressively shorter candidate names onto every school record.

    For each record the current name (``sc["names"][-1]``) is first stripped
    of formalities and split around the city by partition(); the split is
    stored under ``sc["parts"]``.  Then, in a second pass:

    * the name with abbreviated school kinds is appended;
    * if the city was not found in the name, "<name>, <city>" is appended;
    * if the name was split, "<name part> <city>, <address without house
      number>" is appended when a house number could be removed, followed by
      the plain "<name part> <city>".

    Records for which partition() returned None only get the abbreviated
    name.

    Args:
        schools: list of school records; modified in place.

    Returns:
        The same ``schools`` list.
    """
    for sc in schools:
        sc["names"].append(remove_formalities(sc["names"][-1]))
        sc["parts"] = partition(sc["names"][-1], sc["city"])

    print(f"Total schools: {len(schools)}")
    n_split = 0
    for sc in schools:
        sc["names"].append(shorten_name(sc["names"][-1]))
        parts = sc["parts"]
        if parts is None:
            # partition() refused to split this name; keep what we have.
            continue
        if len(parts) == 1:
            # City name not found in school name
            sc["names"].append(f"{sc['names'][-1]}, {sc['city']}")
            continue

        n_split += 1
        assert len(parts) == 2
        p_name, p_place = parts
        # Strip whitespace together with the punctuation: the split leaves
        # the blank that stood next to the city on both parts, and a trailing
        # blank would defeat the endswith() test in
        # should_have_comma_after_name() and produce uneven spacing below.
        p_name = shorten_name(p_name).strip(" ,-")
        if should_have_comma_after_name(p_name):
            p_name += ","
        p_place2, changed = remove_house_number(p_place)
        if changed:
            # Longer fallback that still carries the street / district.
            sc["names"].append(f"{p_name} {sc['city']}, {p_place2.strip(' ,-')}")
        # Shortest candidate: name part and city only.
        sc["names"].append(f"{p_name} {sc['city']}")

    print(f"Successfully split up {n_split} schools")
    return schools
# Work on a fresh deep copy so that the records kept in schools_orig are not
# modified by the shortening.
schools = copy.deepcopy(schools_orig)
# shorten_all() extends the records in place and returns the very same list,
# so `shortened` and `schools` refer to one list object.
shortened = shorten_all(schools)
def is_conflict(names1, names2):
    """Return True when the two name collections share at least one name."""
    for candidate in names2:
        if candidate in names1:
            return True
    return False
def remove_conflicts(shortened):
    """Back off to longer candidate names until neighbouring records stop clashing.

    Each round sorts the records by their current name (``sc["names"][-1]``)
    and compares every record with its successor.  A pair clashes when the
    two "names" stacks share any entry.  For a clashing pair whose official
    names (``names[0]``) differ, the current name of the first record is
    marked as bad; afterwards every record whose current name is bad drops it
    and falls back to the previous candidate.  Rounds repeat until no name
    was marked.  Finally, school-kind abbreviation and formality removal are
    re-applied to the surviving name of every record.

    Args:
        shortened: list of school records; it is sorted in place and the
            "names" lists of its records are modified in place.

    Returns:
        None.  Progress is printed to standard output.
    """
    n_conflicts = 0
    again = True
    while again:
        # Sorting by the current name puts records with the same current name
        # next to each other, so only neighbours are compared.
        shortened.sort(key=lambda sc: sc["names"][-1])
        print("----------------------------")
        n_conflicts = 0
        again = False
        bad_names = set()
        for sc1, sc2 in zip(shortened, shortened[1:]):
            if is_conflict(sc1["names"], sc2["names"]):
                n_conflicts += 1
                # Pairs with identical official names are counted but not
                # resolved; they never request another round.
                if sc1["names"][0] != sc2["names"][0]:
                    bad_names.add(sc1["names"][-1])
                    again = True
        for sc in shortened:
            if sc["names"][-1] in bad_names:
                # There must be a longer candidate left to fall back to.
                assert len(sc["names"]) > 1
                sc["names"].pop()
        print(f"Found {n_conflicts} conflicts")

    # Hack - fix false positives, we always want to do these changes
    for sc in shortened:
        sc["names"].append(remove_formalities(shorten_name(sc["names"][-1])))

    print("Done (possible unremovable conflicts)")
# Resolve clashes between shortened names in place, then print statistics
# and ten sample entries for a manual sanity check.
remove_conflicts(shortened)
summarize(shortened, k=10)
# The progress bar is optional: when tqdm is not installed, fall back to a
# pass-through.  Only ImportError is caught so that other failures
# (including KeyboardInterrupt) are not silently swallowed.
try:
    from tqdm import tqdm
except ImportError:

    def tqdm(x):
        """Stand-in for tqdm.tqdm: return the iterable unchanged."""
        return x
# This takes an absurdly long time; there is surely a faster way.
# One UPDATE statement is issued per school, all committed together at the end.
places = session.query(db.Place)
for sc in tqdm(shortened):
    # NOTE(review): sc["db_place"] went through copy.deepcopy() above, so it
    # is presumably no longer tracked by the session, which would explain why
    # the explicit UPDATE below is needed as well - confirm.
    sc["db_place"].name = sc["names"][-1]
    (
        places.filter(db.Place.place_id == sc["place_id"]).update(
            {db.Place.name: sc["names"][-1]}
        )
    )
session.commit()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment