Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Odevzdávací Systém MO
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Model registry
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Martin Mareš
Odevzdávací Systém MO
Commits
33d1b36c
Commit
33d1b36c
authored
4 years ago
by
Václav Volhejn
Browse files
Options
Downloads
Patches
Plain Diff
Opravit nalezené chyby/okrajové případy
parent
621db11d
No related branches found
No related tags found
1 merge request
!8
Skript na zkracování oficiálních jmen škol
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
bin/shorten-schools
+126
-83
126 additions, 83 deletions
bin/shorten-schools
with
126 additions
and
83 deletions
bin/shorten-schools
+
126
−
83
View file @
33d1b36c
...
@@ -23,41 +23,12 @@ import copy
...
@@ -23,41 +23,12 @@ import copy
import
random
import
random
import
re
import
re
import
sys
import
sys
import
argparse
from
sqlalchemy.orm
import
aliased
from
sqlalchemy.orm
import
aliased
import
mo.db
as
db
import
mo.db
as
db
session
=
db
.
get_session
()
school_place_t
=
aliased
(
db
.
Place
)
parent_place_t
=
aliased
(
db
.
Place
)
schools_q
=
(
session
.
query
(
db
.
School
,
school_place_t
,
parent_place_t
)
.
filter
(
db
.
School
.
place_id
==
school_place_t
.
place_id
)
.
filter
(
parent_place_t
.
place_id
==
school_place_t
.
parent
)
.
all
()
)
schools
=
[]
for
school
,
place
,
parent_place
in
schools_q
:
# Parent má být škola
assert
parent_place
.
level
==
3
# Toto platí před prvním spuštením skriptu, pak už ne (změníme place.name)
# assert place.name == school.official_name
schools
.
append
(
{
"
place_id
"
:
school
.
place_id
,
"
names
"
:
[
school
.
official_name
],
"
city
"
:
parent_place
.
name
,
"
db_place
"
:
place
,
}
)
def
eprint
(
*
args
,
**
kwargs
):
def
eprint
(
*
args
,
**
kwargs
):
print
(
*
args
,
file
=
sys
.
stderr
,
**
kwargs
)
print
(
*
args
,
file
=
sys
.
stderr
,
**
kwargs
)
...
@@ -91,38 +62,6 @@ def summarize(schools, k=5):
...
@@ -91,38 +62,6 @@ def summarize(schools, k=5):
eprint
()
eprint
()
city_rules
=
[
(
r
"
(\w)-(\w)
"
,
r
"
\1 - \2
"
),
# Mezery kolem pomlček jsou někdy nekonzistentní
(
"
Praha
"
,
"
v Praze
"
),
(
"
v Praze 4
"
,
"
v Praze 12
"
),
(
r
"
v Praze [0-9]+
"
,
"
v Praze
"
),
(
"
v Praze
"
,
"
Praha
"
),
None
,
# Dummy
]
school_kinds
=
[
(
"
Gymnázium
"
,
"
G
"
),
(
"
Vyšší odborná škola
"
,
"
VOŠ
"
),
(
"
Střední odborná škola
"
,
"
SOŠ
"
),
(
"
Střední zdravotnická škola
"
,
"
SZŠ
"
),
(
"
Střední průmyslová škola
"
,
"
SPŠ
"
),
(
"
Střední pedagogická škola
"
,
"
SPŠ
"
),
(
"
Střední odborné učiliště
"
,
"
SOU
"
),
(
"
Střední škola
"
,
"
SŠ
"
),
(
"
Základní škola
"
,
"
ZŠ
"
),
(
"
Základní umělecká škola
"
,
"
ZUŠ
"
),
(
"
Mateřská škola
"
,
"
MŠ
"
),
]
formalities
=
[
r
"
,?-? ?příspěvková organizace
"
,
r
"
,? s.r.o.
"
,
r
"
,? o.p.s.
"
,
r
"
s právem státní jazykové zkoušky
"
,
r
"
,? ?okres .+$
"
,
]
def
remove_formalities
(
name
):
def
remove_formalities
(
name
):
for
formality
in
formalities
:
for
formality
in
formalities
:
name
=
re
.
sub
(
formality
,
""
,
name
,
flags
=
re
.
IGNORECASE
)
name
=
re
.
sub
(
formality
,
""
,
name
,
flags
=
re
.
IGNORECASE
)
...
@@ -190,6 +129,20 @@ def should_have_comma_after_name(p_name):
...
@@ -190,6 +129,20 @@ def should_have_comma_after_name(p_name):
return
True
return
True
def
postprocess_name_part
(
p_name
):
# Vyřeší okrajové případy části názvu před městem
p_name
=
p_name
.
strip
(
"
,-
"
)
p_name
=
re
.
sub
(
"
v$
"
,
""
,
p_name
)
# Pro případy jako "G v Kroměříži" -> "G v, Kroměříž"
if
should_have_comma_after_name
(
p_name
):
p_name
+=
"
,
"
return
p_name
def
shorten_all
(
schools
):
def
shorten_all
(
schools
):
for
sc
in
schools
:
for
sc
in
schools
:
sc
[
"
names
"
].
append
(
remove_formalities
(
sc
[
"
names
"
][
-
1
]))
sc
[
"
names
"
].
append
(
remove_formalities
(
sc
[
"
names
"
][
-
1
]))
...
@@ -204,7 +157,8 @@ def shorten_all(schools):
...
@@ -204,7 +157,8 @@ def shorten_all(schools):
if
sc
[
"
parts
"
]
is
not
None
:
if
sc
[
"
parts
"
]
is
not
None
:
if
len
(
sc
[
"
parts
"
])
==
1
:
if
len
(
sc
[
"
parts
"
])
==
1
:
# Název města nenalezen v názvu školy
# Název města nenalezen v názvu školy
sc
[
"
names
"
].
append
(
f
"
{
sc
[
'
names
'
][
-
1
]
}
,
{
sc
[
'
city
'
]
}
"
)
p_name
=
postprocess_name_part
(
sc
[
"
names
"
][
-
1
])
sc
[
"
names
"
].
append
(
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
"
)
else
:
else
:
# Když máme rozdělení, můžeme zkusit odstanit číslo popisné
# Když máme rozdělení, můžeme zkusit odstanit číslo popisné
# a případně i celý název ulice
# a případně i celý název ulice
...
@@ -213,9 +167,8 @@ def shorten_all(schools):
...
@@ -213,9 +167,8 @@ def shorten_all(schools):
p_name
,
p_place
=
sc
[
"
parts
"
]
p_name
,
p_place
=
sc
[
"
parts
"
]
p_name
=
shorten_name
(
p_name
).
strip
(
"
,-
"
)
p_name
=
shorten_name
(
p_name
)
if
should_have_comma_after_name
(
p_name
):
p_name
=
postprocess_name_part
(
p_name
)
p_name
+=
"
,
"
p_place2
,
changed
=
remove_house_number
(
p_place
)
p_place2
,
changed
=
remove_house_number
(
p_place
)
...
@@ -224,12 +177,14 @@ def shorten_all(schools):
...
@@ -224,12 +177,14 @@ def shorten_all(schools):
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
,
{
p_place2
.
strip
(
'
,-
'
)
}
"
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
,
{
p_place2
.
strip
(
'
,-
'
)
}
"
)
)
if
"
Praha
"
not
in
sc
[
"
city
"
]:
# např. "G Praha 2" nechceme
sc
[
"
names
"
].
append
(
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
"
)
sc
[
"
names
"
].
append
(
f
"
{
p_name
}
{
sc
[
'
city
'
]
}
"
)
eprint
(
f
"
Successfully split up
{
n_split
}
schools
"
)
eprint
(
f
"
Successfully split up
{
n_split
}
schools
"
)
return
schools
return
schools
def
is_conflict
(
names1
,
names2
):
def
is_conflict
(
names1
,
names2
):
return
any
([(
name
in
names1
)
for
name
in
names2
])
return
any
([(
name
in
names1
)
for
name
in
names2
])
...
@@ -268,27 +223,115 @@ def remove_conflicts(shortened):
...
@@ -268,27 +223,115 @@ def remove_conflicts(shortened):
eprint
(
"
Done (possible unremovable conflicts)
"
)
eprint
(
"
Done (possible unremovable conflicts)
"
)
city_rules
=
[
(
r
"
(\w)-(\w)
"
,
r
"
\1 - \2
"
),
# Mezery kolem pomlček jsou někdy nekonzistentní
(
"
Praha
"
,
"
v Praze
"
),
(
"
v Praze 4
"
,
"
v Praze 12
"
),
(
r
"
v Praze [0-9]+
"
,
"
v Praze
"
),
(
"
v Praze
"
,
"
Praha
"
),
None
,
# Dummy
]
school_kinds
=
[
(
"
Gymnázium
"
,
"
G
"
),
(
"
Vyšší odborná škola
"
,
"
VOŠ
"
),
(
"
Střední odborná škola
"
,
"
SOŠ
"
),
(
"
Střední zdravotnická škola
"
,
"
SZŠ
"
),
(
"
Střední průmyslová škola
"
,
"
SPŠ
"
),
(
"
Střední pedagogická škola
"
,
"
SPŠ
"
),
(
"
Střední odborné učiliště
"
,
"
SOU
"
),
(
"
Střední škola
"
,
"
SŠ
"
),
(
"
Základní škola
"
,
"
ZŠ
"
),
(
"
Základní umělecká škola
"
,
"
ZUŠ
"
),
(
"
Mateřská škola
"
,
"
MŠ
"
),
]
formalities
=
[
r
"
,?-? ?příspěvková organizace
"
,
r
"
,? s\.r\.o\.
"
,
r
"
,? o\.p\.s\.
"
,
r
"
s právem státní jazykové zkoušky
"
,
r
"
,? ?okres .+$
"
,
]
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
"
Automaticky zkrátí jména škol v databázi
"
)
parser
.
add_argument
(
"
-n
"
,
"
--dry-run
"
,
action
=
"
store_true
"
,
help
=
"
Jen zobrazit vygenerovaná zkrácení, neměnit databázi
"
,
)
parser
.
add_argument
(
"
--restore
"
,
action
=
"
store_true
"
,
help
=
"
Vrátí se k oficiálním názvům
"
)
args
=
parser
.
parse_args
()
session
=
db
.
get_session
()
school_place_t
=
aliased
(
db
.
Place
)
parent_place_t
=
aliased
(
db
.
Place
)
schools_q
=
(
session
.
query
(
db
.
School
,
school_place_t
,
parent_place_t
)
.
filter
(
db
.
School
.
place_id
==
school_place_t
.
place_id
)
.
filter
(
parent_place_t
.
place_id
==
school_place_t
.
parent
)
.
all
()
)
if
args
.
restore
:
eprint
(
"
Vracím se k původním názvům.
"
)
for
school
,
place
,
parent_place
in
schools_q
:
place
.
name
=
school
.
official_name
session
.
commit
()
return
schools
=
[]
for
school
,
place
,
parent_place
in
schools_q
:
# Parent má být škola
assert
parent_place
.
level
==
3
# Toto platí před prvním spuštením skriptu, pak už ne (změníme place.name)
# assert place.name == school.official_name
schools
.
append
(
{
"
place_id
"
:
school
.
place_id
,
"
names
"
:
[
school
.
official_name
],
"
city
"
:
parent_place
.
name
,
"
db_place
"
:
place
,
}
)
shortened
=
shorten_all
(
schools
)
shortened
=
shorten_all
(
schools
)
remove_conflicts
(
shortened
)
remove_conflicts
(
shortened
)
summarize
(
shortened
,
k
=
10
)
summarize
(
shortened
,
k
=
10
)
try
:
if
args
.
dry_run
:
f
rom
tqdm
import
tqdm
f
ilename
=
"
prejmenovani.txt
"
except
:
with
open
(
filename
,
"
w
"
)
as
f
:
tqdm
=
lambda
x
:
x
shortened
.
sort
(
key
=
lambda
sc
:
sc
[
"
names
"
][
0
])
for
sc
in
shortened
:
# f.write(f"{sc['names'][0]} -> {sc['names'][-1]}\n")
f
.
write
(
f
"
{
sc
[
'
names
'
][
-
1
]
}
(
{
sc
[
'
names
'
][
0
]
}
)
\n
"
)
# f.write(f"{sc['names']}, {sc['city']}\n")
# Tohle trvá nesmyslně dlouho, určitě to jde rychleji
print
(
f
"
Seznam všech přejmenování uložen do
{
filename
}
.
"
)
places
=
session
.
query
(
db
.
Place
)
return
for
sc
in
tqdm
(
shortened
):
# Zapsat do DB
for
sc
in
shortened
:
sc
[
"
db_place
"
].
name
=
sc
[
"
names
"
][
-
1
]
sc
[
"
db_place
"
].
name
=
sc
[
"
names
"
][
-
1
]
session
.
commit
()
(
places
.
filter
(
db
.
Place
.
place_id
==
sc
[
"
place_id
"
]).
update
(
{
db
.
Place
.
name
:
sc
[
"
names
"
][
-
1
]}
)
)
session
.
commit
()
if
__name__
==
"
__main__
"
:
main
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment