Skip to content
Snippets Groups Projects
Commit e1fade4b authored by Václav Volhejn's avatar Václav Volhejn
Browse files

Skript na zkracování jmen škol

parent 749cecaa
No related branches found
No related tags found
1 merge request: !8 Skript na zkracování oficiálních jmen škol
#!/usr/bin/env python3
# Zkrátí v databázi oficiální dlouhá jména škol na něco čitelnějšího, uloží
# do sloupce places.name
import copy
import random
import re
from sqlalchemy.orm import aliased
import mo.db as db
# Load every school together with its own place row and that place's parent
# row (the parent's name is used as the school's city below).
session = db.get_session()
school_place_t = aliased(db.Place)
parent_place_t = aliased(db.Place)
schools_q = (
    session.query(db.School, school_place_t, parent_place_t)
    .filter(db.School.place_id == school_place_t.place_id)
    .filter(parent_place_t.place_id == school_place_t.parent)
    .all()
)

# One working record per school.  "names" is a history of candidate names:
# index 0 is the official name, and later steps append further candidates;
# the last entry is always the currently chosen name.
schools = []
for school, place, parent_place in schools_q:
    # Sanity check: the rest of the script expects the parent place of every
    # school to be at level 3.
    assert parent_place.level == 3, (
        f"school place {school.place_id}: parent place has level "
        f"{parent_place.level}, expected 3"
    )
    schools.append(
        {
            "place_id": school.place_id,
            "names": [school.official_name],
            "city": parent_place.name,
            "db_place": place,
        }
    )

# Untouched snapshot of the records, taken before any shortening happens.
# NOTE(review): the records hold ORM objects under "db_place", so deepcopy
# copies those as well -- confirm the copies behave as intended if they are
# written through later.
schools_orig = copy.deepcopy(schools)
def sorted_by_length(schools):
    """Return the school records ordered by the length of their current name.

    Args:
        schools: Iterable of school records; each record's ``"names"`` entry
            is a non-empty list whose last item is the current name.

    Returns:
        A new list sorted by ascending length of the current name.  The sort
        is stable and the input is left unmodified.
    """
    return sorted(schools, key=lambda sc: len(sc["names"][-1]))
def summarize(schools, k=5):
    """Print length statistics and a few sample names for a quick review.

    Prints the average and maximum length of the current names, the ``k``
    longest current names (with their city), and ``k`` randomly chosen
    records showing the original name next to the current one.

    Args:
        schools: List of school records with ``"names"`` (name history,
            last item is the current name) and ``"city"`` entries.
        k: How many longest and how many random samples to print.

    Returns:
        None.  All output goes to stdout.
    """
    if not schools:
        # Neither an average nor a maximum exists for an empty input.
        print("No schools to summarize")
        return

    lens = [len(sc["names"][-1]) for sc in schools]
    avg_len = sum(lens) / len(lens)
    print("Average length:", avg_len)
    print("Maximum length:", max(lens))

    names_by_lens = sorted_by_length(schools)

    print()
    print(f"{k} longest:")
    for sc in names_by_lens[::-1][:k]:
        print(f'{sc["names"][-1]} (@{sc["city"]})')

    # names_by_lens is a private copy, so shuffling it does not disturb the
    # caller's list.
    random.shuffle(names_by_lens)

    print()
    print(f"{k} random:")
    for sc in names_by_lens[:k]:
        print(f'Old: {sc["names"][0]}')
        print(f'{sc["names"][-1]}')
        print()
# Rewrites applied one after another to a city name by partition() while it
# looks for the city inside a school name.  Each entry is a
# (regex pattern, replacement) pair for re.sub.  The trailing None is a no-op
# placeholder so that the variant produced by the last real rule still gets
# searched for.
city_rules = [
    (r"(\w)-(\w)", r"\1 - \2"),
    ("Praha", "v Praze"),
    ("v Praze 4", "v Praze 12"),
    (r"v Praze [0-9]+", "v Praze"),
    ("v Praze", "Praha"),
    None,  # Dummy at the end
]

# (official school kind, abbreviation) pairs used by shorten_name(), matched
# case-insensitively.  An empty abbreviation removes the kind altogether.
school_kinds = [
    ("Gymnázium", "G"),
    ("Vyšší odborná škola", "VOŠ"),
    ("Střední odborná škola", "SOŠ"),
    ("Střední zdravotnická škola", "SZŠ"),
    ("Střední průmyslová škola", "SPŠ"),
    # Distinct from "SPŠ" above so the two kinds do not collapse into the
    # same abbreviation.
    ("Střední pedagogická škola", "SPgŠ"),
    ("Střední odborné učiliště", "SOU"),
    ("Střední škola", ""),
    ("Základní škola", ""),
    ("Základní umělecká škola", "ZUŠ"),
    ("Mateřská škola", ""),
]

# Regex patterns deleted from school names by remove_formalities(), matched
# case-insensitively.  Dots in the legal-form suffixes are escaped so they
# match only a literal "." and not any character.
formalities = [
    r",?-? ?příspěvková organizace",
    r",? s\.r\.o\.",
    r",? o\.p\.s\.",
    r" s právem státní jazykové zkoušky",
    r",? ?okres .+$",
]
def remove_formalities(name):
    """Delete legal/administrative boilerplate from a school name.

    Every regex in the module-level ``formalities`` list is removed from
    *name*, case-insensitively and in list order.

    Args:
        name: School name to clean.

    Returns:
        The name with all matches of the patterns removed.
    """
    cleaned = name
    for pattern in formalities:
        cleaned = re.compile(pattern, re.IGNORECASE).sub("", cleaned)
    return cleaned
def shorten_name(name):
    """Replace official school kinds in a name by their abbreviations.

    Each ``(kind, abbreviation)`` pair of the module-level ``school_kinds``
    list is applied in order; the kind is matched as a case-insensitive
    regex and every occurrence is replaced.

    Args:
        name: School name (or a fragment of one).

    Returns:
        The name with the school kinds abbreviated.
    """
    shortened = name
    for kind_pattern, abbreviation in school_kinds:
        shortened = re.sub(
            pattern=kind_pattern,
            repl=abbreviation,
            string=shortened,
            flags=re.IGNORECASE,
        )
    return shortened
def partition(name, city):
    """Split a school name around the name of its city.

    The city is searched for as a whole-word prefix match.  If it is not
    found, the rewrites from the module-level ``city_rules`` list are applied
    to the city one by one and the search is repeated after each of them.

    Args:
        name: School name to split.
        city: Name of the city the school is located in (plain text).

    Returns:
        * ``[before, after]`` when some variant of the city occurs exactly
          once and no school kind from ``school_kinds`` follows it;
        * ``None`` when the city occurs more than once, or when a school kind
          follows the city (the split would cut the kind off);
        * ``[name]`` when no variant of the city occurs in the name.
    """
    for rule in city_rules:
        # The city is plain text, so it is escaped before being embedded in
        # the pattern.  The trailing \w* eats up the rest of the word for
        # inflected forms like "Táborské".
        pat = r"\b{}\w*\b".format(re.escape(city))
        if re.search(pat, name) is not None:
            parts = re.split(pat, name)
            if len(parts) != 2:
                # Multiple occurrences of the city: the split is ambiguous.
                return None
            after_city = parts[1].lower()
            if any(kind.lower() in after_city for kind, _ in school_kinds):
                # Part of the school kind follows after the city name.
                return None
            return parts
        if rule is not None:
            city = re.sub(rule[0], rule[1], city)
    # Failed to find a match with any variant of the city name.
    return [name]
def shorten_in_city(city, schools):
    """Store a "|"-separated name built from each record's split parts.

    For every record the two items of ``record["parts"]`` are joined around
    *city* with "|" as the separator and the result is stored under
    ``record["name"]``.  Records are modified in place; nothing is returned.

    Args:
        city: City name placed between the two parts.
        schools: Iterable of records whose ``"parts"`` entry holds exactly
            two strings.
    """
    for record in schools:
        before, after = record["parts"]
        record["name"] = "|".join((before, city, after))
def remove_house_number(name):
    """Drop a trailing house number from a ", Street 12/3a"-style suffix.

    The number is removed only when the text ends with a comma, a space, a
    run of letters/spaces/dots, a space, and then digits and slashes
    optionally followed by one lowercase ASCII letter.

    Args:
        name: Text that may end with a street and house number.

    Returns:
        A ``(text, changed)`` tuple: the text without the house number and
        a bool telling whether anything was removed.
    """
    house_number = re.compile(r"(, ([^\W\d_]| |\.)+) [0-9/]+[a-z]?$")
    stripped, hits = house_number.subn(r"\1", name)
    return stripped, hits != 0
def should_have_comma_after_name(p_name, kinds=None):
    """Decide whether a comma belongs between the name part and the city.

    A comma is wanted in cases such as
        "Základní škola generála Zdeňka Škarvady, Ostrava-Poruba"
    but not for
        "Základní škola Dolní Ředice, okres Pardubice"
    i.e. not when the name part consists of (or ends with) just the school
    kind, either spelled out or abbreviated.

    Args:
        p_name: The part of the school name that precedes the city.
        kinds: Optional list of ``(kind, abbreviation)`` pairs; defaults to
            the module-level ``school_kinds``.

    Returns:
        True when a comma should follow ``p_name``, False otherwise.  An
        empty ``p_name`` never gets a comma.
    """
    if kinds is None:
        kinds = school_kinds
    if not p_name:
        return False
    for kind, abbreviation in kinds:
        # Empty abbreviations must be skipped: every string ends with "",
        # which would make every name look like a bare school kind.
        if p_name.endswith(kind) or (abbreviation and p_name.endswith(abbreviation)):
            return False
    return True
def shorten_all(schools):
    """Append progressively shorter candidate names to every school record.

    Each record's ``"names"`` list is extended (the last item is always the
    preferred candidate) and a ``"parts"`` entry with the result of
    ``partition`` is stored.  Candidates appended, in order:

    1. the name with formalities removed,
    2. that name with school kinds abbreviated,
    3. depending on how the name could be split around the city:
       * city not found: ``"<abbreviated name>, <city>"``;
       * split succeeded: optionally ``"<name> <city>, <street>"`` (only
         when a house number could be removed), then ``"<name> <city>"``;
       * split ambiguous (``parts`` is None): nothing more.

    Args:
        schools: List of records with ``"names"`` and ``"city"`` entries.

    Returns:
        The same list object, with the records modified in place.
    """
    # Pass 1: remove boilerplate and try to split each name around its city.
    for sc in schools:
        sc["names"].append(remove_formalities(sc["names"][-1]))
        sc["parts"] = partition(sc["names"][-1], sc["city"])
    print("Total schools: {}".format(len(schools)))

    # Pass 2: abbreviate and rebuild the names.
    n_split = 0
    for sc in schools:
        city = sc["city"]
        # A kind that abbreviates to "" leaves stray whitespace at the edges.
        sc["names"].append(shorten_name(sc["names"][-1]).strip())

        parts = sc["parts"]
        if parts is None:
            # Ambiguous split: keep the abbreviated full name.
            continue
        if len(parts) == 1:
            # City name not found in school name
            sc["names"].append(f"{sc['names'][-1]}, {city}")
            continue

        n_split += 1
        assert len(parts) == 2
        p_name, p_place = parts
        p_name = shorten_name(p_name).strip(" ,-")
        if p_name and should_have_comma_after_name(p_name):
            p_name += ","
        # When nothing is left of the name part, start directly with the
        # city instead of emitting a leading space.
        head = f"{p_name} {city}" if p_name else city

        p_place2, changed = remove_house_number(p_place)
        if changed:
            sc["names"].append(f"{head}, {p_place2.strip(' ,-')}")
        sc["names"].append(head)
    print(f"Successfully split up {n_split} schools")
    return schools
# Work on a fresh deep copy so that schools_orig keeps the untouched records.
# shorten_all() modifies the records in place and returns the list it was
# given, so `shortened` and `schools` refer to the same list afterwards.
schools = copy.deepcopy(schools_orig)
shortened = shorten_all(schools)
def is_conflict(names1, names2):
    """Return True when the two name histories share at least one name.

    Args:
        names1: Container of names supporting the ``in`` operator.
        names2: Iterable of names to look up in ``names1``.

    Returns:
        True as soon as one item of ``names2`` is found in ``names1``
        (remaining items are not examined), False otherwise.
    """
    return any(name in names1 for name in names2)
def remove_conflicts(shortened):
    """Back off shortened names until neighbouring schools no longer clash.

    The records are repeatedly sorted by their current name (last item of
    ``"names"``) and each pair of neighbours is checked with ``is_conflict``.
    When two conflicting neighbours have different original names (first
    item of ``"names"``), the current name of the first one is marked bad and
    every record currently using a bad name drops it, falling back to its
    previous candidate.  The loop ends once a pass marks no name as bad.
    Finally one more candidate is appended to every record.

    Args:
        shortened: List of school records; sorted and modified in place.

    Returns:
        None.  Progress is printed to stdout.
    """
    n_conflicts = 0
    again = True
    while again:
        # Sorting puts records with equal current names next to each other.
        shortened.sort(key=lambda sc: sc["names"][-1])
        print("----------------------------")
        n_conflicts = 0
        again = False
        bad_names = set()
        for sc1, sc2 in zip(shortened, shortened[1:]):
            if is_conflict(sc1["names"], sc2["names"]):
                n_conflicts += 1
                # Pairs with the same original name are counted but do not
                # trigger a back-off.
                if sc1["names"][0] != sc2["names"][0]:
                    bad_names.add(sc1["names"][-1])
                    again = True
        for sc in shortened:
            if sc["names"][-1] in bad_names:
                # The original name itself must never be dropped.
                assert len(sc["names"]) > 1
                sc["names"].pop()
        print(f"Found {n_conflicts} conflicts")
    # Hack - fix false positives, we always want to do these changes
    # (abbreviating kinds and removing formalities), even for records that
    # were backed off above.
    for sc in shortened:
        sc["names"].append(remove_formalities(shorten_name(sc["names"][-1])))
    print("Done (possible unremovable conflicts)")
remove_conflicts(shortened)
summarize(shortened, k=10)

# tqdm only provides a progress bar; fall back to plain iteration when it is
# not installed.  Only a failed import is handled, so other errors surface.
try:
    from tqdm import tqdm
except ImportError:

    def tqdm(x):
        return x


# This takes unreasonably long; it can surely be done faster.
# Every school is written with its own UPDATE statement; all of them are
# committed together at the end.
places = session.query(db.Place)
for sc in tqdm(shortened):
    sc["db_place"].name = sc["names"][-1]
    places.filter(db.Place.place_id == sc["place_id"]).update(
        {db.Place.name: sc["names"][-1]}
    )
session.commit()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment