#!/usr/bin/env python3
# Shortens the official long school names stored in the database to something
# more readable and saves the result in the places.name column.
#
# Script path (taken from the originating diff): bin/shorten-schools

import copy
import random
import re

from sqlalchemy.orm import aliased

import mo.db as db

session = db.get_session()

# Two aliases of the same Place table: one for the school's own place row and
# one for its parent place.
school_place_t = aliased(db.Place)
parent_place_t = aliased(db.Place)

schools_q = (
    session.query(db.School, school_place_t, parent_place_t)
    .filter(db.School.place_id == school_place_t.place_id)
    .filter(parent_place_t.place_id == school_place_t.parent)
    .all()
)

# One dict per school.  "names" is a history of progressively shortened names:
# index 0 is the official name, index -1 is the current candidate.
schools = []

for school, place, parent_place in schools_q:
    # The parent place name is used as the school's city below.
    # NOTE(review): level 3 presumably means "municipality" -- confirm.
    assert parent_place.level == 3

    schools.append(
        {
            "place_id": school.place_id,
            "names": [school.official_name],
            "city": parent_place.name,
            "db_place": place,
        }
    )

schools_orig = copy.deepcopy(schools)


def sorted_by_length(schools):
    """Return a new list of *schools* ordered by the length of the current name.

    The input list is not modified; the current name is ``sc["names"][-1]``.
    """
    return sorted(schools, key=lambda sc: len(sc["names"][-1]))


def summarize(schools, k=5):
    """Print length statistics and a few sample names.

    Prints the average and maximum length of the current names, the *k*
    longest names and *k* randomly chosen names (together with the official
    name they were derived from).  Prints a short notice and returns early
    when *schools* is empty, since the statistics are undefined then.
    """
    if not schools:
        print("No schools to summarize.")
        return

    lens = [len(sc["names"][-1]) for sc in schools]
    print("Average length:", sum(lens) / len(lens))
    print("Maximum length:", max(lens))

    names_by_lens = sorted_by_length(schools)

    print()
    print(f"{k} longest:")
    for sc in names_by_lens[::-1][:k]:
        print(f'{sc["names"][-1]} (@{sc["city"]})')

    # names_by_lens is a private copy, so shuffling it in place is safe.
    random.shuffle(names_by_lens)
    print()
    print(f"{k} random:")
    for sc in names_by_lens[:k]:
        print(f'Old: {sc["names"][0]}')
        print(f'{sc["names"][-1]}')
        print()


# Rewrites applied one after another to the city name inside partition().
# The city is matched against the school name before each rewrite, so every
# intermediate form gets one attempt.  The trailing None only provides one
# last matching attempt after the final rewrite.
city_rules = [
    (r"(\w)-(\w)", r"\1 - \2"),
    ("Praha", "v Praze"),
    ("v Praze 4", "v Praze 12"),
    (r"v Praze [0-9]+", "v Praze"),
    ("v Praze", "Praha"),
    None,  # Dummy at the end
]

# (full school kind, abbreviation) pairs; the full name is used as a
# case-insensitive regex in shorten_name().
# NOTE(review): "Střední průmyslová škola" and "Střední pedagogická škola"
# both map to "SPŠ" -- confirm that the shared abbreviation is intended.
school_kinds = [
    ("Gymnázium", "G"),
    ("Vyšší odborná škola", "VOŠ"),
    ("Střední odborná škola", "SOŠ"),
    ("Střední zdravotnická škola", "SZŠ"),
    ("Střední průmyslová škola", "SPŠ"),
    ("Střední pedagogická škola", "SPŠ"),
    ("Střední odborné učiliště", "SOU"),
    ("Střední škola", "SŠ"),
    ("Základní škola", "ZŠ"),
    ("Základní umělecká škola", "ZUŠ"),
    ("Mateřská škola", "MŠ"),
]

# Case-insensitive regexes for legal boilerplate stripped from names.
# NOTE(review): the dots in "s.r.o." / "o.p.s." are unescaped and therefore
# match any character; tightening them would also stop matching spellings
# such as " s r.o.", so they are left as they are -- confirm what is wanted.
formalities = [
    r",?-? ?příspěvková organizace",
    r",? s.r.o.",
    r",? o.p.s.",
    r" s právem státní jazykové zkoušky",
    r",? ?okres .+$",
]


def remove_formalities(name):
    """Return *name* with every pattern from ``formalities`` removed."""
    for formality in formalities:
        name = re.sub(formality, "", name, flags=re.IGNORECASE)

    return name


def shorten_name(name):
    """Return *name* with full school kinds replaced by their abbreviations."""
    for re_from, re_to in school_kinds:
        name = re.sub(re_from, re_to, name, flags=re.IGNORECASE)

    return name


def partition(name, city):
    """Split *name* around the single occurrence of *city*.

    The city is tried in its given form and then in each form produced by
    successively applying ``city_rules``.  For the first form that occurs in
    *name* the result is:

    * ``[before, after]`` when it occurs exactly once and no full school kind
      appears in the text after it,
    * ``None`` when it occurs more than once, or when a full school kind
      follows it.

    When no form of the city occurs in *name*, ``[name]`` is returned.
    """
    for rule in city_rules:
        # Eat up the rest of the word for cases like "Táborské".  The city is
        # escaped so that characters such as "." or "(" in a place name are
        # matched literally instead of being read as regex syntax.
        pattern = re.compile(r"\b{}\w*\b".format(re.escape(city)))

        if pattern.search(name) is not None:
            parts = pattern.split(name)

            if len(parts) != 2:
                # Multiple occurrences of city - what to do?
                return None

            tail = parts[1].lower()
            if any(kind.lower() in tail for kind, _ in school_kinds):
                # Part of the school kind follows after city name
                return None

            return parts

        if rule is not None:
            city = re.sub(rule[0], rule[1], city)

    # Failed to find match
    return [name]


def shorten_in_city(city, schools):
    """Store ``"name|city|place"`` under ``sc["name"]`` for every school.

    Expects ``sc["parts"]`` to hold exactly two items.  Nothing in this
    script calls this function.
    """
    for sc in schools:
        name_p, place_p = sc["parts"]
        sc["name"] = "|".join([name_p, city, place_p])


def remove_house_number(name):
    """Strip a trailing house number that follows a ", street" part.

    Returns ``(new_name, changed)`` where *changed* is True when something
    was removed.
    """
    name, n = re.subn(r"(, ([^\W\d_]| |\.)+) [0-9/]+[a-z]?$", r"\1", name)
    return name, n > 0


def should_have_comma_after_name(p_name):
    """Tell whether a comma belongs between *p_name* and the city.

    We want a comma in cases like
    "Základní škola generála Zdeňka Škarvady, Ostrava-Poruba"
    but not for
    "Základní škola Dolní Ředice, okres Pardubice".

    Returns False when *p_name* ends with a school kind (either its full
    name or its abbreviation), True otherwise.
    """
    # Each entry is a (full name, abbreviation) tuple and str.endswith
    # accepts a tuple of suffixes, so both forms are checked at once.
    return not any(p_name.endswith(kind) for kind in school_kinds)


def shorten_all(schools):
    """Append progressively shorter name candidates to every school.

    Mutates the dicts in *schools* (extends ``"names"``, sets ``"parts"``),
    prints progress counters and returns the same list.
    """
    for sc in schools:
        sc["names"].append(remove_formalities(sc["names"][-1]))
        sc["parts"] = partition(sc["names"][-1], sc["city"])

    print(f"Total schools: {len(schools)}")

    n_split = 0

    for sc in schools:
        sc["names"].append(shorten_name(sc["names"][-1]))

        parts = sc["parts"]
        if parts is None:
            # partition() gave up; keep the plainly shortened name.
            continue

        if len(parts) == 1:
            # City name not found in school name
            sc["names"].append(f"{sc['names'][-1]}, {sc['city']}")
            continue

        n_split += 1
        assert len(parts) == 2

        p_name, p_place = parts

        p_name = shorten_name(p_name).strip(" ,-")
        if should_have_comma_after_name(p_name):
            p_name += ","

        p_place2, changed = remove_house_number(p_place)

        if changed:
            # Longer fallback that keeps the street; it becomes the current
            # name again if the shortest form is popped by remove_conflicts().
            sc["names"].append(f"{p_name} {sc['city']}, {p_place2.strip(' ,-')}")

        sc["names"].append(f"{p_name} {sc['city']}")

    print(f"Successfully split up {n_split} schools")

    return schools


schools = copy.deepcopy(schools_orig)
shortened = shorten_all(schools)


def is_conflict(names1, names2):
    """Return True when the two name histories share at least one name."""
    return any(name in names1 for name in names2)


def remove_conflicts(shortened):
    """Back off to longer names until neighbouring schools stop clashing.

    Repeatedly sorts *shortened* in place by current name and compares each
    school with its successor.  When their name histories overlap and their
    official names differ, the first school's current name is marked as bad
    and every school currently using a bad name drops it (falling back to
    its previous, longer candidate).  Finally the formalities and school
    kinds are shortened once more on every current name.
    """
    again = True

    while again:
        shortened.sort(key=lambda sc: sc["names"][-1])
        print("----------------------------")
        n_conflicts = 0
        again = False

        bad_names = set()

        for sc1, sc2 in zip(shortened, shortened[1:]):
            if not is_conflict(sc1["names"], sc2["names"]):
                continue

            n_conflicts += 1

            # Identical official names are counted but cannot be told apart
            # by backing off, so they do not trigger another round.
            if sc1["names"][0] != sc2["names"][0]:
                bad_names.add(sc1["names"][-1])
                again = True

        for sc in shortened:
            if sc["names"][-1] in bad_names:
                assert len(sc["names"]) > 1, "no longer name left to fall back to"
                sc["names"].pop()

        print(f"Found {n_conflicts} conflicts")

    # Hack - fix false positives, we always want to do these changes
    for sc in shortened:
        sc["names"].append(remove_formalities(shorten_name(sc["names"][-1])))

    print("Done (possible unremovable conflicts)")


remove_conflicts(shortened)

summarize(shortened, k=10)

# tqdm is optional: without it the loop below runs without a progress bar.
try:
    from tqdm import tqdm
except ImportError:

    def tqdm(iterable):
        """Fallback used when tqdm is not installed: return *iterable* as is."""
        return iterable


# This takes absurdly long; it can surely be done faster.
places = session.query(db.Place)

for sc in tqdm(shortened):
    # NOTE(review): sc["db_place"] went through copy.deepcopy above, so it may
    # not be the instance tracked by the session -- presumably that is why an
    # explicit UPDATE is issued as well; confirm.
    sc["db_place"].name = sc["names"][-1]

    places.filter(db.Place.place_id == sc["place_id"]).update(
        {db.Place.name: sc["names"][-1]}
    )

session.commit()