diff --git a/mo/jobs/protocols.py b/mo/jobs/protocols.py
index 2655f26b7be4409a68ed44fcd4881df7ad9d8fbc..1390cfe913e05deb68e04866a18882839249012c 100644
--- a/mo/jobs/protocols.py
+++ b/mo/jobs/protocols.py
@@ -1,12 +1,20 @@
 # Implementace jobů na práci s protokoly
 
+from PIL import Image
+from dataclasses import dataclass
+import multiprocessing
 import os
+import poppler
+import pyzbar.pyzbar as pyzbar
 import re
+from sqlalchemy import delete
 from sqlalchemy.orm import joinedload
+from sqlalchemy.orm.query import Query
 import subprocess
 from typing import List, Optional
 
 import mo
+import mo.config as config
 import mo.db as db
 from mo.jobs import TheJob, job_handler
 from mo.util import logger, part_path
@@ -53,34 +61,53 @@ def tex_arg(s: str) -> str:
     return '{' + s + '}'
 
 
+def _get_user_id_query(contest: db.Contest, site_id: Optional[int]) -> Query:
+    """Build a query selecting the user IDs of the participants of a contest.
+
+    Only participations in the invited, registered or present state are
+    considered.  If site_id is not None, only participations with that
+    place_id are selected.
+    """
+    q = db.get_session().query(db.Participation.user_id).filter_by(contest=contest)
+    if site_id is not None:
+        q = q.filter_by(place_id=site_id)
+    q = q.filter(db.Participation.state.in_((db.PartState.invited, db.PartState.registered, db.PartState.present)))
+    return q
+
+
+def _get_pants(contest: db.Contest, site_id: Optional[int]) -> List[db.Participant]:
+    """Load the participants selected by _get_user_id_query().
+
+    The user and school_place relationships are loaded together with each
+    participant and the result is sorted by the sort key of the user.
+    """
+    user_id_subq = _get_user_id_query(contest, site_id).subquery()
+
+    pants = (db.get_session().query(db.Participant)
+             .options(joinedload(db.Participant.user), joinedload(db.Participant.school_place))
+             .filter(db.Participant.user_id.in_(user_id_subq))
+             .all())
+    pants.sort(key=lambda p: p.user.sort_key())
+
+    return pants
+
+
 @job_handler(db.JobType.create_protocols)
 def handle_create_protocols(the_job: TheJob):
     job = the_job.job
     assert job.in_json is not None
-    contest_id = job.in_json['contest_id']  # type: ignore
-    site_id = job.in_json['site_id']  # type: ignore
-    task_ids = job.in_json['task_ids']  # type: ignore
-    num_universal = job.in_json['num_universal']  # type: ignore
-    num_blank = job.in_json['num_blank']  # type: ignore
+    contest_id: int = job.in_json['contest_id']  # type: ignore
+    site_id: Optional[int] = job.in_json['site_id']  # type: ignore
+    task_ids: List[int] = job.in_json['task_ids']  # type: ignore
+    num_universal: int = job.in_json['num_universal']  # type: ignore
+    num_blank: int = job.in_json['num_blank']  # type: ignore
 
     sess = db.get_session()
     contest = sess.query(db.Contest).options(joinedload(db.Contest.round)).get(contest_id)
     assert contest is not None
     round = contest.round
 
-    user_subq = sess.query(db.Participation.user_id).filter_by(contest=contest)
-    if site_id is not None:
-        user_subq = user_subq.filter_by(place_id=site_id)
-    user_subq = (user_subq
-                 .filter(db.Participation.state.in_((db.PartState.invited, db.PartState.registered, db.PartState.present)))
-                 .subquery())
-
-    pants = (sess.query(db.Participant)
-             .options(joinedload(db.Participant.user), joinedload(db.Participant.school_place))
-             .filter(db.Participant.user_id.in_(user_subq))
-             .all())
-    pants.sort(key=lambda p: p.user.sort_key())
-
+    pants = _get_pants(contest, site_id)
     tasks = sess.query(db.Task).filter_by(round=round).filter(db.Task.task_id.in_(task_ids)).order_by(db.Task.code).all()
 
     pages = []
@@ -90,7 +117,7 @@ def handle_create_protocols(the_job: TheJob):
                 ':'.join(['MO', round.round_code_short(), t.code, str(p.user_id)]),
                 p.user.full_name(),
                 p.grade,
-                p.school_place.name,
+                p.school_place.name or '???',
                 t.code,
             ]
             pages.append('\\proto' + "".join([tex_arg(x) for x in args]))
@@ -138,3 +165,228 @@ def handle_create_protocols(the_job: TheJob):
 
     job.out_file = 'protokoly.pdf'
     job.result = 'Celkem ' + mo.util_format.inflect_number(len(pages), 'list', 'listy', 'listů')
+
+
+#
+# Job process_scans: Processes scanned protocols
+#
+# Input JSON:
+#        { 'contest_id': contest ID,
+#          'site_id': ID of the contest site or none,
+#          'task_ids': [task_id, ...],
+#          'in_files': [names of the input files]
+#        }
+#
+# Output JSON:
+#        null
+#
+# Output files:
+#        p-{file_nr:02d}-{page_nr:04d}-(full|small).png
+#
+
+
+def schedule_process_scans(contest: db.Contest, site: Optional[db.Place], for_user: db.User, tasks: List[db.Task], in_file_names: List[str]):
+    """Create and submit a process_scans job for a contest.
+
+    Each file in in_file_names is passed to the_job.attach_file() together
+    with the name input-NNN.pdf, numbered from 001 in the given order.  The
+    list must not be empty.
+    """
+    place = site or contest.place
+
+    the_job = TheJob()
+    job = the_job.create(db.JobType.process_scans, for_user)
+    job.description = f'Zpracování scanů {contest.round.round_code_short()} {place.name}'
+
+    in_files = []
+    for num_files, ifn in enumerate(in_file_names, start=1):
+        in_name = f'input-{num_files:03d}.pdf'
+        the_job.attach_file(ifn, in_name)
+        in_files.append(in_name)
+    assert in_files
+
+    job.in_json = {
+        'contest_id': contest.contest_id,
+        'site_id': site.place_id if site else None,
+        'task_ids': [t.task_id for t in tasks],
+        'in_files': in_files,
+    }
+    the_job.submit()
+
+
+@dataclass
+class ScanJobArgs:
+    # Arguments of _process_scan_file() for one input file
+    in_path: str
+    out_prefix: str
+
+
+@dataclass
+class ScanJobPage:
+    # Result of _process_scan_file() for one page; code is None if no QR code starting with 'MO:' was found
+    code: Optional[str]
+
+
+@job_handler(db.JobType.process_scans)
+def handle_process_scans(the_job: TheJob):
+    """Render the input files of a process_scans job and record their pages.
+
+    Each input file is processed by _process_scan_file() in a worker process.
+    For every page, a db.ScanPage row is added to the session; its user_id,
+    task_id and seq_id are set according to the QR code found on the page.
+    Pages whose code cannot be interpreted get seq_id db.SCAN_PAGE_UFO.
+    """
+    job = the_job.job
+    assert job.in_json is not None
+    contest_id: int = job.in_json['contest_id']  # type: ignore
+    site_id: Optional[int] = job.in_json['site_id']  # type: ignore
+    task_ids: List[int] = job.in_json['task_ids']  # type: ignore
+    in_files: List[str] = job.in_json['in_files']  # type: ignore
+
+    sess = db.get_session()
+    contest = sess.query(db.Contest).options(joinedload(db.Contest.round)).get(contest_id)
+    assert contest is not None
+    round = contest.round
+    round_code = round.round_code_short()
+
+    user_ids = set(u[0] for u in _get_user_id_query(contest, site_id).all())
+
+    tasks = sess.query(db.Task).filter(db.Task.task_id.in_(task_ids)).all()
+    tasks_by_code = {t.code: t for t in tasks}
+
+    # We are going to be busy for a long time, so we finish the database transaction first.
+    sess.commit()
+
+    with multiprocessing.Pool(1) as pool:
+        args = [ScanJobArgs(in_path=job.file_path(fn),
+                            out_prefix=job.file_path(f'p-{fi:02d}'))
+                for fi, fn in enumerate(in_files)]
+        results = pool.map(_process_scan_file, args)
+
+    def _parse_code(pr: ScanJobPage, sp: db.ScanPage) -> Optional[str]:
+        # Interprets the code of page pr and fills in the fields of sp.
+        # Returns None on success, otherwise a description of the problem.
+        if pr.code is None:
+            return None
+
+        fields = pr.code.split(':')
+        if fields[0] != 'MO':
+            return 'Neznámý prefix'
+
+        if len(fields) == 2:
+            if fields[1] == '*':
+                # Universal task header
+                sp.seq_id = db.SCAN_PAGE_FIX
+                return None
+            if fields[1] == '+':
+                # Continuation sheet with a code
+                sp.seq_id = db.SCAN_PAGE_CONTINUE
+                return None
+
+        elif len(fields) == 4:
+            # isdecimal() is used rather than isnumeric(): the latter also accepts
+            # characters which int() cannot parse
+            if not fields[3].isdecimal():
+                return 'User ID není číslo'
+            user_id = int(fields[3])
+
+            if fields[1] != round_code:
+                return 'Nesouhlasí kód kola'
+            if fields[2] not in tasks_by_code:
+                return 'Neznámá úloha'
+            if user_id not in user_ids:
+                return 'Neznámý účastník'
+            sp.user_id = user_id
+            sp.task_id = tasks_by_code[fields[2]].task_id
+            sp.seq_id = 0
+            return None
+
+        return 'Neznamý formát kódu'
+
+    # If the job is run for the second time (manual retry), we want to delete all of its records in scan_pages.
+    # Beware: the ORM is not synchronized with the delete, but it does not matter, since the session is fresh at this point.
+    conn = sess.connection()
+    conn.execute(delete(db.ScanPage.__table__).where(db.ScanPage.job_id == job.job_id))
+
+    num_pages = 0
+    for fi, file_pages in enumerate(results):
+        for pi, pr in enumerate(file_pages):
+            sp = db.ScanPage(
+                job_id=job.job_id,
+                file_nr=fi,
+                page_nr=pi,
+                seq_id=db.SCAN_PAGE_FIX,
+            )
+
+            err = _parse_code(pr, sp)
+            if err is not None:
+                logger.debug(f'Scan: {fi}/{pi} ({pr.code}): {err}')
+                sp.seq_id = db.SCAN_PAGE_UFO
+
+            sess.add(sp)
+            num_pages += 1
+
+    job.result = 'Celkem ' + mo.util_format.inflect_number(num_pages, 'strana', 'strany', 'stran')
+    the_job.expires_in_minutes = config.JOB_EXPIRATION_LONG
+
+
+def _process_scan_file(args: ScanJobArgs) -> List[ScanJobPage]:
+    """Process one file with scans.  Runs in a separate process.
+
+    Every page is rendered at 300 DPI, converted to grayscale and saved as
+    {out_prefix}-{page_nr:04d}-full.png, along with a copy scaled down to
+    a quarter of each dimension (...-small.png).  Returns one ScanJobPage
+    per page, holding the first QR code starting with 'MO:' reported for
+    that page, or None if there is no such code.
+    """
+
+    # FIXME: Error handling
+    logger.debug(f'Scan: Analyzuji soubor {args.in_path}')
+    pdf = poppler.load_from_file(args.in_path)
+
+    renderer = poppler.PageRenderer()
+    renderer.set_render_hint(poppler.RenderHint.antialiasing, True)
+    renderer.set_render_hint(poppler.RenderHint.text_antialiasing, True)
+    dpi = 300
+
+    output = []
+    for page_nr in range(pdf.pages):
+        page = pdf.create_page(page_nr)
+        page_img = renderer.render_page(page, xres=dpi, yres=dpi)
+
+        full_img = Image.frombytes(
+            "RGBA",
+            (page_img.width, page_img.height),
+            page_img.data,
+            "raw",
+            str(page_img.format),
+        )
+        del page_img
+
+        full_img = full_img.convert('L')  # Grayscale
+        full_size = full_img.size
+
+        codes = pyzbar.decode(full_img, symbols=[pyzbar.ZBarSymbol.QRCODE])
+        codes = [c for c in codes if c.type == 'QRCODE' and c.data.startswith(b'MO:')]
+        qr = None
+        if codes:
+            if len(codes) > 1:
+                logger.warning(f'Scan: Strana #{page_nr} obsahuje více QR kódů')
+            code = codes[0]
+            # Undecodable bytes are replaced instead of raising UnicodeDecodeError,
+            # so that a damaged code fails the checks in _parse_code() and the page
+            # is marked as unrecognized there.
+            qr = code.data.decode('US-ASCII', errors='replace')
+            # FIXME: The page could be rotated here according to the code
+
+        output.append(ScanJobPage(code=qr))
+
+        full_img.save(f'{args.out_prefix}-{page_nr:04d}-full.png')
+
+        # FIXME: Do we need to create a thumbnail?
+        small_img = full_img.resize((full_size[0] // 4, full_size[1] // 4))
+        small_img.save(f'{args.out_prefix}-{page_nr:04d}-small.png')
+
+        logger.debug(f'Scan: Strana #{page_nr}: {qr}')
+
+    return output