From aef43e44ddbdfe1cf43e1866dae4d53b3e8e9cce Mon Sep 17 00:00:00 2001
From: Martin Mares <mj@ucw.cz>
Date: Tue, 6 Jul 2021 14:53:15 +0200
Subject: [PATCH] =?UTF-8?q?Protokoly:=20Job=20na=20zpracov=C3=A1n=C3=AD=20?=
 =?UTF-8?q?scan=C5=AF?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 mo/jobs/protocols.py | 252 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 233 insertions(+), 19 deletions(-)

diff --git a/mo/jobs/protocols.py b/mo/jobs/protocols.py
index 2655f26b..1390cfe9 100644
--- a/mo/jobs/protocols.py
+++ b/mo/jobs/protocols.py
@@ -1,12 +1,20 @@
 # Implementace jobů na práci s protokoly
 
+from PIL import Image
+from dataclasses import dataclass
+import multiprocessing
 import os
+import poppler
+import pyzbar.pyzbar as pyzbar
 import re
+from sqlalchemy import delete
 from sqlalchemy.orm import joinedload
+from sqlalchemy.orm.query import Query
 import subprocess
 from typing import List, Optional
 
 import mo
+import mo.config as config
 import mo.db as db
 from mo.jobs import TheJob, job_handler
 from mo.util import logger, part_path
@@ -53,34 +61,42 @@ def tex_arg(s: str) -> str:
     return '{' + s + '}'
 
 
+def _get_user_id_query(contest: db.Contest, site_id: Optional[int]) -> Query:  # builds a query of user IDs taking part in `contest`
+    q = db.get_session().query(db.Participation.user_id).filter_by(contest=contest)
+    if site_id is not None:  # None means: do not restrict by place
+        q = q.filter_by(place_id=site_id)
+    q = q.filter(db.Participation.state.in_((db.PartState.invited, db.PartState.registered, db.PartState.present)))  # keep only these three participation states
+    return q  # returned unexecuted: callers either run it with .all() or embed it via .subquery()
+
+
+def _get_pants(contest: db.Contest, site_id: Optional[int]) -> List[db.Participant]:  # Participant rows for the users selected by _get_user_id_query
+    user_id_subq = _get_user_id_query(contest, site_id).subquery()
+
+    pants = (db.get_session().query(db.Participant)
+             .options(joinedload(db.Participant.user), joinedload(db.Participant.school_place))  # load user and school place together with each row
+             .filter(db.Participant.user_id.in_(user_id_subq))
+             .all())
+    pants.sort(key=lambda p: p.user.sort_key())  # ordering is done in Python using the user's sort_key()
+
+    return pants
+
+
 @job_handler(db.JobType.create_protocols)
 def handle_create_protocols(the_job: TheJob):
     job = the_job.job
     assert job.in_json is not None
-    contest_id = job.in_json['contest_id']  # type: ignore
-    site_id = job.in_json['site_id']        # type: ignore
-    task_ids = job.in_json['task_ids']      # type: ignore
-    num_universal = job.in_json['num_universal']    # type: ignore
-    num_blank = job.in_json['num_blank']    # type: ignore
+    contest_id: int = job.in_json['contest_id']  # type: ignore
+    site_id: int = job.in_json['site_id']        # type: ignore
+    task_ids: List[int] = job.in_json['task_ids']      # type: ignore
+    num_universal: int = job.in_json['num_universal']    # type: ignore
+    num_blank: int = job.in_json['num_blank']    # type: ignore
 
     sess = db.get_session()
     contest = sess.query(db.Contest).options(joinedload(db.Contest.round)).get(contest_id)
     assert contest is not None
     round = contest.round
 
-    user_subq = sess.query(db.Participation.user_id).filter_by(contest=contest)
-    if site_id is not None:
-        user_subq = user_subq.filter_by(place_id=site_id)
-    user_subq = (user_subq
-                 .filter(db.Participation.state.in_((db.PartState.invited, db.PartState.registered, db.PartState.present)))
-                 .subquery())
-
-    pants = (sess.query(db.Participant)
-             .options(joinedload(db.Participant.user), joinedload(db.Participant.school_place))
-             .filter(db.Participant.user_id.in_(user_subq))
-             .all())
-    pants.sort(key=lambda p: p.user.sort_key())
-
+    pants = _get_pants(contest, site_id)
     tasks = sess.query(db.Task).filter_by(round=round).filter(db.Task.task_id.in_(task_ids)).order_by(db.Task.code).all()
 
     pages = []
@@ -90,7 +106,7 @@ def handle_create_protocols(the_job: TheJob):
                 ':'.join(['MO', round.round_code_short(), t.code, str(p.user_id)]),
                 p.user.full_name(),
                 p.grade,
-                p.school_place.name,
+                p.school_place.name or '???',
                 t.code,
             ]
             pages.append('\\proto' + "".join([tex_arg(x) for x in args]))
@@ -138,3 +154,201 @@ def handle_create_protocols(the_job: TheJob):
 
     job.out_file = 'protokoly.pdf'
     job.result = 'Celkem ' + mo.util_format.inflect_number(len(pages), 'list', 'listy', 'listů')
+
+
+#
+# Job process_scans: Zpracuje nascanované protokoly
+#
+# Vstupní JSON:
+#        { 'contest_id': ID contestu,
+#          'site_id': ID soutěžního místa nebo null,
+#          'task_ids': [task_id, ...],
+#          'in_files': [názvy vstupních souborů]
+#        }
+#
+# Výstupní JSON:
+#        null
+#
+# Výstupní soubory:
+#       p-{file_nr:02d}-{page_nr:04d}-(full|small).png
+#
+
+
+def schedule_process_scans(contest: db.Contest, site: Optional[db.Place], for_user: db.User, tasks: List[db.Task], in_file_names: List[str]):  # creates and submits a process_scans job
+    place = site or contest.place  # used only in the job description below
+
+    the_job = TheJob()
+    job = the_job.create(db.JobType.process_scans, for_user)
+    job.description = f'Zpracování scanů {contest.round.round_code_short()} {place.name}'
+
+    in_files = []
+    num_files = 0
+    for ifn in in_file_names:
+        num_files += 1
+        in_name = f'input-{num_files:03d}.pdf'  # job-local name passed to attach_file: input-001.pdf, input-002.pdf, ...
+        the_job.attach_file(ifn, in_name)
+        in_files.append(in_name)
+    assert in_files  # at least one input file is required
+
+    job.in_json = {
+        'contest_id': contest.contest_id,
+        'site_id': site.place_id if site else None,
+        'task_ids': [t.task_id for t in tasks],
+        'in_files': in_files,
+    }
+    the_job.submit()
+
+
+@dataclass
+class ScanJobArgs:  # arguments for one _process_scan_file call
+    in_path: str  # path handed to poppler.load_from_file
+    out_prefix: str  # prefix of the per-page PNG file names written by the worker
+
+
+@dataclass
+class ScanJobPage:  # per-page result returned by _process_scan_file
+    code: Optional[str]  # decoded text of the page's 'MO:' QR code, or None when no such code was found
+
+
+@job_handler(db.JobType.process_scans)
+def handle_process_scans(the_job: TheJob):  # job handler: renders the scanned PDFs and records one ScanPage per page
+    job = the_job.job
+    assert job.in_json is not None
+    contest_id = job.in_json['contest_id']  # type: ignore
+    site_id = job.in_json['site_id']        # type: ignore
+    task_ids = job.in_json['task_ids']      # type: ignore
+    in_files: List[str] = job.in_json['in_files']  # type: ignore
+
+    sess = db.get_session()
+    contest = sess.query(db.Contest).options(joinedload(db.Contest.round)).get(contest_id)
+    assert contest is not None
+    round = contest.round
+    round_code = round.round_code_short()
+
+    user_ids = set(u[0] for u in _get_user_id_query(contest, site_id).all())  # user IDs accepted in scanned codes
+
+    tasks = sess.query(db.Task).filter(db.Task.task_id.in_(task_ids)).all()
+    tasks_by_code = {t.code: t for t in tasks}  # lookup used to resolve the task field of a code
+
+    # Since we are about to be busy for a long time, close off the database session (commit) first.
+    sess.commit()
+
+    with multiprocessing.Pool(1) as pool:  # a pool with a single worker process handles all files
+        args = [ScanJobArgs(in_path=job.file_path(fn),
+                            out_prefix=job.file_path(f'p-{fi:02d}'))
+                for fi, fn in enumerate(in_files)]
+        results = pool.map(_process_scan_file, args)  # results[i] is the page list for in_files[i]
+
+    def _parse_code(pr: ScanJobPage, sp: db.ScanPage) -> Optional[str]:  # returns an error message or None; fills in sp on success
+        if pr.code is None:
+            return None
+
+        fields = pr.code.split(':')
+        if fields[0] != 'MO':
+            return 'Neznámý prefix'
+
+        if len(fields) == 2:
+            if fields[1] == '*':
+                # Universal task header
+                sp.seq_id = db.SCAN_PAGE_FIX
+                return None
+            if fields[1] == '+':
+                # Continuation sheet with a code
+                sp.seq_id = db.SCAN_PAGE_CONTINUE
+                return None
+
+        elif len(fields) == 4:  # MO:<round code>:<task code>:<user id>
+            if not fields[3].isnumeric():
+                return 'User ID není číslo'
+            user_id = int(fields[3])
+
+            if fields[1] != round_code:
+                return 'Nesouhlasí kód kola'
+            if fields[2] not in tasks_by_code:
+                return 'Neznámá úloha'
+            if user_id not in user_ids:
+                return 'Neznámý účastník'
+            sp.user_id = user_id
+            sp.task_id = tasks_by_code[fields[2]].task_id
+            sp.seq_id = 0
+            return None
+
+        return 'Neznamý formát kódu'
+
+    # If the job has been run a second time (manual retry), we want to delete all its records in scan_pages.
+    # Note that we do not synchronize the ORM, but that is fine because we have a fresh session at this point.
+    conn = sess.connection()
+    conn.execute(delete(db.ScanPage.__table__).where(db.ScanPage.job_id == job.job_id))
+
+    num_pages = 0
+    for fi, fn in enumerate(in_files):
+        for pi, pr in enumerate(results[fi]):
+            sp = db.ScanPage(
+                job_id=job.job_id,
+                file_nr=fi,
+                page_nr=pi,
+                seq_id=db.SCAN_PAGE_FIX,  # default; stays in place when the page carries no code
+            )
+
+            err = _parse_code(pr, sp)
+            if err is not None:
+                logger.debug(f'Scan: {fi}/{pi} ({pr.code}): {err}')
+                sp.seq_id = db.SCAN_PAGE_UFO  # a code was read but not accepted
+
+            sess.add(sp)
+            num_pages += 1
+
+    job.result = 'Celkem ' + mo.util_format.inflect_number(num_pages, 'strana', 'strany', 'stran')
+    the_job.expires_in_minutes = config.JOB_EXPIRATION_LONG
+
+
+def _process_scan_file(args: ScanJobArgs) -> List[ScanJobPage]:
+    # Processes one file of scans: writes two PNGs per page and returns one ScanJobPage per page. Runs in a separate process.
+
+    # FIXME: Error handling
+    logger.debug(f'Scan: Analyzuji soubor {args.in_path}')
+    pdf = poppler.load_from_file(args.in_path)
+
+    renderer = poppler.PageRenderer()
+    renderer.set_render_hint(poppler.RenderHint.antialiasing, True)
+    renderer.set_render_hint(poppler.RenderHint.text_antialiasing, True)
+    dpi = 300  # rendering resolution, used for both axes
+
+    output = []
+    for page_nr in range(pdf.pages):
+        page = pdf.create_page(page_nr)
+        page_img = renderer.render_page(page, xres=dpi, yres=dpi)
+
+        full_img = Image.frombytes(
+            "RGBA",
+            (page_img.width, page_img.height),
+            page_img.data,
+            "raw",
+            str(page_img.format),
+        )
+        del page_img  # drop the rendered buffer once it has been copied into the PIL image
+
+        full_img = full_img.convert('L')               # Grayscale
+        full_size = full_img.size
+
+        codes = pyzbar.decode(full_img, symbols=[pyzbar.ZBarSymbol.QRCODE])
+        codes = [c for c in codes if c.type == 'QRCODE' and c.data.startswith(b'MO:')]  # ignore QR codes that are not ours
+        qr = None
+        if codes:
+            if len(codes) > 1:
+                logger.warning(f'Scan: Strana #{page_nr} obsahuje více QR kódů')
+            code = codes[0]  # when several codes match, the first one is used
+            qr = code.data.decode('US-ASCII', errors='replace')  # a stray non-ASCII byte becomes U+FFFD instead of raising and aborting the whole file
+            # FIXME: The page could be rotated here according to the code
+
+        output.append(ScanJobPage(code=qr))
+
+        full_img.save(f'{args.out_prefix}-{page_nr:04d}-full.png')
+
+        # FIXME: Do we need to create a thumbnail?
+        small_img = full_img.resize((full_size[0] // 4, full_size[1] // 4))  # one quarter of the size in each dimension
+        small_img.save(f'{args.out_prefix}-{page_nr:04d}-small.png')
+
+        logger.debug(f'Scan: Strana #{page_nr}: {qr}')
+
+    return output
-- 
GitLab