Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Odevzdávací Systém MO
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Model registry
Analyze
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Martin Mareš
Odevzdávací Systém MO
Commits
aef43e44
Commit
aef43e44
authored
4 years ago
by
Martin Mareš
Browse files
Options
Downloads
Patches
Plain Diff
Protokoly: Job na zpracování scanů
parent
58d9c272
No related branches found
No related tags found
No related merge requests found
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
mo/jobs/protocols.py
+233
-19
233 additions, 19 deletions
mo/jobs/protocols.py
with
233 additions
and
19 deletions
mo/jobs/protocols.py
+
233
−
19
View file @
aef43e44
# Implementace jobů na práci s protokoly
from
PIL
import
Image
from
dataclasses
import
dataclass
import
multiprocessing
import
os
import
poppler
import
pyzbar.pyzbar
as
pyzbar
import
re
from
sqlalchemy
import
delete
from
sqlalchemy.orm
import
joinedload
from
sqlalchemy.orm.query
import
Query
import
subprocess
from
typing
import
List
,
Optional
import
mo
import
mo.config
as
config
import
mo.db
as
db
from
mo.jobs
import
TheJob
,
job_handler
from
mo.util
import
logger
,
part_path
...
...
@@ -53,34 +61,42 @@ def tex_arg(s: str) -> str:
return
'
{
'
+
s
+
'
}
'
def _get_user_id_query(contest: db.Contest, site_id: Optional[int]) -> Query:
    """Build a query selecting Participation.user_id for the given contest.

    Only participations in state invited, registered or present are
    included. When site_id is not None, the query is additionally
    restricted to participations whose place_id equals site_id.

    The query is returned unevaluated, so callers may run it with .all()
    or embed it with .subquery().
    """
    accepted_states = (db.PartState.invited, db.PartState.registered, db.PartState.present)

    query = db.get_session().query(db.Participation.user_id).filter_by(contest=contest)
    if site_id is not None:
        query = query.filter_by(place_id=site_id)
    return query.filter(db.Participation.state.in_(accepted_states))
def _get_pants(contest: db.Contest, site_id: Optional[int]) -> List[db.Participant]:
    """Load the db.Participant rows of users selected by _get_user_id_query().

    The related user and school_place objects are eagerly loaded with
    joinedload. The returned list is ordered by each user's sort_key().
    """
    selected_users = _get_user_id_query(contest, site_id).subquery()

    query = db.get_session().query(db.Participant)
    query = query.options(
        joinedload(db.Participant.user),
        joinedload(db.Participant.school_place),
    )
    query = query.filter(db.Participant.user_id.in_(selected_users))

    return sorted(query.all(), key=lambda pant: pant.user.sort_key())
@job_handler
(
db
.
JobType
.
create_protocols
)
def
handle_create_protocols
(
the_job
:
TheJob
):
job
=
the_job
.
job
assert
job
.
in_json
is
not
None
contest_id
=
job
.
in_json
[
'
contest_id
'
]
# type: ignore
site_id
=
job
.
in_json
[
'
site_id
'
]
# type: ignore
task_ids
=
job
.
in_json
[
'
task_ids
'
]
# type: ignore
num_universal
=
job
.
in_json
[
'
num_universal
'
]
# type: ignore
num_blank
=
job
.
in_json
[
'
num_blank
'
]
# type: ignore
contest_id
:
int
=
job
.
in_json
[
'
contest_id
'
]
# type: ignore
site_id
:
int
=
job
.
in_json
[
'
site_id
'
]
# type: ignore
task_ids
:
List
[
int
]
=
job
.
in_json
[
'
task_ids
'
]
# type: ignore
num_universal
:
int
=
job
.
in_json
[
'
num_universal
'
]
# type: ignore
num_blank
:
int
=
job
.
in_json
[
'
num_blank
'
]
# type: ignore
sess
=
db
.
get_session
()
contest
=
sess
.
query
(
db
.
Contest
).
options
(
joinedload
(
db
.
Contest
.
round
)).
get
(
contest_id
)
assert
contest
is
not
None
round
=
contest
.
round
user_subq
=
sess
.
query
(
db
.
Participation
.
user_id
).
filter_by
(
contest
=
contest
)
if
site_id
is
not
None
:
user_subq
=
user_subq
.
filter_by
(
place_id
=
site_id
)
user_subq
=
(
user_subq
.
filter
(
db
.
Participation
.
state
.
in_
((
db
.
PartState
.
invited
,
db
.
PartState
.
registered
,
db
.
PartState
.
present
)))
.
subquery
())
pants
=
(
sess
.
query
(
db
.
Participant
)
.
options
(
joinedload
(
db
.
Participant
.
user
),
joinedload
(
db
.
Participant
.
school_place
))
.
filter
(
db
.
Participant
.
user_id
.
in_
(
user_subq
))
.
all
())
pants
.
sort
(
key
=
lambda
p
:
p
.
user
.
sort_key
())
pants
=
_get_pants
(
contest
,
site_id
)
tasks
=
sess
.
query
(
db
.
Task
).
filter_by
(
round
=
round
).
filter
(
db
.
Task
.
task_id
.
in_
(
task_ids
)).
order_by
(
db
.
Task
.
code
).
all
()
pages
=
[]
...
...
@@ -90,7 +106,7 @@ def handle_create_protocols(the_job: TheJob):
'
:
'
.
join
([
'
MO
'
,
round
.
round_code_short
(),
t
.
code
,
str
(
p
.
user_id
)]),
p
.
user
.
full_name
(),
p
.
grade
,
p
.
school_place
.
name
,
p
.
school_place
.
name
or
'
???
'
,
t
.
code
,
]
pages
.
append
(
'
\\
proto
'
+
""
.
join
([
tex_arg
(
x
)
for
x
in
args
]))
...
...
@@ -138,3 +154,201 @@ def handle_create_protocols(the_job: TheJob):
job
.
out_file
=
'
protokoly.pdf
'
job
.
result
=
'
Celkem
'
+
mo
.
util_format
.
inflect_number
(
len
(
pages
),
'
list
'
,
'
listy
'
,
'
listů
'
)
#
# Job process_scans: Zpracuje nascanované protokoly
#
# Vstupní JSON:
# { 'contest_id': ID contestu,
#   'site_id': ID soutěžního místa nebo null,
# 'task_ids': [task_id, ...],
# 'in_files': [názvy vstupních souborů]
# }
#
# Výstupní JSON:
# null
#
# Výstupní soubory:
# p-{file_nr:02d}-{page_nr:04d}-(full|small).png
#
def schedule_process_scans(contest: db.Contest, site: Optional[db.Place], for_user: db.User, tasks: List[db.Task], in_file_names: List[str]):
    """Create and submit a process_scans job for the given contest.

    Each file in in_file_names is attached to the job under the name
    'input-NNN.pdf' (numbered from 1 in the given order). The job's input
    JSON records the contest ID, the site's place ID (None when no site is
    given), the task IDs and the attached file names.

    The job description uses the name of site, or of contest.place when
    site is falsy. At least one input file is required (checked by assert).
    """
    place = site or contest.place

    the_job = TheJob()
    job = the_job.create(db.JobType.process_scans, for_user)
    job.description = f'Zpracování scanů {contest.round.round_code_short()} {place.name}'

    attached_names = []
    for seq, source_name in enumerate(in_file_names, start=1):
        stored_name = f'input-{seq:03d}.pdf'
        the_job.attach_file(source_name, stored_name)
        attached_names.append(stored_name)
    assert attached_names

    if site:
        site_id = site.place_id
    else:
        site_id = None

    job.in_json = {
        'contest_id': contest.contest_id,
        'site_id': site_id,
        'task_ids': [task.task_id for task in tasks],
        'in_files': attached_names,
    }
    the_job.submit()
@dataclass
class ScanJobArgs:
    """Arguments handed to _process_scan_file() for one input file."""

    # Path of the PDF file to analyze.
    in_path: str
    # Path prefix of the generated images; the page number and
    # '-full.png' / '-small.png' are appended to it.
    out_prefix: str
@dataclass
class ScanJobPage:
    """Result of analyzing a single scanned page in _process_scan_file()."""

    # Text of the 'MO:' QR code found on the page, or None when the page
    # carries no such code.
    code: Optional[str]
@job_handler(db.JobType.process_scans)
def handle_process_scans(the_job: TheJob):
    """Handle a process_scans job: analyze the scanned PDFs and store their pages.

    Reads 'contest_id', 'site_id', 'task_ids' and 'in_files' from the job's
    input JSON, runs _process_scan_file() on every input file in a worker
    process and then creates one db.ScanPage row per page. Each page is
    classified from its QR code by the nested _parse_code(); pages whose
    code is rejected get seq_id db.SCAN_PAGE_UFO.

    Existing db.ScanPage rows of this job are deleted first, so the job can
    be re-run. Finally job.result is set to a page-count summary and the
    job expiration to config.JOB_EXPIRATION_LONG.
    """
    job = the_job.job
    assert job.in_json is not None
    params = job.in_json
    contest_id = params['contest_id']       # type: ignore
    site_id = params['site_id']             # type: ignore
    task_ids = params['task_ids']           # type: ignore
    in_files: List[str] = params['in_files']  # type: ignore

    sess = db.get_session()
    contest = sess.query(db.Contest).options(joinedload(db.Contest.round)).get(contest_id)
    assert contest is not None
    round_code = contest.round.round_code_short()

    user_ids = {row[0] for row in _get_user_id_query(contest, site_id).all()}

    task_rows = sess.query(db.Task).filter(db.Task.task_id.in_(task_ids)).all()
    tasks_by_code = {task.code: task for task in task_rows}

    # We are going to be busy for a long time, so finish the database
    # transaction before starting the scan analysis.
    sess.commit()

    with multiprocessing.Pool(1) as pool:
        scan_args = [
            ScanJobArgs(
                in_path=job.file_path(file_name),
                out_prefix=job.file_path(f'p-{file_nr:02d}'),
            )
            for file_nr, file_name in enumerate(in_files)
        ]
        results = pool.map(_process_scan_file, scan_args)

    def _parse_code(pr: ScanJobPage, sp: db.ScanPage) -> Optional[str]:
        # Classify page `sp` according to the QR code in `pr`.
        # Returns an error message, or None when the code was accepted
        # (or when the page has no code at all).
        if pr.code is None:
            return None

        parts = pr.code.split(':')
        if parts[0] != 'MO':
            return 'Neznámý prefix'

        if len(parts) == 2:
            marker = parts[1]
            if marker == '*':
                # Universal task header
                sp.seq_id = db.SCAN_PAGE_FIX
                return None
            if marker == '+':
                # Continuation sheet with a code
                sp.seq_id = db.SCAN_PAGE_CONTINUE
                return None
            return 'Neznamý formát kódu'

        if len(parts) != 4:
            return 'Neznamý formát kódu'

        code_round, code_task, code_user = parts[1], parts[2], parts[3]
        if not code_user.isnumeric():
            return 'User ID není číslo'
        user_id = int(code_user)

        if code_round != round_code:
            return 'Nesouhlasí kód kola'
        if code_task not in tasks_by_code:
            return 'Neznámá úloha'
        if user_id not in user_ids:
            return 'Neznámý účastník'

        sp.user_id = user_id
        sp.task_id = tasks_by_code[code_task].task_id
        sp.seq_id = 0
        return None

    # If the job is being run for the second time (manual retry), drop all
    # of its existing scan_pages records. Note that the ORM is not
    # synchronized here, which is harmless because the session is fresh.
    conn = sess.connection()
    conn.execute(delete(db.ScanPage.__table__).where(db.ScanPage.job_id == job.job_id))

    num_pages = 0
    for file_nr, file_pages in enumerate(results):
        for page_nr, page_result in enumerate(file_pages):
            scan_page = db.ScanPage(
                job_id=job.job_id,
                file_nr=file_nr,
                page_nr=page_nr,
                seq_id=db.SCAN_PAGE_FIX,
            )

            err = _parse_code(page_result, scan_page)
            if err is not None:
                logger.debug(f'Scan: {file_nr}/{page_nr} ({page_result.code}): {err}')
                scan_page.seq_id = db.SCAN_PAGE_UFO

            sess.add(scan_page)
            num_pages += 1

    job.result = 'Celkem ' + mo.util_format.inflect_number(num_pages, 'strana', 'strany', 'stran')
    the_job.expires_in_minutes = config.JOB_EXPIRATION_LONG
def _process_scan_file(args: ScanJobArgs) -> List[ScanJobPage]:
    """Analyze one PDF file with scans; runs in a separate process.

    Every page is rendered at 300 DPI, converted to grayscale and saved as
    '{out_prefix}-{page_nr:04d}-full.png', together with a thumbnail of a
    quarter of the width and height saved as '...-small.png'. The page is
    also searched for a QR code whose data starts with 'MO:'.

    Args:
        args: input PDF path and output path prefix.

    Returns:
        One ScanJobPage per page, in page order. Its code is the text of
        the first matching QR code, or None when there is none.

    FIXME: Errors from loading, rendering or saving are not handled.
    """
    logger.debug(f'Scan: Analyzuji soubor {args.in_path}')
    pdf = poppler.load_from_file(args.in_path)

    renderer = poppler.PageRenderer()
    renderer.set_render_hint(poppler.RenderHint.antialiasing, True)
    renderer.set_render_hint(poppler.RenderHint.text_antialiasing, True)

    dpi = 300

    output = []
    for page_nr in range(pdf.pages):
        page = pdf.create_page(page_nr)
        page_img = renderer.render_page(page, xres=dpi, yres=dpi)

        full_img = Image.frombytes(
            "RGBA",
            (page_img.width, page_img.height),
            page_img.data,
            "raw",
            str(page_img.format),
        )
        # The rendered page is no longer needed once PIL has its own copy.
        del page_img

        full_img = full_img.convert('L')    # Grayscale
        full_size = full_img.size

        codes = pyzbar.decode(full_img, symbols=[pyzbar.ZBarSymbol.QRCODE])
        codes = [c for c in codes if c.type == 'QRCODE' and c.data.startswith(b'MO:')]
        qr = None
        if codes:
            if len(codes) > 1:
                logger.warning(f'Scan: Strana #{page_nr} obsahuje více QR kódů')
            code = codes[0]
            # A foreign or damaged code may contain non-ASCII bytes after
            # the 'MO:' prefix. Replace them instead of raising, so that one
            # bad page does not abort the whole job; the resulting text
            # contains U+FFFD and is rejected later by code validation.
            qr = code.data.decode('US-ASCII', errors='replace')
            # FIXME: The page could be rotated here according to the code

        output.append(ScanJobPage(code=qr))

        full_img.save(f'{args.out_prefix}-{page_nr:04d}-full.png')

        # FIXME: Do we need to create a thumbnail?
        small_img = full_img.resize((full_size[0] // 4, full_size[1] // 4))
        small_img.save(f'{args.out_prefix}-{page_nr:04d}-small.png')

        logger.debug(f'Scan: Strana #{page_nr}: {qr}')

    return output
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment