diff --git a/db/skoly/parse-all b/db/skoly/parse-all index a6bf9efe54d97d07dc70844b06b6ccec6533aadf..450f4113d749b0589d79b5bd1916e30e18cb2291 100755 --- a/db/skoly/parse-all +++ b/db/skoly/parse-all @@ -1,12 +1,12 @@ #!/bin/bash set -e -rm -rf extra/parsed -mkdir extra/parsed +rm -rf extra/skoly/parsed +mkdir extra/skoly/parsed -for src in extra/html/*.html ; do - dst=extra/parsed/$(basename $src .html).tsv +for src in extra/skoly/html/*.html ; do + dst=extra/skoly/parsed/$(basename $src .html).tsv echo -n "$src -> " - ./rejskol-parse <$src >$dst + db/skoly/rejskol-parse <$src >$dst wc -l $dst done diff --git a/db/skoly/rejskol-download b/db/skoly/rejskol-download index c5a057c8615d880aa8844ce8e9d7506636e41af9..7d22dee05aea12c247dd0495b280d73d18d2b694 100755 --- a/db/skoly/rejskol-download +++ b/db/skoly/rejskol-download @@ -8,7 +8,7 @@ my $mech = WWW::Mechanize->new(autocheck => 1, strict_forms => 1); $mech->get('https://rejstriky.msmt.cz/rejskol/VREJVerejne/VerejneRozhrani.aspx'); $mech->form_id('form1'); -mkdir 'extra/html'; +mkdir 'extra/skoly/html'; download_type('B'); # Základní školy download_type('C'); # Střední školy exit 0; @@ -59,7 +59,7 @@ sub download_region { sleep 1; my $resp = $mech->click_button(id => 'btnVybrat'); - open my $f, '>:utf8', "extra/html/$type-$nuts.html"; + open my $f, '>:utf8', "extra/skoly/html/$type-$nuts.html"; print $f $resp->decoded_content; close $f; diff --git a/db/skoly/rejskol-parse b/db/skoly/rejskol-parse index 5184825c6380ec5587af5e8d97112c7af8ca202f..d2217c9627c9769fd1adb21dfde1a59472f28c62 100755 --- a/db/skoly/rejskol-parse +++ b/db/skoly/rejskol-parse @@ -18,10 +18,10 @@ my $div_with_tables = $divs[1] // die; my @last_main = (); my $tab_count = 0; for my $table ($div_with_tables->find_by_tag_name('table')) { - my $style = $table->attr('style'); - my $main = 0; - if ($tab_count == 0 || defined $style) { - $main = 1; + my $style = $table->attr('style') // ""; + my $main = 1; + if ($tab_count == 1 || $style =~ m{#f0f8ff}) { + $main = 0; } my $last_out = ""; for my $tr ($table->find_by_tag_name('tr')) {