Skip to content
Snippets Groups Projects
Commit a48fda43 authored by Martin Mareš
Browse files

Skripty na chroustání Rejstříku škol umí aktuální formát

parent 6255a17b
No related branches found
No related tags found
No related merge requests found
#!/bin/bash #!/bin/bash
set -e set -e
rm -rf extra/parsed rm -rf extra/skoly/parsed
mkdir extra/parsed mkdir extra/skoly/parsed
for src in extra/html/*.html ; do for src in extra/skoly/html/*.html ; do
dst=extra/parsed/$(basename $src .html).tsv dst=extra/skoly/parsed/$(basename $src .html).tsv
echo -n "$src -> " echo -n "$src -> "
./rejskol-parse <$src >$dst db/skoly/rejskol-parse <$src >$dst
wc -l $dst wc -l $dst
done done
...@@ -8,7 +8,7 @@ my $mech = WWW::Mechanize->new(autocheck => 1, strict_forms => 1); ...@@ -8,7 +8,7 @@ my $mech = WWW::Mechanize->new(autocheck => 1, strict_forms => 1);
$mech->get('https://rejstriky.msmt.cz/rejskol/VREJVerejne/VerejneRozhrani.aspx'); $mech->get('https://rejstriky.msmt.cz/rejskol/VREJVerejne/VerejneRozhrani.aspx');
$mech->form_id('form1'); $mech->form_id('form1');
mkdir 'extra/html'; mkdir 'extra/skoly/html';
download_type('B'); # Základní školy download_type('B'); # Základní školy
download_type('C'); # Střední školy download_type('C'); # Střední školy
exit 0; exit 0;
...@@ -59,7 +59,7 @@ sub download_region { ...@@ -59,7 +59,7 @@ sub download_region {
sleep 1; sleep 1;
my $resp = $mech->click_button(id => 'btnVybrat'); my $resp = $mech->click_button(id => 'btnVybrat');
open my $f, '>:utf8', "extra/html/$type-$nuts.html"; open my $f, '>:utf8', "extra/skoly/html/$type-$nuts.html";
print $f $resp->decoded_content; print $f $resp->decoded_content;
close $f; close $f;
......
...@@ -18,10 +18,10 @@ my $div_with_tables = $divs[1] // die; ...@@ -18,10 +18,10 @@ my $div_with_tables = $divs[1] // die;
my @last_main = (); my @last_main = ();
my $tab_count = 0; my $tab_count = 0;
for my $table ($div_with_tables->find_by_tag_name('table')) { for my $table ($div_with_tables->find_by_tag_name('table')) {
my $style = $table->attr('style'); my $style = $table->attr('style') // "";
my $main = 0; my $main = 1;
if ($tab_count == 0 || defined $style) { if ($tab_count == 1 || $style =~ m{#f0f8ff}) {
$main = 1; $main = 0;
} }
my $last_out = ""; my $last_out = "";
for my $tr ($table->find_by_tag_name('tr')) { for my $tr ($table->find_by_tag_name('tr')) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment