| @@ -0,0 +1,85 @@ | |||||
| #!/usr/bin/perl -w | |||||
| use strict; | |||||
| use HTML::TreeBuilder; | |||||
| use File::Temp qw(tempfile); | |||||
| use LWP::Simple; | |||||
| use POSIX; | |||||
| use Encode; | |||||
| use CGI; | |||||
| use CGI::Carp; | |||||
# Validate one CGI date parameter.
#
# Returns 1 when $var is defined and consists solely of ASCII digits,
# 0 otherwise (undef, empty string, or anything containing a non-digit).
sub sanity_check {
    my ($var) = @_;
    return 0 unless defined $var;
    # \A ... \z rather than ^ ... $: "$" also matches before a trailing
    # newline, which would let a value such as "12\n" through.
    return $var =~ /\A[0-9]+\z/ ? 1 : 0;
}
# Emit an empty, well-formed result document for North Ayrshire and then
# abort the request.
#
# Parameters:
#   $y, $m, $d - the requested date; used only in the error message
#   $reason    - short description of what went wrong
#
# Never returns normally: after printing the document it dies with
# "$y/$m/$d failed: $reason\n" (the trailing newline suppresses Perl's
# "at FILE line N" suffix).
sub no_results {
    my ($y, $m, $d, $reason) = @_;
    # The blank line after the Content-type header is mandatory: it ends the
    # CGI header block. Without it the XML would be read as header lines.
    print <<'NIL';
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>North Ayrshire Council</authority_name>
<authority_short_name>North Ayrshire</authority_short_name>
<applications>
</applications>
</planning>
NIL
    die "$y/$m/$d failed: $reason\n";
}
# Main CGI flow for North Ayrshire.
#
# 1. Validate the year/month/day request parameters.
# 2. Fetch the planning index page and find the link whose text is
#    "Planning Applications Received week ending <dd Month YYYY>".
# 3. Fetch that page, find the <img> whose alt text ends in ".pdf", and take
#    the href of its parent element as the PDF location.
# 4. Download the PDF to a temporary file and hand it to ./NorthAyrshire.pl,
#    which writes the XML document to our STDOUT.
my $cgi   = CGI->new;
my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    # The blank line ends the CGI header block.
    print <<'ERROR';
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $index_html = get('http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2||')
    or die "couldn't fetch index page";
my $tree = HTML::TreeBuilder->new;
$tree->parse(decode_utf8($index_html));
$tree->eof;

# Link text expected on the index page for the requested date.
my $re = strftime('Planning Applications Received week ending %d %B %Y', 0, 0, 0, $day, $month-1, $year-1900);
my ($day_link) = $tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /\Q$re\E/i }
);
$day_link or no_results($year, $month, $day, "Cannot find day link");
my $day_href = $day_link->attr('href');
defined $day_href or no_results($year, $month, $day, "Day link has no href");
my $day_absurl = 'http://www.north-ayrshire.gov.uk'.$day_href;

my $day_html = get($day_absurl) or die "couldn't fetch day page";
my $day_tree = HTML::TreeBuilder->new;
$day_tree->parse(decode_utf8($day_html));
$day_tree->eof;

my ($pdf_img) = $day_tree->look_down(
    "_tag", "img",
    "alt", qr/\.pdf$/i
);
$pdf_img or die "couldn't find pdf image on day page";
my $pdf_link = $pdf_img->parent;
$pdf_link or die "couldn't find pdf link on day page";
my $pdf_href = $pdf_link->attr('href');
defined $pdf_href or die "pdf link on day page has no href";
my $pdf_absurl = 'http://www.north-ayrshire.gov.uk'.$pdf_href;

# Fetch first, so that a failed download never leaves an empty temp file to
# be fed to the converter.
my $pdf_data = get($pdf_absurl);
defined $pdf_data or die "couldn't fetch pdf $pdf_absurl";

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;
print {$fh} $pdf_data;
close($fh) or die "cannot write temporary file $filename: $!";

print "Content-type: text/xml\n\n";
my $status  = system "./NorthAyrshire.pl", $filename, $pdf_absurl;
my $failure = $status == -1 ? "could not run ./NorthAyrshire.pl: $!"
            : $status != 0  ? "./NorthAyrshire.pl returned wait status $status"
            :                 undef;
if (defined $failure) {
    # Best-effort cleanup; the converter failure is the error worth reporting.
    unlink $filename;
    die "system failed: $failure";
}
unlink $filename or die "cannot unlink temporary file $filename: $!";
| @@ -0,0 +1,66 @@ | |||||
| #!/usr/bin/perl -w | |||||
| use strict; | |||||
| use XML::Writer; | |||||
| use Date::Parse; | |||||
| use POSIX; | |||||
# Convert a North Ayrshire weekly-list PDF into the planning XML document.
#
# Usage: NorthAyrshire.pl PDF_FILE INFO_URL
#
# The PDF is run through "pdftotext -layout" and scanned line by line. Each
# record starts at a line beginning "Application No:", and the text after the
# "Location:", "Description:" and "Date Registered:" labels (including any
# continuation lines) is collected into the address, description and date.
# The XML is written to STDOUT.

# Turn a free-form date string into dd/mm/YYYY.
#
# Returns "" when Date::Parse cannot parse the string: strptime() then
# returns an empty list, and calling strftime() without its time fields
# would croak and truncate the XML document.
sub format_date_received {
    my ($text) = @_;
    my @parts = strptime($text);
    return "" unless @parts;
    # Pass only sec, min, hour, mday, mon, year; fields missing from the
    # input come back undef and are treated as 0.
    return strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @parts[0 .. 5]);
}

my $file     = $ARGV[0];
my $info_url = $ARGV[1];
defined $file or die "usage: $0 PDF_FILE INFO_URL\n";

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);
$writer->xmlDecl("UTF-8");
$writer->startTag("planning");
$writer->dataElement("authority_name", "North Ayrshire Council");
$writer->dataElement("authority_short_name", "North Ayrshire");
$writer->startTag("applications");

open(my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    next unless $line =~ /^\s*Application No:\s*(\S+)/;
    my $refno = $1;
    my ($address, $proposal, $date_received) = ("", "", "");
    my $cur_field;    # reference to the field currently being accumulated, if any

    while (1) {
        chomp $line;
        $line =~ s/^\s+//; $line =~ s/\s+$//;

        # A recognised label switches the target field; labels we do not
        # care about stop accumulation until the next wanted label.
        if ($line =~ s/^Location://) {
            $cur_field = \$address;
        } elsif ($line =~ s/^Description://) {
            $cur_field = \$proposal;
        } elsif ($line =~ s/^Date Registered://) {
            $cur_field = \$date_received;
        } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^Ward://)) {
            $cur_field = undef;
        }

        $line =~ s/^\s+//;
        if (defined $cur_field) {
            $$cur_field .= " " if $$cur_field ne "" and $line ne "";
            $$cur_field .= $line;
        }

        last unless defined ($line = <$fh>);
        # NOTE(review): the record terminator is a lower-case "application:"
        # line, whereas records start with "Application No:" - confirm this
        # against real pdftotext output.
        last if $line =~ /^\s*application:/;
    }

    my $postcode = "None";
    if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
        $postcode = $1;
    }

    $writer->startTag("application");
    $writer->dataElement("council_reference", $refno);
    $writer->dataElement("address", $address);
    $writer->dataElement("postcode", $postcode);
    $writer->dataElement("description", $proposal);
    $writer->dataElement("info_url", $info_url);
    $writer->dataElement("comment_url", 'dcontrol@north-ayrshire.gov.uk');
    $writer->dataElement("date_received", format_date_received($date_received));
    $writer->endTag;
}
# Closing a piped handle waits for pdftotext and exposes its exit status.
# Only warn, so the document collected so far is still completed.
close $fh
    or warn($! ? "error closing pdftotext pipe: $!" : "pdftotext returned wait status $?");

$writer->endTag;
$writer->endTag;
$writer->end;
| @@ -38,3 +38,7 @@ | |||||
| "Highland.cgi", "493" | "Highland.cgi", "493" | ||||
| "Newport.pl", "493" | "Newport.pl", "493" | ||||
| "Newport.cgi", "493" | "Newport.cgi", "493" | ||||
| "NorthAyrshire.pl", "493" | |||||
| "NorthAyrshire.cgi", "493" | |||||
| "Redbridge.pl", "493" | |||||
| "Redbridge.cgi", "493" | |||||
| @@ -0,0 +1,85 @@ | |||||
| #!/usr/bin/perl -w | |||||
| use strict; | |||||
| use HTML::TreeBuilder; | |||||
| use File::Temp qw(tempfile); | |||||
| use LWP::Simple; | |||||
| use POSIX; | |||||
| use Encode; | |||||
| use CGI; | |||||
| use CGI::Carp; | |||||
# Validate one CGI date parameter.
#
# Returns 1 when $var is defined and consists solely of ASCII digits,
# 0 otherwise (undef, empty string, or anything containing a non-digit).
sub sanity_check {
    my ($var) = @_;
    return 0 unless defined $var;
    # \A ... \z rather than ^ ... $: "$" also matches before a trailing
    # newline, which would let a value such as "12\n" through.
    return $var =~ /\A[0-9]+\z/ ? 1 : 0;
}
# Emit an empty, well-formed result document for Redbridge and then abort
# the request.
#
# Parameters:
#   $y, $m, $d - the requested date; used only in the error message
#   $reason    - short description of what went wrong
#
# Never returns normally: after printing the document it dies with
# "$y/$m/$d failed: $reason\n" (the trailing newline suppresses Perl's
# "at FILE line N" suffix).
sub no_results {
    my ($y, $m, $d, $reason) = @_;
    # The blank line after the Content-type header is mandatory: it ends the
    # CGI header block. Without it the XML would be read as header lines.
    print <<'NIL';
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>London Borough of Redbridge</authority_name>
<authority_short_name>Redbridge</authority_short_name>
<applications>
</applications>
</planning>
NIL
    die "$y/$m/$d failed: $reason\n";
}
# Main CGI flow for Redbridge.
#
# 1. Validate the year/month/day request parameters.
# 2. Fetch the planning index page and find the link whose text is
#    "Planning Applications Received <YYYY>".
# 3. Fetch that year page and find the link whose text matches
#    "Received <day><optional suffix> <Month> <YYYY>"; its href is the PDF.
# 4. Download the PDF to a temporary file and hand it to ./Redbridge.pl,
#    which writes the XML document to our STDOUT.
my $cgi   = CGI->new;
my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    # The blank line ends the CGI header block.
    print <<'ERROR';
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $index_html = get('http://www.redbridge.gov.uk/cms/environment__planning/planning_and_regeneration/planning_dc.aspx')
    or die "couldn't fetch index page";
my $tree = HTML::TreeBuilder->new;
$tree->parse(decode_utf8($index_html));
$tree->eof;

my $re = sprintf('Planning Applications Received %d', $year);
my ($year_link) = $tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /$re/i }
);
$year_link or no_results($year, $month, $day, "Cannot find year link");
my $year_href = $year_link->attr('href');
defined $year_href or no_results($year, $month, $day, "Year link has no href");
my $year_absurl = 'http://www.redbridge.gov.uk'.$year_href;

my $year_html = get($year_absurl) or die "couldn't fetch year page";
my $year_tree = HTML::TreeBuilder->new;
$year_tree->parse(decode_utf8($year_html));
$year_tree->eof;

# $day_re is deliberately a regex, not a literal: "[a-z]*" allows a suffix
# after the day number, and every run of spaces (including the padding %e
# puts before a one-digit day) becomes \s+.
my $day_re = strftime('Received %e[a-z]* %B %Y', 0, 0, 0, $day, $month-1, $year-1900);
$day_re =~ s/ +/\\s+/g;
my ($pdf_link) = $year_tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /$day_re/i }
);
$pdf_link or no_results($year, $month, $day, "Cannot find day link");
my $pdf_href = $pdf_link->attr('href');
defined $pdf_href or no_results($year, $month, $day, "Day link has no href");
my $pdf_absurl = 'http://www.redbridge.gov.uk/cms/'.$pdf_href;

# Fetch first, so that a failed download never leaves an empty temp file to
# be fed to the converter.
my $pdf_data = get($pdf_absurl);
defined $pdf_data or die "couldn't fetch pdf $pdf_absurl";

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;
print {$fh} $pdf_data;
close($fh) or die "cannot write temporary file $filename: $!";

print "Content-type: text/xml\n\n";
my $status  = system "./Redbridge.pl", $filename, $pdf_absurl;
my $failure = $status == -1 ? "could not run ./Redbridge.pl: $!"
            : $status != 0  ? "./Redbridge.pl returned wait status $status"
            :                 undef;
if (defined $failure) {
    # Best-effort cleanup; the converter failure is the error worth reporting.
    unlink $filename;
    die "system failed: $failure";
}
unlink $filename or die "cannot unlink temporary file $filename: $!";
| @@ -0,0 +1,115 @@ | |||||
| #!/usr/bin/perl -w | |||||
| use strict; | |||||
| use XML::Writer; | |||||
| use Date::Parse; | |||||
| use POSIX; | |||||
# Convert a Redbridge weekly-list PDF into the planning XML document.
#
# Usage: Redbridge.pl PDF_FILE INFO_URL
#
# The PDF is run through "pdftotext -layout", which keeps the columns of the
# list aligned. Each entry starts with a heading line "App.No: Location:
# Agent"; the character offsets of those three headings give the column
# boundaries used to slice every following line of the entry. Column 1
# carries the reference and deposit date, column 2 the address and then
# (after an empty column-2 line) the proposal. The XML is written to STDOUT.

# Turn a free-form date string into dd/mm/YYYY.
#
# Returns "" when Date::Parse cannot parse the string: strptime() then
# returns an empty list, and calling strftime() without its time fields
# would croak and truncate the XML document.
sub format_date_received {
    my ($text) = @_;
    my @parts = strptime($text);
    return "" unless @parts;
    # Pass only sec, min, hour, mday, mon, year; fields missing from the
    # input come back undef and are treated as 0.
    return strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @parts[0 .. 5]);
}

my $file     = $ARGV[0];
my $info_url = $ARGV[1];
defined $file or die "usage: $0 PDF_FILE INFO_URL\n";

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);
$writer->xmlDecl("UTF-8");
$writer->startTag("planning");
$writer->dataElement("authority_name", "London Borough of Redbridge");
$writer->dataElement("authority_short_name", "Redbridge");
$writer->startTag("applications");

open(my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
ENTRY:
while (my $line = <$fh>) {
    # Strip form feeds, which pdftotext inserts between pages. The pattern
    # must be spelled out as \f: an empty pattern (s///) would make Perl
    # reuse the last successfully matched regex.
    $line =~ s/\f//g;
    chomp $line;
    if ($line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/) {
        # Start offsets of the three column headings on this heading line.
        my $ofs_col1 = $-[1];
        my $ofs_col2 = $-[2];
        my $ofs_col3 = $-[3];
        my $col1_full = "";    # sometimes col1 headings break up onto 2 lines so grab the whole thing and match on it
        my $address = ""; my $proposal = "";
        my $cur_field = \$address;
        my $near_end = 0; my $redo_outer = 0;

        while (defined($line = <$fh>)) {
            if ($line =~ s/\f//g) {    # alignment may have changed for new page, take care of adjustments of at most 2
                                       # we may lose a few characters but luckily a page break during an entry doesn't
                                       # happen very often (only 4 times during 2005-July 2008)
                $ofs_col2 -= 2;
                $ofs_col3 -= 2;
            }
            if ($line =~ /^\s*App\.No:/) {
                # The next entry's heading has already been read; finish this
                # entry and re-run the outer loop body on the same line.
                $redo_outer = 1;
                last;
            }
            chomp $line;
            if ($line =~ /^\s{10,}[0-9]+\s*$/) {    # a line with a page number... ignore unless near end
                if ($near_end) {
                    last;
                } else {
                    next;
                }
            }

            # Slice the line into its first two columns; lines too short to
            # reach a column yield "" for it.
            my $col1; my $col2;
            if (length $line > $ofs_col1) {
                $col1 = substr $line, $ofs_col1, $ofs_col2-$ofs_col1;
            } else {
                $col1 = "";
            }
            if (length $line > $ofs_col2) {
                $col2 = substr $line, $ofs_col2, $ofs_col3-$ofs_col2;
            } else {
                $col2 = "";
            }
            $col2 =~ s/\s{10,}\S.*//g;    # remove any obvious spillover text (only needed for a page break during an entry)
            $col1 =~ s/^\s+//; $col1 =~ s/\s+$//;
            $col2 =~ s/^\s+//; $col2 =~ s/\s+$//;

            $col1_full .= " " if $col1_full ne "" and $col1 ne "";
            $col1_full .= $col1;

            if ($col2 eq "") {
                # An empty column-2 line separates fields: the first one
                # switches from address to proposal, a later one (once the
                # proposal has text) stops accumulation.
                if ($proposal eq "") {
                    $cur_field = \$proposal;
                } else {
                    $cur_field = undef;
                }
            } elsif (defined $cur_field) {
                $$cur_field .= " " if $$cur_field ne "" and $col2 ne "";
                $$cur_field .= $col2;
            }

            last if $near_end and $line =~ /^\s*$/;
            $near_end = 1 if $col1_full =~ /Case Officer/;
        }

        my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/);
        $refno ||= ""; $date_received ||= "";
        $refno =~ s/-? //g;

        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk');
        $writer->dataElement("date_received", format_date_received($date_received));
        $writer->endTag;

        if ($redo_outer) {
            $redo_outer = 0;
            # $line still holds the heading line of the next entry.
            redo ENTRY;
        }
    }
}
# Closing a piped handle waits for pdftotext and exposes its exit status.
# Only warn, so the document collected so far is still completed.
close $fh
    or warn($! ? "error closing pdftotext pipe: $!" : "pdftotext returned wait status $?");

$writer->endTag;
$writer->endTag;
$writer->end;