From 825ac7d147933a0c39ac8dc7e17a545eb6509d2e Mon Sep 17 00:00:00 2001 From: pcc03 Date: Fri, 1 Aug 2008 16:33:07 +0000 Subject: [PATCH] North Ayrshire, Redbridge: added --- python_scrapers/NorthAyrshire.cgi | 85 ++++++++++++++++++++ python_scrapers/NorthAyrshire.pl | 66 +++++++++++++++ python_scrapers/OtherFilesToCopy.csv | 4 + python_scrapers/Redbridge.cgi | 85 ++++++++++++++++++++ python_scrapers/Redbridge.pl | 115 +++++++++++++++++++++++++++ 5 files changed, 355 insertions(+) create mode 100644 python_scrapers/NorthAyrshire.cgi create mode 100644 python_scrapers/NorthAyrshire.pl create mode 100644 python_scrapers/Redbridge.cgi create mode 100644 python_scrapers/Redbridge.pl diff --git a/python_scrapers/NorthAyrshire.cgi b/python_scrapers/NorthAyrshire.cgi new file mode 100644 index 0000000..ee3b7c6 --- /dev/null +++ b/python_scrapers/NorthAyrshire.cgi @@ -0,0 +1,85 @@ +#!/usr/bin/perl -w + +use strict; +use HTML::TreeBuilder; +use File::Temp qw(tempfile); +use LWP::Simple; +use POSIX; +use Encode; +use CGI; +use CGI::Carp; + +sub sanity_check { + my ($var) = @_; + defined $var or return 0; + $var =~ /^[0-9]+$/ or return 0; + return 1; +} + +sub no_results { + my ($y, $m, $d, $reason) = @_; + print < + + North Ayrshire Council + North Ayrshire + + + +NIL + die "$y/$m/$d failed: $reason\n"; +} + +my $cgi = new CGI; + +my $year = $cgi->param("year"); +my $month = $cgi->param("month"); +my $day = $cgi->param("day"); + +unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) { + print <new; +$tree->parse(decode_utf8(get('http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2||') or die "couldn't fetch index page")); +$tree->eof; + +my $re = strftime('Planning Applications Received week ending %d %B %Y', 0, 0, 0, $day, $month-1, $year-1900); + +my ($day_link) = $tree->look_down( + "_tag", "a", + sub { $_[0]->as_text =~ /$re/i } +); +$day_link or no_results($year, $month, $day, "Cannot find day link"); + +my $day_absurl = 'http://www.north-ayrshire.gov.uk'.$day_link->attr('href'); + +my $day_tree = HTML::TreeBuilder->new; +$day_tree->parse(decode_utf8(get($day_absurl) or die "couldn't fetch day page")); +$day_tree->eof; + +my ($pdf_img) = $day_tree->look_down( + "_tag", "img", + "alt", qr/\.pdf$/i +); +$pdf_img or die "couldn't find pdf image on day page"; +my $pdf_link = $pdf_img->parent; +$pdf_link or die "couldn't find pdf link on day page"; + +my $pdf_absurl = 'http://www.north-ayrshire.gov.uk'.$pdf_link->attr('href'); + +my ($fh, $filename) = tempfile(SUFFIX => ".pdf"); +print $fh get($pdf_absurl); +close($fh); + +print "Content-type: text/xml\n\n"; +system "./NorthAyrshire.pl", $filename, $pdf_absurl and die "system failed: $|"; + +unlink $filename or die "cannot unlink temporary file $filename: $!"; diff --git a/python_scrapers/NorthAyrshire.pl b/python_scrapers/NorthAyrshire.pl new file mode 100644 index 0000000..bf6f326 --- /dev/null +++ b/python_scrapers/NorthAyrshire.pl @@ -0,0 +1,66 @@ +#!/usr/bin/perl -w + +use strict; +use XML::Writer; +use Date::Parse; +use POSIX; + +my $file = $ARGV[0]; +my $info_url = $ARGV[1]; + +my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2); + +$writer->xmlDecl("UTF-8"); + +$writer->startTag("planning"); +$writer->dataElement("authority_name", "North Ayrshire Council"); +$writer->dataElement("authority_short_name", "North Ayrshire"); +$writer->startTag("applications"); + +open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!"; +while (my $line = <$fh>) { + if ($line =~ /^\s*Application No:\s*(\S+)/) { + my $refno = $1; + my $address = ""; my $proposal = ""; my $date_received = ""; + my $cur_field; + while (1) { + chomp $line; + $line =~ s/^\s+//; $line =~ s/\s+$//; + if ($line =~ s/^Location://) { + $cur_field = \$address; + } elsif ($line =~ s/^Description://) { + $cur_field = \$proposal; + } elsif ($line =~ s/^Date Registered://) { + $cur_field = \$date_received; + } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^Ward://)) { + $cur_field = undef; + } + $line =~ s/^\s+//; + if (defined $cur_field) { + $$cur_field .= " " if $$cur_field ne "" and $line ne ""; + $$cur_field .= $line; + } + last unless defined ($line = <$fh>); + last if $line =~ /^\s*application:/; + } + my $postcode = "None"; + if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) { + $postcode = $1; + } + my $norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } strptime($date_received)); + + $writer->startTag("application"); + $writer->dataElement("council_reference", $refno); + $writer->dataElement("address", $address); + $writer->dataElement("postcode", $postcode); + $writer->dataElement("description", $proposal); + $writer->dataElement("info_url", $info_url); + $writer->dataElement("comment_url", 'dcontrol@north-ayrshire.gov.uk'); + $writer->dataElement("date_received", $norm_date_received); + $writer->endTag; + } +} + +$writer->endTag; +$writer->endTag; +$writer->end; diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index ba4bc17..acd5cb7 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -38,3 +38,7 @@ "Highland.cgi", "493" "Newport.pl", "493" "Newport.cgi", "493" +"NorthAyrshire.pl", "493" +"NorthAyrshire.cgi", "493" +"Redbridge.pl", "493" +"Redbridge.cgi", "493" diff --git a/python_scrapers/Redbridge.cgi b/python_scrapers/Redbridge.cgi new file mode 100644 index 0000000..b04cfa3 --- /dev/null +++ b/python_scrapers/Redbridge.cgi @@ -0,0 +1,85 @@ +#!/usr/bin/perl -w + +use strict; +use HTML::TreeBuilder; +use File::Temp qw(tempfile); +use LWP::Simple; +use POSIX; +use Encode; +use CGI; +use CGI::Carp; + +sub sanity_check { + my ($var) = @_; + defined $var or return 0; + $var =~ /^[0-9]+$/ or return 0; + return 1; +} + +sub no_results { + my ($y, $m, $d, $reason) = @_; + print < + + London Borough of Redbridge + Redbridge + + + +NIL + die "$y/$m/$d failed: $reason\n"; +} + +my $cgi = new CGI; + +my $year = $cgi->param("year"); +my $month = $cgi->param("month"); +my $day = $cgi->param("day"); + +unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) { + print <new; +$tree->parse(decode_utf8(get('http://www.redbridge.gov.uk/cms/environment__planning/planning_and_regeneration/planning_dc.aspx') or die "couldn't fetch index page")); +$tree->eof; + +my $re = sprintf('Planning Applications Received %d', $year); + +my ($year_link) = $tree->look_down( + "_tag", "a", + sub { $_[0]->as_text =~ /$re/i } +); +$year_link or no_results($year, $month, $day, "Cannot find year link"); + +my $year_absurl = 'http://www.redbridge.gov.uk'.$year_link->attr('href'); + +my $year_tree = HTML::TreeBuilder->new; +$year_tree->parse(decode_utf8(get($year_absurl) or die "couldn't fetch day page")); +$year_tree->eof; + +my $day_re = strftime('Received %e[a-z]* %B %Y', 0, 0, 0, $day, $month-1, $year-1900); +$day_re =~ s/ +/\\s+/g; +my ($pdf_link) = $year_tree->look_down( + "_tag", "a", + sub { $_[0]->as_text =~ /$day_re/i } +); +$pdf_link or no_results($year, $month, $day, "Cannot find day link"); + +my $pdf_absurl = 'http://www.redbridge.gov.uk/cms/'.$pdf_link->attr('href'); + +my ($fh, $filename) = tempfile(SUFFIX => ".pdf"); +print $fh get($pdf_absurl); +close($fh); + +print "Content-type: text/xml\n\n"; +system "./Redbridge.pl", $filename, $pdf_absurl and die "system failed: $|"; + +unlink $filename or die "cannot unlink temporary file $filename: $!"; diff --git a/python_scrapers/Redbridge.pl b/python_scrapers/Redbridge.pl new file mode 100644 index 0000000..254432a --- /dev/null +++ b/python_scrapers/Redbridge.pl @@ -0,0 +1,115 @@ +#!/usr/bin/perl -w + +use strict; +use XML::Writer; +use Date::Parse; +use POSIX; + +my $file = $ARGV[0]; +my $info_url = $ARGV[1]; + +my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2); + +$writer->xmlDecl("UTF-8"); + +$writer->startTag("planning"); +$writer->dataElement("authority_name", "London Borough of Redbridge"); +$writer->dataElement("authority_short_name", "Redbridge"); +$writer->startTag("applications"); + +open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!"; +while (my $line = <$fh>) { + $line =~ s/ //g; + chomp $line; + if ($line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/) { + my $ofs_col1 = $-[1]; + my $ofs_col2 = $-[2]; + my $ofs_col3 = $-[3]; + my $col1_full = ""; # sometimes col1 headings break up onto 2 lines so grab the whole thing and match on it + my $address = ""; my $proposal = ""; + my $cur_field = \$address; + my $near_end = 0; my $redo_outer = 0; + while ($line = <$fh>) { + if ($line =~ s/ //g) { # alignment may have changed for new page, take care of adjustments of at most 2 + # we may lose a few characters but luckily a page break during an entry doesn't + # happen very often (only 4 times during 2005-July 2008) + $ofs_col2 -= 2; + $ofs_col3 -= 2; + } + + if ($line =~ /^\s*App\.No:/) { + $redo_outer = 1; + last; + } + + chomp $line; + if ($line =~ /^\s{10,}[0-9]+\s*$/) { # a line with a page number... ignore unless near end + if ($near_end) { + last; + } else { + next; + } + } + my $col1; my $col2; + if (length $line > $ofs_col1) { + $col1 = substr $line, $ofs_col1, $ofs_col2-$ofs_col1; + } else { + $col1 = ""; + } + if (length $line > $ofs_col2) { + $col2 = substr $line, $ofs_col2, $ofs_col3-$ofs_col2; + } else { + $col2 = ""; + } + + $col2 =~ s/\s{10,}\S.*//g; # remove any obvious spillover text (only needed for a page break during an entry) + + $col1 =~ s/^\s+//; $col1 =~ s/\s+$//; + $col2 =~ s/^\s+//; $col2 =~ s/\s+$//; + + $col1_full .= " " if $col1_full ne "" and $col1 ne ""; + $col1_full .= $col1; + + if ($col2 eq "") { + if ($proposal eq "") { + $cur_field = \$proposal; + } else { + $cur_field = undef; + } + } elsif (defined $cur_field) { + $$cur_field .= " " if $$cur_field ne "" and $col2 ne ""; + $$cur_field .= $col2; + } + last if $near_end and $line =~ /^\s*$/; + $near_end = 1 if $col1_full =~ /Case Officer/; + } + my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/); + $refno ||= ""; $date_received ||= ""; + $refno =~ s/-? //g; + + my $postcode = "None"; + if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) { + $postcode = $1; + } + my $norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } strptime($date_received)); + + $writer->startTag("application"); + $writer->dataElement("council_reference", $refno); + $writer->dataElement("address", $address); + $writer->dataElement("postcode", $postcode); + $writer->dataElement("description", $proposal); + $writer->dataElement("info_url", $info_url); + $writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk'); + $writer->dataElement("date_received", $norm_date_received); + $writer->endTag; + + if ($redo_outer) { + $redo_outer = 0; + redo; + } + } +} + +$writer->endTag; +$writer->endTag; +$writer->end;