From 9a3b38c3256c9c656330ad2907e12090f2c624ba Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Thu, 17 Jul 2008 10:46:43 +0000 Subject: [PATCH] Add Peter's scrapers for Highland, Glasgow, and Brentwood. --- Review notes: Brentwood.pl and Glasgow.pl start pdftotext with the list form of a piped open (no shell involved) and die if it cannot be started, the same way Highland.pl does; Highland.cgi reports $? (the child status) rather than $| (the autoflush flag) when system returns non-zero. python_scrapers/Brentwood.cgi | 55 +++++++++++++++++++ python_scrapers/Brentwood.pl | 72 ++++++++++++++++++++++++ python_scrapers/Glasgow.cgi | 54 ++++++++++++++++++ python_scrapers/Glasgow.pl | 63 +++++++++++++++++++++ python_scrapers/Highland.cgi | 82 ++++++++++++++++++++++++++++ python_scrapers/Highland.pl | 72 ++++++++++++++++++++++++ python_scrapers/Ocella.py | 2 +- python_scrapers/OtherFilesToCopy.csv | 6 ++ 8 files changed, 405 insertions(+), 1 deletion(-) create mode 100644 python_scrapers/Brentwood.cgi create mode 100644 python_scrapers/Brentwood.pl create mode 100644 python_scrapers/Glasgow.cgi create mode 100644 python_scrapers/Glasgow.pl create mode 100644 python_scrapers/Highland.cgi create mode 100644 python_scrapers/Highland.pl diff --git a/python_scrapers/Brentwood.cgi b/python_scrapers/Brentwood.cgi new file mode 100644 index 0000000..38b5f79 --- /dev/null +++ b/python_scrapers/Brentwood.cgi @@ -0,0 +1,55 @@ +#!/usr/bin/perl -w + +use strict; + +use LWP::Simple; +use File::Temp qw(tempfile); +use POSIX; +use CGI; + +my $cgi = new CGI; + +my $year = $cgi->param("year"); +my $month = $cgi->param("month"); +my $day = $cgi->param("day"); + +unless (defined $year && defined $month && defined $day) { + print <]*>)*[^<]*$date/); +unless (defined $url) { + print < + + Brentwood Borough Council + Brentwood + + + +NIL + exit 0; +} + +my $dmy = sprintf("%02d/%02d/%04d", $day, $month, $year); + +my ($fh, $filename) = tempfile(SUFFIX => ".pdf"); +print $fh get($url); +close($fh); + +print "Content-type: text/xml\n\n"; +system "./Brentwood.pl", $filename, $url, $dmy; + +unlink $filename; diff --git a/python_scrapers/Brentwood.pl b/python_scrapers/Brentwood.pl new file mode 100644 index 0000000..0ef848e --- /dev/null +++ b/python_scrapers/Brentwood.pl @@ 
-0,0 +1,72 @@ +#!/usr/bin/perl -w + +use strict; +use XML::Writer; + +my $file = $ARGV[0]; +my $info_url = $ARGV[1]; +my $date = $ARGV[2]; + +my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2); + +$writer->xmlDecl("UTF-8"); + +$writer->startTag("planning"); +$writer->dataElement("authority_name", "Brentwood Borough Council"); +$writer->dataElement("authority_short_name", "Brentwood"); +$writer->startTag("applications"); + +open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!"; +while (my $line = <$fh>) { + chomp $line; + $line =~ s/ //g; + if ($line =~ /Address:/) { + my $ofs_col2 = $-[0]; + my $refno = substr $line, 0, $ofs_col2; + $refno =~ s/ +$//g; + my $address = ""; my $proposal = ""; + my $cur_field; + while (1) { + if (length($line) > $ofs_col2) { + my $col2 = substr $line, $ofs_col2; + $col2 =~ s/^ +//; + if ($col2 =~ s/^((A?d)?d)?ress://) { + $cur_field = \$address; + } elsif ($col2 =~ s/^((P?r)?o)?posal://) { + $cur_field = \$proposal; + } elsif ($col2 =~ s/^((A?p)?p)?licant://) { + $cur_field = undef; + } elsif ($col2 =~ s/^((A?g)?e)?nt://) { + $cur_field = undef; + } + $col2 =~ s/^ +//; $col2 =~ s/ +$//; + if (defined $cur_field) { + $$cur_field .= " " if $$cur_field ne ""; + $$cur_field .= $col2; + } + } + last unless defined ($line = <$fh>); + chomp $line; + $line =~ s/ //g; + last if length $line == 0; + } + my $postcode = "None"; + if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? 
?\d[A-Z][A-Z])/) { + $postcode = $1; + } + + $writer->startTag("application"); + $writer->dataElement("council_reference", $refno); + $writer->dataElement("address", $address); + $writer->dataElement("postcode", $postcode); + $writer->dataElement("description", $proposal); + $writer->dataElement("info_url", $info_url); + $writer->dataElement("comment_url", "planning\@brentwood.gov.uk"); + $writer->dataElement("date_received", $date); + $writer->endTag; + } +} + +$writer->endTag; +$writer->endTag; +$writer->end; diff --git a/python_scrapers/Glasgow.cgi b/python_scrapers/Glasgow.cgi new file mode 100644 index 0000000..20fa4af --- /dev/null +++ b/python_scrapers/Glasgow.cgi @@ -0,0 +1,54 @@ +#!/usr/bin/perl -w + +use strict; + +use LWP::Simple; +use File::Temp qw(tempfile); +use POSIX; +use CGI; + +my $cgi = new CGI; + +my $year = $cgi->param("year"); +my $month = $cgi->param("month"); +my $day = $cgi->param("day"); + +unless (defined $year && defined $month && defined $day) { + print < + + Glasgow City Council + Glasgow + + + +NIL + exit 0; +} +my $absurl = "http://www.glasgow.gov.uk$url"; + +my ($fh, $filename) = tempfile(SUFFIX => ".pdf"); +print $fh get($absurl); +close($fh); + +print "Content-type: text/xml\n\n"; +system "./Glasgow.pl", $filename, $absurl; + +unlink $filename; diff --git a/python_scrapers/Glasgow.pl b/python_scrapers/Glasgow.pl new file mode 100644 index 0000000..470b4c9 --- /dev/null +++ b/python_scrapers/Glasgow.pl @@ -0,0 +1,63 @@ +#!/usr/bin/perl -w + +use strict; +use XML::Writer; + +my $file = $ARGV[0]; +my $info_url = $ARGV[1]; + +my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2); + +$writer->xmlDecl("UTF-8"); + +$writer->startTag("planning"); +$writer->dataElement("authority_name", "Glasgow City Council"); +$writer->dataElement("authority_short_name", "Glasgow"); +$writer->startTag("applications"); + +open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!"; +while (my $line = <$fh>) { + if ($line =~ /^\s*Reference:\s*(\S+)/) { + my 
$refno = $1; + my $address = ""; my $proposal = ""; my $date_received; + my $cur_field; + while (1) { + chomp $line; + $line =~ s/^\s+//; $line =~ s/\s+$//; + if ($line =~ s/^Address://) { + $cur_field = \$address; + } elsif ($line =~ s/^Proposal://) { + $cur_field = \$proposal; + } elsif ($line =~ /^Date Received:\s*(\S+)/) { + $date_received = $1; + $date_received =~ s#\.#/#g; + $cur_field = undef; + } + $line =~ s/^\s+//; + if (defined $cur_field) { + $$cur_field .= " " if $$cur_field ne ""; + $$cur_field .= $line; + } + last if $line =~ /Map Reference:/; + last unless defined ($line = <$fh>); + } + my $postcode = "None"; + if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) { + $postcode = $1; + } + + $writer->startTag("application"); + $writer->dataElement("council_reference", $refno); + $writer->dataElement("address", $address); + $writer->dataElement("postcode", $postcode); + $writer->dataElement("description", $proposal); + $writer->dataElement("info_url", $info_url); + $writer->dataElement("comment_url", "planning.representations\@drs.glasgow.gov.uk"); + $writer->dataElement("date_received", $date_received); + $writer->endTag; + } +} + +$writer->endTag; +$writer->endTag; +$writer->end; diff --git a/python_scrapers/Highland.cgi b/python_scrapers/Highland.cgi new file mode 100644 index 0000000..f7f77d8 --- /dev/null +++ b/python_scrapers/Highland.cgi @@ -0,0 +1,82 @@ +#!/usr/bin/perl -w + +use strict; +use HTML::TreeBuilder; +use File::Temp qw(tempfile); +use LWP::Simple; +use POSIX; +use Encode; +use CGI; +use CGI::Carp; + +sub sanity_check { + my ($var) = @_; + defined $var or return 0; + $var =~ /^[0-9]+$/ or return 0; + return 1; +} + +sub no_results { + my ($y, $m, $d, $reason) = @_; + print < + + Highland Council + Highland + + + +NIL + die "$y/$m/$d failed: $reason\n"; +} + +my $cgi = new CGI; + +my $year = $cgi->param("year"); +my $month = $cgi->param("month"); +my $day = $cgi->param("day"); + +unless (sanity_check($year) && 
sanity_check($month) && sanity_check($day)) { + print <new; +# $tree->parse_file('weekly-planning-bw-lists.htm'); +$tree->parse(decode_utf8(get('http://www.highland.gov.uk/yourenvironment/planning/planningapplications/weekly-planning-bw-lists.htm') or die "couldn't fetch index page")); +$tree->eof; + +my $monthyear_re = strftime('%B[ \xa0]%Y', 0, 0, 0, 1, $month-1, $year-1900); + +my ($month_h2) = $tree->look_down( + "_tag", "h2", + sub { $_[0]->as_text =~ /$monthyear_re/ } +); +$month_h2 or no_results($year, $month, $day, "Cannot find month header"); + +my $month_list = $month_h2->right; + +my $day_re = strftime('Planning Applications (?:[A-Za-z0-9 ]*?to )?%b[a-z]* ?%e[a-z]', 0, 0, 0, $day, $month-1, $year-1900); + +my ($day_link) = $month_list->look_down( + "_tag", "a", + sub { $_[0]->as_text =~ /$day_re/ } +); +$day_link or no_results($year, $month, $day, "Cannot find day link"); + +my $day_absurl = "http://www.highland.gov.uk".$day_link->attr('href'); + +my ($fh, $filename) = tempfile(SUFFIX => ".pdf"); +print $fh get($day_absurl); +close($fh); + +print "Content-type: text/xml\n\n"; +system "./Highland.pl", $filename, $day_absurl and die "system failed: $?"; + +unlink $filename or die "cannot unlink temporary file $filename: $!"; diff --git a/python_scrapers/Highland.pl b/python_scrapers/Highland.pl new file mode 100644 index 0000000..c70edc4 --- /dev/null +++ b/python_scrapers/Highland.pl @@ -0,0 +1,72 @@ +#!/usr/bin/perl -w + +use strict; +use XML::Writer; + +my $file = $ARGV[0]; +my $info_url = $ARGV[1]; + +my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2); + +$writer->xmlDecl("UTF-8"); + +$writer->startTag("planning"); +$writer->dataElement("authority_name", "Highland Council"); +$writer->dataElement("authority_short_name", "Highland"); +$writer->startTag("applications"); + +open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!"; +while (my $line = <$fh>) { + if ($line =~ /^\s*Ref Number\s*(\S+)/) { + my $refno = $1; + 
my $address = ""; my $proposal = ""; my $case_officer = ""; my $date_received; + my $cur_field; + my $near_end; + while (1) { + chomp $line; + $line =~ s/^\s+//; $line =~ s/\s+$//; + if ($line =~ s/^Location of Works//) { + $cur_field = \$address; + } elsif ($line =~ s/^Description of Works//) { + $cur_field = \$proposal; + } elsif ($line =~ s/^Case Officer//) { + $cur_field = \$case_officer; + } elsif (($line =~ s/^Community Council//) || ($line =~ s/^Applicant Name//) || ($line =~ s/^Applicant Address//)) { + $cur_field = undef; + } elsif ($line =~ /^Validation Date\s*(\S+)/) { + $date_received = $1; + $cur_field = undef; + } + $line =~ s/^\s+//; + if (defined $cur_field) { + $$cur_field .= " " if $$cur_field ne ""; + $$cur_field .= $line; + } + last unless defined ($line = <$fh>); + last if $near_end && length $line == 1; + $near_end = 1 if $line =~ /^\s*Case Officer/; + } + my $postcode = "None"; + if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) { + $postcode = $1; + } + my $comment_url = "None"; + if ($case_officer =~ /([A-Za-z0-9\.]+\@[A-Za-z0-9\.]+)/) { + $comment_url = "$1"; + } + + $writer->startTag("application"); + $writer->dataElement("council_reference", $refno); + $writer->dataElement("address", $address); + $writer->dataElement("postcode", $postcode); + $writer->dataElement("description", $proposal); + $writer->dataElement("info_url", $info_url); + $writer->dataElement("comment_url", $comment_url); + $writer->dataElement("date_received", $date_received); + $writer->endTag; + } +} + +$writer->endTag; +$writer->endTag; +$writer->end; diff --git a/python_scrapers/Ocella.py b/python_scrapers/Ocella.py index eef26b6..924a349 100644 --- a/python_scrapers/Ocella.py +++ b/python_scrapers/Ocella.py @@ -208,7 +208,7 @@ if __name__ == '__main__': # parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") parser = OcellaParser("Havering", "Havering", 
"http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") + parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") print parser.getResults(21,5,2008) diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index c4994cd..55b8b0a 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -31,3 +31,9 @@ "Maldon.py", "420" "Medway.py", "420" "Shropshire.py", "420" +"Brentwood.pl", "493" +"Brentwood.cgi", "493" +"Glasgow.pl", "493" +"Glasgow.cgi", "493" +"Highland.pl", "493" +"Highland.cgi", "493"