diff --git a/trunk/python_scrapers/EastHerts.cgi b/trunk/python_scrapers/EastHerts.cgi
new file mode 100644
index 0000000..7b64f45
--- /dev/null
+++ b/trunk/python_scrapers/EastHerts.cgi
@@ -0,0 +1,134 @@
+#!/usr/local/bin/perl
+
+# Scraper for the East Herts planning search. Takes "day", "month" and
+# "year" CGI parameters, searches with that date as both ends of the
+# date range and writes the applications found to standard output as XML.
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use URI;
+use XML::Writer;
+
+# The master URLs for the East Herts planning search
+our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
+our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";
+our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID=";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date to fetch
+my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");
+
+# Construct an LWP user agent
+our $UA = LWP::UserAgent->new(env_proxy => 1);
+
+# Do the search
+my $page = do_post($SearchURL,
+                   {"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
+                    "REGTODATE.MAINBODY.WPACIS.1." => $date,
+                    "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "East Herts Council");
+$Writer->dataElement("authority_short_name", "East Herts");
+$Writer->startTag("applications");
+
+# Output any applications on the first page
+output_applications($page);
+
+# Loop over any additional results pages
+foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
+{
+    # Fetch this page...
+    $page = do_get(URI->new_abs($link->attr("href"), $SearchURL));
+
+    # ...and output the applications from it
+    output_applications($page);
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Make a GET request and return the response body parsed into an
+# HTML::TreeBuilder tree; dies with the status line on failure
+sub do_get
+{
+    my $response = $UA->get(@_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Make a POST request and return the response body parsed into an
+# HTML::TreeBuilder tree; dies with the status line on failure
+sub do_post
+{
+    my $response = $UA->post(@_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Output applications from a results page: one <application> element is
+# written to $Writer for every table row with at least three cells
+sub output_applications
+{
+    my $page = shift;
+
+    # Find the result table
+    my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2");
+
+    # No result table means there is nothing to output for this page
+    return unless $table;
+
+    # Process each row of the results
+    foreach my $row ($table->look_down("_tag" => "tr"))
+    {
+        my @cells = $row->look_down("_tag" => "td");
+
+        if (@cells >= 3)
+        {
+            my $reference = $cells[0]->as_trimmed_text;
+            my $description = $cells[1]->as_trimmed_text;
+            my $address = $cells[2]->as_trimmed_text;
+            my $postcode = "";
+
+            # Use the trailing postcode if the address has one, else leave it empty
+            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+            {
+                $postcode = $1;
+            }
+
+            $Writer->startTag("application");
+            $Writer->dataElement("council_reference", $reference);
+            $Writer->dataElement("address", $address);
+            $Writer->dataElement("postcode", $postcode);
+            $Writer->dataElement("description", $description);
+            $Writer->dataElement("info_url", $InfoURL . $reference);
+            $Writer->dataElement("comment_url", $CommentURL . $reference);
+            $Writer->dataElement("date_received", $date);
+            $Writer->endTag("application");
+        }
+    }
+
+    return;
+}
diff --git a/trunk/python_scrapers/NorthHerts.cgi b/trunk/python_scrapers/NorthHerts.cgi
new file mode 100644
index 0000000..ab8bb67
--- /dev/null
+++ b/trunk/python_scrapers/NorthHerts.cgi
@@ -0,0 +1,116 @@
+#!/usr/bin/perl
+
+# Scraper for the North Hertfordshire planning search. Takes "day",
+# "month" and "year" CGI parameters, posts that date as both regdate1 and
+# regdate2 and writes the applications found to standard output as XML.
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use URI;
+use XML::Writer;
+
+# The master URLs for the North Hertfordshire planning search
+our $SearchURL = "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date to fetch
+my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");
+
+# Construct an LWP user agent
+our $UA = LWP::UserAgent->new(env_proxy => 1);
+
+# Fetch the search page
+my $page = do_get($SearchURL);
+
+# Find the form submission URL
+my $form = $page->look_down("_tag" => "form", name => "frmSearch");
+die "Search form frmSearch not found\n" unless $form;
+my $url = URI->new_abs($form->attr("action"), $SearchURL);
+
+# Do the search
+$page = do_post($url, {"regdate1" => $date, "regdate2" => $date});
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "North Hertfordshire District Council");
+$Writer->dataElement("authority_short_name", "North Hertfordshire");
+$Writer->startTag("applications");
+
+# Process each table of the results
+foreach my $table ($page->look_down("_tag" => "table", "class" => "results-table"))
+{
+    # Flatten the table into one list of cells, in row order
+    my @cells = map { $_->look_down("_tag" => "td") } $table->look_down("_tag" => "tr");
+
+    # Skip any table without the five cells read below
+    next if @cells < 5;
+
+    # The details link is looked for in the reference cell; fall back to
+    # an empty URL rather than dying if it is missing
+    my $link = $cells[0]->look_down("_tag" => "a");
+    my $infourl = $link ? $link->attr("href") : undef;
+    $infourl = "" unless defined $infourl;
+
+    my $reference = $cells[0]->as_trimmed_text;
+    my $date = $cells[1]->as_trimmed_text;
+    my $address = $cells[3]->as_trimmed_text;
+    my $description = $cells[4]->as_trimmed_text;
+    my $postcode = "";
+
+    if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+    {
+        $postcode = $1;
+    }
+
+    $Writer->startTag("application");
+    $Writer->dataElement("council_reference", $reference);
+    $Writer->dataElement("address", $address);
+    $Writer->dataElement("postcode", $postcode);
+    $Writer->dataElement("description", $description);
+    $Writer->dataElement("info_url", $infourl);
+    $Writer->dataElement("comment_url", "mailto:service\@north-herts.gov.uk?subject=Comment on Planning Application");
+    $Writer->dataElement("date_received", $date);
+    $Writer->endTag("application");
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Make a GET request and return the response body parsed into an
+# HTML::TreeBuilder tree; dies with the status line on failure
+sub do_get
+{
+    my $response = $UA->get(@_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Make a POST request with Content_Type "form-data" and return the response
+# body parsed into an HTML::TreeBuilder tree; dies with the status line on failure
+sub do_post
+{
+    my $response = $UA->post(@_, Content_Type => "form-data");
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index 8ba04d7..803fb1a 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -7,3 +7,6 @@
 "AcolnetParser.py", "420"
 "MultipartPostHandler.py", "420"
 "FastWeb.py", "420"
+"broxbourne.cgi", "493"
+"EastHerts.cgi", "493"
+"NorthHerts.cgi", "493"
diff --git a/trunk/python_scrapers/broxbourne.cgi b/trunk/python_scrapers/broxbourne.cgi
new file mode 100644
index 0000000..53bfea3
--- /dev/null
+++ b/trunk/python_scrapers/broxbourne.cgi
@@ -0,0 +1,178 @@
+#!/usr/bin/perl
+
+# Scraper for the Broxbourne planning search. Takes "day", "month" and
+# "year" CGI parameters, drives the ASP.NET search form for that date and
+# writes the applications found to standard output as XML.
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use DateTime;
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use XML::Writer;
+
+# The master URL for the Broxbourne planning search
+our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date as an offset from 2000-01-01. param() is forced into
+# scalar context because in this argument list it would otherwise be
+# called in list context, where a missing or repeated parameter changes
+# the number of arguments DateTime->new receives
+my $epoch = DateTime->new(year => 2000, month => 1, day => 1);
+my $querydate = DateTime->new(year => scalar $query->param("year"),
+                              month => scalar $query->param("month"),
+                              day => scalar $query->param("day"));
+$querydate = $querydate->delta_days($epoch)->delta_days;
+
+# Construct an LWP user agent
+our $UA = LWP::UserAgent->new(env_proxy => 1);
+
+# Post the URL to get an initial blank form
+my $state = get_state(do_post());
+
+# Post each date in turn to build up the state - you can thank
+# Microsoft and ASP.NET for the horrible way we have to do this
+# by posting each argument in turn to build up the state
+$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate));
+$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate));
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "Borough of Broxbourne");
+$Writer->dataElement("authority_short_name", "Broxbourne");
+$Writer->startTag("applications");
+
+# Get the arguments for the search...
+my $args = {
+    "Srch" => "rb1",
+    "__VIEWSTATE" => $state,
+    "btnSearch" => "Search",
+    "tbReference" => "",
+    "tbRef2" => ""
+};
+
+# ...and then (at last) we can do the search!
+my $page = do_post($args);
+
+# Loop processing pages of results
+while ($page)
+{
+    my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1");
+
+    # Remember the state
+    $state = get_state($page);
+
+    # Clear the page for now - this will be reinitialised if we
+    # find another page of results to make us go round the loop
+    # all over again
+    undef $page;
+
+    # Check that we found a table - searches that find no results
+    # produce a page with no table in it
+    if ($table)
+    {
+        # Process each row of the results
+        foreach my $row ($table->look_down("_tag" => "tr"))
+        {
+            my @cells = $row->look_down("_tag" => "td");
+
+            # A row with no td cells has nothing for us to look at
+            next unless @cells;
+
+            if ($cells[0]->look_down("_tag" => "input"))
+            {
+                my $reference = $cells[1]->as_trimmed_text;
+                my $date = $cells[2]->as_trimmed_text;
+                my $address = $cells[3]->as_trimmed_text;
+                my $description = $cells[4]->as_trimmed_text;
+                my $postcode = "";
+
+                if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+                {
+                    $postcode = $1;
+                }
+
+                $Writer->startTag("application");
+                $Writer->dataElement("council_reference", $reference);
+                $Writer->dataElement("address", $address);
+                $Writer->dataElement("postcode", $postcode);
+                $Writer->dataElement("description", $description);
+                $Writer->dataElement("date_received", $date);
+                $Writer->endTag("application");
+            }
+            elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5")
+            {
+                foreach my $link ($cells[0]->look_down("_tag" => "a"))
+                {
+                    if ($link->as_trimmed_text eq ">" &&
+                        $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/)
+                    {
+                        $page = do_post_back($state, $1, $2);
+                    }
+                }
+            }
+        }
+    }
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Extract the state from a page so we can repost it. Returns the value
+# of the page's __VIEWSTATE input and dies if the page has none.
+sub get_state
+{
+    my $page = shift;
+    my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");
+
+    die "No __VIEWSTATE field found in response\n" unless $viewstate;
+
+    return $viewstate->attr("value");
+}
+
+# Fake up what the doPostBack javascript function in the page does...
+# Takes the current view state, the event target and the event argument;
+# "$" in the target is rewritten to ":" before posting.
+sub do_post_back
+{
+    my $state = shift;
+    my $target = shift;
+    my $argument = shift;
+
+    $target =~ s/\$/:/g;
+
+    my $args = {
+        "__EVENTTARGET" => $target,
+        "__EVENTARGUMENT" => $argument,
+        "__VIEWSTATE" => $state
+    };
+
+    return do_post($args);
+}
+
+# Post to the planning search page and return the response body parsed
+# into an HTML::TreeBuilder tree; dies with the status line on failure
+sub do_post
+{
+    my $response = $UA->post($SearchURL, @_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}