From 593e9d82724253ddb8bba30e809bfbc0c30ed509 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Mon, 10 Dec 2007 13:47:09 +0000
Subject: [PATCH] Add SouthSomerset, Christchurch and West Dorset. We still need to sort out comment urls here.

---
 trunk/python_scrapers/Christchurch.cgi     | 148 +++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |   3 +
 trunk/python_scrapers/SouthSomerset.cgi    | 163 +++++++++++++++++++++
 trunk/python_scrapers/WestDorset.cgi       | 148 +++++++++++++++++++
 4 files changed, 462 insertions(+)
 create mode 100644 trunk/python_scrapers/Christchurch.cgi
 create mode 100644 trunk/python_scrapers/SouthSomerset.cgi
 create mode 100644 trunk/python_scrapers/WestDorset.cgi

diff --git a/trunk/python_scrapers/Christchurch.cgi b/trunk/python_scrapers/Christchurch.cgi
new file mode 100644
index 0000000..91c5e71
--- /dev/null
+++ b/trunk/python_scrapers/Christchurch.cgi
@@ -0,0 +1,148 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use XML::Writer;
+
+
+# The master URLs for the Christchurch planning search: $SearchURL is the form every request is POSTed to, $InfoURL is the details-page prefix to which a council reference is appended
+our $SearchURL = "http://webapps.christchurch.gov.uk/PlanningApplications/pages/ApplicationSearch.aspx";
+our $InfoURL = "http://webapps.christchurch.gov.uk/PlanningApplications/pages/ApplicationDetails.aspx?Authority=Christchurch%20Borough%20Council&Application=";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date to fetch, as day/month/year built from the "day", "month" and "year" CGI parameters
+my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");
+
+# Construct an LWP user agent (env_proxy picks up proxy settings from the environment)
+our $UA = LWP::UserAgent->new(env_proxy => 1);
+
+# Post the URL to get an initial blank form (its __VIEWSTATE is copied into the search post below)
+my $page = do_post();
+
+# Do the search for the single day $date (NOTE(review): the date fields are DetailedSearch ones but the button posted is the QuickSearch one - confirm this is intended)
+$page = do_post($page,
+                {"DetailedSearch:TextBox_DateRaisedFrom" => $date,
+                 "DetailedSearch:TextBox_DateRaisedTo" => $date,
+                 "QuickSearchApplicationNumber:Button_SearchApplicationNumber" => "Search"});
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "Christchurch Council");
+$Writer->dataElement("authority_short_name", "Christchurch");
+$Writer->startTag("applications");
+
+# Output any applications on the first page
+output_applications($page);
+
+# Loop over any additional results pages, for as long as the current page carries a "Next" link
+while (my $link = $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next"))
+{
+    # Fetch this page...
+    $page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');
+
+    # ...and output the applications from it
+    output_applications($page);
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Fake up what the doPostBack javascript function in the page does...
+sub do_post_back
+{
+    my $previous = shift;
+    my $target = shift;
+    my $argument = shift;
+
+    $target =~ s/\$/:/g; # callers pass the "$"-separated target; it is posted ":"-separated
+
+    my $args = {
+        "__EVENTTARGET" => $target,
+        "__EVENTARGUMENT" => $argument,
+    };
+
+    return do_post($previous, $args);
+}
+
+# Make a POST request to $SearchURL, copying __VIEWSTATE from the previous page (if one is given) into the arguments; dies with the status line on failure, otherwise returns the parsed HTML tree
+sub do_post
+{
+    my $previous = shift;
+    my $args = shift || {};
+
+    if (defined($previous))
+    {
+        my $viewstate = $previous->look_down("_tag" => "input", "name" => "__VIEWSTATE");
+#        my $eventvalidation = $previous->look_down("_tag" => "input", "name" => "__EVENTVALIDATION");
+
+        $args->{"__VIEWSTATE"} = $viewstate->attr("value");
+#        $args->{"__EVENTVALIDATION"} = $eventvalidation->attr("value");
+    }
+
+    my $response = $UA->post($SearchURL, $args);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Output applications from a results page: one application element per non-header row of the results table
+sub output_applications
+{
+    my $page = shift;
+
+    # Find the result table (the table with class "searchresults")
+    my $table = $page->look_down("_tag" => "table", "class" => "searchresults");
+
+    # No results means no results table
+    if (defined($table))
+    {
+        # Process each row of the results, skipping the header row
+        foreach my $row ($table->look_down("_tag" => "tr"))
+        {
+            my $class = $row->attr("class") || "";
+
+            next if $class eq "searchresultsheader";
+
+            my @cells = $row->look_down("_tag" => "td");
+            my $reference = $cells[0]->as_trimmed_text;
+            my $date = $cells[1]->as_trimmed_text;
+            my $address = $cells[2]->as_trimmed_text;
+            my $description = $cells[3]->as_trimmed_text;
+            my $postcode; # stays undefined unless the address ends in something postcode-shaped
+
+            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+            {
+                $postcode = $1;
+            }
+
+            $Writer->startTag("application");
+            $Writer->dataElement("council_reference", $reference);
+            $Writer->dataElement("address", $address);
+            $Writer->dataElement("postcode", $postcode);
+            $Writer->dataElement("description", $description);
+            $Writer->dataElement("info_url", $InfoURL . $reference);
+            $Writer->dataElement("date_received", $date);
+            $Writer->endTag("application");
+        }
+    }
+
+    return;
+}
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index a8a0f9c..432bb93 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -16,3 +16,6 @@
 "PlanningExplorer.py", "420"
 "SwiftLG.py", "420"
 "Dacorum.cgi", "493"
+"SouthSomerset.cgi", "493"
+"WestDorset.cgi", "493"
+"Christchurch.cgi", "493"
diff --git a/trunk/python_scrapers/SouthSomerset.cgi b/trunk/python_scrapers/SouthSomerset.cgi
new file mode 100644
index 0000000..333575e
--- /dev/null
+++ b/trunk/python_scrapers/SouthSomerset.cgi
@@ -0,0 +1,163 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use XML::Writer;
+
+# Month names, keyed by month number, used to build the day-Mon-year dates sent to the search form
+our %Months = (  1 => "Jan",  2 => "Feb",  3 => "Mar",  4 => "Apr",
+                 5 => "May",  6 => "Jun",  7 => "Jul",  8 => "Aug",
+                 9 => "Sep", 10 => "Oct", 11 => "Nov", 12 => "Dec" );
+
+# The master URLs for the South Somerset planning search: $StartURL takes the terms-and-conditions acceptance, $SearchURL takes the search itself
+our $StartURL = "http://www.southsomerset.gov.uk/index.jsp?articleid=1925&page_name=startsearch";
+our $SearchURL = "http://www.southsomerset.gov.uk/index.jsp?articleid=1925&page_name=searchresults";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date to fetch, as day-Mon-year built from the "day", "month" and "year" CGI parameters
+my $date = $query->param("day") . "-" . $Months{$query->param("month")} . "-" . $query->param("year");
+
+# Construct an LWP user agent (the empty cookie_jar keeps the cookie set by the terms-and-conditions post)
+our $UA = LWP::UserAgent->new(env_proxy => 1, cookie_jar => {});
+
+# Post acceptance of terms and conditions to get a cookie
+do_post($StartURL, {"acceptTC" => "on"});
+
+# Do the search for the single day $date (start and end of the range are the same)
+my $page = do_post($SearchURL,
+                   {"startdate" => $date,
+                    "enddate" => $date,
+                    "datesearch" => "applications",
+                    "timeframe" => "yearonly",
+                    "btnsubmit" => "search",
+                    "address" => "",
+                    "area" => "",
+                    "caseno" => "",
+                    "decision" => "",
+                    "location" => "",
+                    "parish" => "",
+                    "postcode" => "",
+                    "recentweeks" => "",
+                    "ward" => ""});
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "South Somerset District Council");
+$Writer->dataElement("authority_short_name", "South Somerset");
+$Writer->startTag("applications");
+
+# Output any applications on the first page
+output_applications($page);
+
+# Loop over any additional results pages, for as long as the current page has a "Next Page" link
+while (my $link = $page->look_down("_tag" => "a", sub { $_[0]->as_text eq "Next Page" }))
+{
+    # Fetch this page (the link's href is resolved against $SearchURL)...
+    $page = do_get(URI->new_abs($link->attr("href"), $SearchURL));
+
+    # ...and output the applications from it
+    output_applications($page);
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Make a GET request (arguments go straight to the user agent's get); dies with the status line on failure, otherwise returns the parsed HTML tree
+sub do_get
+{
+    my $response = $UA->get(@_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Make a POST request (arguments go straight to the user agent's post); dies with the status line on failure, otherwise returns the parsed HTML tree
+sub do_post
+{
+    my $response = $UA->post(@_);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Output applications from a results page; each application is spread over several table rows and is written out when a separator row is reached
+sub output_applications
+{
+    my $page = shift;
+    my $reference;
+    my $address;
+    my $postcode;
+    my $description;
+    my $date_received;
+    my $info_url;
+
+    # Find the result table (the first table inside the "mainText" div)
+    my $table = $page->look_down("_tag" => "div", "class" => "mainText")->look_down("_tag" => "table");
+
+    # Process each row of the results (NOTE(review): a record not followed by an hr row is never written - confirm the table always ends with one)
+    foreach my $row ($table->look_down("_tag" => "tr"))
+    {
+        my @cells = $row->look_down("_tag" => "td");
+
+        if (@cells == 1 && $cells[0]->look_down("_tag" => "hr")) # separator row: flush the pending record
+        {
+            if (defined($reference))
+            {
+                $Writer->startTag("application");
+                $Writer->dataElement("council_reference", $reference);
+                $Writer->dataElement("address", $address);
+                $Writer->dataElement("postcode", $postcode);
+                $Writer->dataElement("description", $description);
+                $Writer->dataElement("info_url", $info_url);
+                $Writer->dataElement("date_received", $date_received);
+                $Writer->endTag("application");
+            }
+
+            undef $reference;
+            undef $address;
+            undef $postcode;
+            undef $description;
+            undef $date_received;
+            undef $info_url;
+        }
+        elsif (@cells == 1 && defined($reference)) # single-cell row after a reference row: the proposal text
+        {
+            $description = $cells[0]->as_trimmed_text;
+
+            $description =~ s/^Proposal:\s*//;
+        }
+        elsif (@cells == 5 && $cells[0]->as_trimmed_text =~ /^\d+/) # five-cell row starting with a digit: starts a new record
+        {
+            $reference = $cells[0]->as_trimmed_text;
+            $date_received = $cells[1]->as_trimmed_text;
+            $address = $cells[2]->as_trimmed_text;
+            $info_url = URI->new_abs($cells[4]->look_down("_tag" => "a")->attr("href"), $SearchURL);
+
+            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+            {
+                $postcode = $1;
+            }
+        }
+    }
+
+    return;
+}
diff --git a/trunk/python_scrapers/WestDorset.cgi b/trunk/python_scrapers/WestDorset.cgi
new file mode 100644
index 0000000..5b0b6b0
--- /dev/null
+++ b/trunk/python_scrapers/WestDorset.cgi
@@ -0,0 +1,148 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use CGI qw(:cgi);
+use HTML::TreeBuilder;
+use LWP::UserAgent;
+use XML::Writer;
+
+
+# The master URLs for the West Dorset planning search: $SearchURL is the form every request is POSTed to, $InfoURL is the details-page prefix to which a council reference is appended
+our $SearchURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx";
+our $InfoURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Authority=West%20Dorset%20District%20Council&Application=";
+
+# We're a CGI script...
+my $query = CGI->new();
+
+# Get the date to fetch, as day/month/year built from the "day", "month" and "year" CGI parameters
+my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");
+
+# Construct an LWP user agent (env_proxy picks up proxy settings from the environment)
+our $UA = LWP::UserAgent->new(env_proxy => 1);
+
+# Post the URL to get an initial blank form (its hidden state fields are copied into the search post below)
+my $page = do_post();
+
+# Do the search for the single day $date (start and end of the range are the same)
+$page = do_post($page,
+                {"DetailedSearch\$TextBox_DateRaisedFrom" => $date,
+                 "DetailedSearch\$TextBox_DateRaisedTo" => $date,
+                 "DetailedSearch\$Button_DetailedSearch" => "Search"});
+
+# Output an HTTP response header
+print $query->header(-type => "text/xml");
+
+# Create an XML output stream
+my $Writer = XML::Writer->new(DATA_MODE => 1);
+
+# Output the XML header data
+$Writer->xmlDecl("UTF-8");
+$Writer->startTag("planning");
+$Writer->dataElement("authority_name", "West Dorset District Council");
+$Writer->dataElement("authority_short_name", "West Dorset");
+$Writer->startTag("applications");
+
+# Output any applications on the first page
+output_applications($page);
+
+# Loop over any additional results pages, for as long as the current page carries a "Next" link
+while (my $link = $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next"))
+{
+    # Fetch this page...
+    $page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');
+
+    # ...and output the applications from it
+    output_applications($page);
+}
+
+# Finish off XML output
+$Writer->endTag("applications");
+$Writer->endTag("planning");
+$Writer->end();
+
+exit 0;
+
+# Fake up what the doPostBack javascript function in the page does...
+sub do_post_back
+{
+    my $previous = shift;
+    my $target = shift;
+    my $argument = shift;
+
+    $target =~ s/\$/:/g; # every "$" in the target is replaced by ":" before it is posted
+
+    my $args = {
+        "__EVENTTARGET" => $target,
+        "__EVENTARGUMENT" => $argument,
+    };
+
+    return do_post($previous, $args);
+}
+
+# Make a POST request to $SearchURL, copying __VIEWSTATE and __EVENTVALIDATION from the previous page (if one is given) into the arguments; dies with the status line on failure, otherwise returns the parsed HTML tree
+sub do_post
+{
+    my $previous = shift;
+    my $args = shift || {};
+
+    if (defined($previous))
+    {
+        my $viewstate = $previous->look_down("_tag" => "input", "name" => "__VIEWSTATE");
+        my $eventvalidation = $previous->look_down("_tag" => "input", "name" => "__EVENTVALIDATION");
+
+        $args->{"__VIEWSTATE"} = $viewstate->attr("value");
+        $args->{"__EVENTVALIDATION"} = $eventvalidation->attr("value");
+    }
+
+    my $response = $UA->post($SearchURL, $args);
+
+    die $response->status_line unless $response->is_success;
+
+    return HTML::TreeBuilder->new_from_content($response->content);
+}
+
+# Output applications from a results page: one application element per non-header row of the results table
+sub output_applications
+{
+    my $page = shift;
+
+    # Find the result table (the table with class "searchresults")
+    my $table = $page->look_down("_tag" => "table", "class" => "searchresults");
+
+    # No results means no results table
+    if (defined($table))
+    {
+        # Process each row of the results, skipping the header row
+        foreach my $row ($table->look_down("_tag" => "tr"))
+        {
+            my $class = $row->attr("class") || "";
+
+            next if $class eq "searchresultsheader";
+
+            my @cells = $row->look_down("_tag" => "td");
+            my $reference = $cells[0]->as_trimmed_text;
+            my $date = $cells[1]->as_trimmed_text;
+            my $address = $cells[2]->as_trimmed_text;
+            my $description = $cells[3]->as_trimmed_text;
+            my $postcode; # stays undefined unless the address ends in something postcode-shaped
+
+            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
+            {
+                $postcode = $1;
+            }
+
+            $Writer->startTag("application");
+            $Writer->dataElement("council_reference", $reference);
+            $Writer->dataElement("address", $address);
+            $Writer->dataElement("postcode", $postcode);
+            $Writer->dataElement("description", $description);
+            $Writer->dataElement("info_url", $InfoURL . $reference); # details URL is the prefix plus the council reference
+            $Writer->dataElement("date_received", $date);
+            $Writer->endTag("application");
+        }
+    }
+
+    return;
+}