diff --git a/trunk/cgi-bin/EastHerts.cgi b/trunk/cgi-bin/EastHerts.cgi deleted file mode 100644 index 7b64f45..0000000 --- a/trunk/cgi-bin/EastHerts.cgi +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/local/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URLs for the East Herts planning search -our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA"; -our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID="; -our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID="; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date to fetch -my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year"); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Do the search -my $page = do_post($SearchURL, - {"REGFROMDATE.MAINBODY.WPACIS.1." => $date, - "REGTODATE.MAINBODY.WPACIS.1." => $date, - "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "East Herts Council"); -$Writer->dataElement("authority_short_name", "East Herts"); -$Writer->startTag("applications"); - -# Output any applications on the first page -output_applications($page); - -# Loop over any additional results pages -foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/)) -{ - # Fetch this page... - $page = do_get(URI->new_abs($link->attr("href"), $SearchURL)); - - # ...and output the applications from it - output_applications($page); -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Make a GET request -sub do_get -{ - my $response = $UA->get(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Make a POST request -sub do_post -{ - my $response = $UA->post(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Output applications from a results page -sub output_applications -{ - my $page = shift; - - # Find the result table - my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2"); - - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my @cells = $row->look_down("_tag" => "td"); - - if (@cells >= 3) - { - my $reference = $cells[0]->as_trimmed_text; - my $description = $cells[1]->as_trimmed_text; - my $address = $cells[2]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $InfoURL . $reference); - $Writer->dataElement("comment_url", $CommentURL . $reference); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - } - - return; -} diff --git a/trunk/cgi-bin/NorthHerts.cgi b/trunk/cgi-bin/NorthHerts.cgi deleted file mode 100644 index ab8bb67..0000000 --- a/trunk/cgi-bin/NorthHerts.cgi +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URLs for the North Hertfordshire planning search -our $SearchURL = "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date to fetch -my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year"); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Fetch the search page -my $page = do_get($SearchURL); - -# Find the form submission URL -my $form = $page->look_down("_tag" => "form", name => "frmSearch"); -my $url = URI->new_abs($form->attr("action"), $SearchURL); - -# Do the search -$page = do_post($url, {"regdate1" => $date, "regdate2" => $date}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "North Hertfordshire District Council"); -$Writer->dataElement("authority_short_name", "North Hertfordshire"); -$Writer->startTag("applications"); - -# Process each table of the results -foreach my $table ($page->look_down("_tag" => "table", "class" => "results-table")) -{ - my @rows = map { $_->look_down("_tag" => "td") } $table->look_down("_tag" => "tr"); - my $reference = $rows[0]->as_trimmed_text; - my $infourl = $rows[0]->look_down("_tag" => "a")->attr("href"); - my $date = $rows[1]->as_trimmed_text; - my $address = $rows[3]->as_trimmed_text; - my $description = $rows[4]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $infourl); - $Writer->dataElement("comment_url", "mailto:service\@north-herts.gov.uk?subject=Comment on Planning Application"); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Make a GET request -sub do_get -{ - my $response = $UA->get(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Make a POST request -sub do_post -{ - my $response = $UA->post(@_, Content_Type => "form-data"); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} diff --git a/trunk/cgi-bin/broxbourne.cgi b/trunk/cgi-bin/broxbourne.cgi deleted file mode 100644 index 53bfea3..0000000 --- a/trunk/cgi-bin/broxbourne.cgi +++ /dev/null @@ -1,162 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use DateTime; -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URL for the Broxbourne planning search -our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx"; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date as an offset from 2000-01-01 -my $epoch = DateTime->new(year => 2000, month => 1, day => 1); -my $querydate = DateTime->new(year => $query->param("year"), - month => $query->param("month"), - day => $query->param("day")); -$querydate = $querydate->delta_days($epoch)->delta_days; - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Post the URL to get an initial blank form -my $state = get_state(do_post()); - -# Post each date in turn to build up the state - you can thank -# Microsoft and ASP.NET for the horrible way we have to do this -# by posting each argument in turn to build up the state -$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate)); -$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate)); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "Borough of Broxbourne"); -$Writer->dataElement("authority_short_name", "Broxbourne"); -$Writer->startTag("applications"); - -# Get the arguments for the search... -my $args = { - "Srch" => "rb1", - "__VIEWSTATE" => $state, - "btnSearch" => "Search", - "tbReference" => "", - "tbRef2" => "" -}; - -# ...and then (at last) we can do the search! -my $page = do_post($args); - -# Loop processing pages of results -while ($page) -{ - my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1"); - - # Remember the state - $state = get_state($page); - - # Clear the page for now - this will be reinitialised if we - # find another page of results to make us go round the loop - # all over again - undef $page; - - # Check that we found a table - searches that find no results - # produce a page with no table in it - if ($table) - { - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my @cells = $row->look_down("_tag" => "td"); - - if ($cells[0]->look_down("_tag" => "input")) - { - my $reference = $cells[1]->as_trimmed_text; - my $date = $cells[2]->as_trimmed_text; - my $address = $cells[3]->as_trimmed_text; - my $description = $cells[4]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5") - { - foreach my $link ($cells[0]->look_down("_tag" => "a")) - { - if ($link->as_trimmed_text eq ">" && - $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/) - { - $page = do_post_back($state, $1, $2); - } - } - } - } - } -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Extract the state from a page so we can repost it -sub get_state -{ - my $page = shift; - my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE"); - - return $viewstate->attr("value"); -} - -# Fake up what the doPostBack javascript function in the page does... -sub do_post_back -{ - my $state = shift; - my $target = shift; - my $argument = shift; - - $target =~ s/\$/:/g; - - my $args = { - "__EVENTTARGET" => $target, - "__EVENTARGUMENT" => $argument, - "__VIEWSTATE" => $state - }; - - return do_post($args); -} - -# Post to the planning search page -sub do_post -{ - my $response = $UA->post($SearchURL, @_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} diff --git a/trunk/docs/include/scraper_support.php b/trunk/docs/include/scraper_support.php index 45d229e..4ada89c 100644 --- a/trunk/docs/include/scraper_support.php +++ b/trunk/docs/include/scraper_support.php @@ -287,43 +287,27 @@ function scrape_applications_islington ($search_url, $info_url_base, $comment_ur return "http://maps.google.co.uk/maps?q=$postcode&z=$zoom"; } + //postcode to location function postcode_to_location($postcode){ - - $x = 0; - $y = 0; - + + // We don't actually need to fetch the page, we + // can get everything we need from the url we are + // redirected to. $clean_postcode = strtolower($postcode); $clean_postcode = str_replace(" ","+", $clean_postcode); + $url = "http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $clean_postcode; - $html = file_get_contents($url); - $x_pattern = "/var _LocationX=\d*;/"; - $y_pattern = "/var _LocationY=\d*;/"; - - //X - preg_match($x_pattern, $html, $matches); - if(sizeof($matches) >0){ - $x = $matches[0]; - $x = str_replace('var _LocationX=',"", $x); - $x = str_replace(";","", $x); - } - - //Y - preg_match($y_pattern, $html, $matches); - if(sizeof($matches) >0){ - $y = str_replace("var _LocationY=","", $matches[0]); - $y = str_replace(";","", $y); - } - - $return = array(); - - $return[0] = $x; - $return[1] = $y; - return $return; + $headers = get_headers($url, 1); + $location = $headers["Location"]; + $location_regex = "/x=(\d*)&y=(\d*)&/"; + + preg_match ($location_regex, $location, $matches); + + return array_slice ($matches, 1); } - - + function valid_email ($string) { $valid = false;