diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv index eae8963..dcdda49 100644 --- a/trunk/python_scrapers/OtherFilesToCopy.csv +++ b/trunk/python_scrapers/OtherFilesToCopy.csv @@ -16,7 +16,6 @@ "SwiftLG.py", "420" "Dacorum.cgi", "493" "SouthSomerset.cgi", "493" -"WestDorset.cgi", "493" "Christchurch.cgi", "493" "WAM.py", "420" "Planet.py", "420" @@ -60,3 +59,4 @@ "Exmoor.py", "420" "Eastbourne.py", "420" "Gosport.py", "420" +"WestDorset.py", "420" diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 0661ca7..2c4bfa8 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -265,3 +265,4 @@ "Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser" "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser" +"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser" diff --git a/trunk/python_scrapers/WestDorset.cgi b/trunk/python_scrapers/WestDorset.cgi deleted file mode 100644 index b888315..0000000 --- a/trunk/python_scrapers/WestDorset.cgi +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - - -# The master URLs for the West Dorset planning search -our $SearchURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx"; -our $InfoURL = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Authority=West%20Dorset%20District%20Council&Application="; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date to fetch -my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year"); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Post the URL to get an initial blank form -my $page = do_post(); - -# Do the search -$page = do_post($page, - {"DetailedSearch\$TextBox_DateRaisedFrom" => $date, - "DetailedSearch\$TextBox_DateRaisedTo" => $date, - "DetailedSearch\$Button_DetailedSearch" => "Search"}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "West Dorset District Council"); -$Writer->dataElement("authority_short_name", "West Dorset"); -$Writer->startTag("applications"); - -# Output any applications on the first page -output_applications($page); - -# Loop over any additional results pages -while (my $link = $page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next")) -{ - # Fetch this page... - $page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', ''); - - # ...and output the applications from it - output_applications($page); -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Fake up what the doPostBack javascript function in the page does... -sub do_post_back -{ - my $previous = shift; - my $target = shift; - my $argument = shift; - - $target =~ s/\$/:/g; - - my $args = { - "__EVENTTARGET" => $target, - "__EVENTARGUMENT" => $argument, - }; - - return do_post($previous, $args); -} - -# Make a POST request -sub do_post -{ - my $previous = shift; - my $args = shift || {}; - - if (defined($previous)) - { - my $viewstate = $previous->look_down("_tag" => "input", "name" => "__VIEWSTATE"); - my $eventvalidation = $previous->look_down("_tag" => "input", "name" => "__EVENTVALIDATION"); - - $args->{"__VIEWSTATE"} = $viewstate->attr("value"); - $args->{"__EVENTVALIDATION"} = $eventvalidation->attr("value"); - } - - my $response = $UA->post($SearchURL, $args); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Output applications from a results page -sub output_applications -{ - my $page = shift; - - # Find the result table - my $table = $page->look_down("_tag" => "table", "class" => "searchresults"); - - # No results means no results table - if (defined($table)) - { - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my $class = $row->attr("class") || ""; - - next if $class eq "searchresultsheader"; - - my @cells = $row->look_down("_tag" => "td"); - my $reference = $cells[0]->as_trimmed_text; - my $date = $cells[1]->as_trimmed_text; - my $address = $cells[2]->as_trimmed_text; - my $description = $cells[3]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $InfoURL . $reference); - $Writer->dataElement("comment_url", $InfoURL . $reference); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - } - - return; -} diff --git a/trunk/python_scrapers/WestDorset.py b/trunk/python_scrapers/WestDorset.py new file mode 100644 index 0000000..6a88036 --- /dev/null +++ b/trunk/python_scrapers/WestDorset.py @@ -0,0 +1,88 @@ +import urllib2 +import urllib +import datetime +import re + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import PlanningApplication, \ + PlanningAuthorityResults, \ + getPostcodeFromText + +date_format = "%d/%m/%Y" + +class WestDorsetParser: + def __init__(self, *args): + + self.authority_name = "West Dorset District Council" + self.authority_short_name = "West Dorset" + + self.base_url = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/applicationsearch.aspx" + self.info_url = "http://webapps.westdorset-dc.gov.uk/planningapplications/pages/ApplicationDetails.aspx?Application=%s&Authority=West+Dorset+District+Council+" + + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + + def getResultsByDayMonthYear(self, day, month, year): + search_date = datetime.date(year, month, day) + + get_response = urllib2.urlopen(self.base_url) + + get_soup = BeautifulSoup(get_response.read()) + + post_data = ( + ("__VIEWSTATE", get_soup.find("input", id="__VIEWSTATE")["value"]), +# ("QuickSearchApplicationNumber$TextBox_ApplicationNumber", ""), +# ("QuickSearchThisWeek$DropDownList_PastWeek", ""), +# ("DetailedSearch$TextBox_PropertyNameNumber", ""), +# ("DetailedSearch$Textbox_StreetName", ""), +# ("DetailedSearch$Textbox_TownVillage", ""), +# ("DetailedSearch$Textbox_Postcode", ""), +# ("DetailedSearch$Textbox_Parish", ""), +# ("DetailedSearch$Textbox_ApplicantSurname", ""), +# ("DetailedSearch$TextBox_AgentName", ""), + ("DetailedSearch$TextBox_DateRaisedFrom", search_date.strftime(date_format)), + ("DetailedSearch$TextBox_DateRaisedTo", search_date.strftime(date_format)), +# ("DetailedSearch$TextBox_DecisionFrom", "dd%2Fmm%2Fyyyy"), +# ("DetailedSearch$TextBox_DecisionTo", "dd%2Fmm%2Fyyyy"), + ("DetailedSearch$Button_DetailedSearch", "Search"), + ("__EVENTVALIDATION", get_soup.find("input", id="__EVENTVALIDATION")["value"]), + ) + + # The response to the GET is a redirect. We'll need to post to the new url. + post_response = urllib2.urlopen(get_response.url, urllib.urlencode(post_data)) + post_soup = BeautifulSoup(post_response.read()) + + if not post_soup.find(text = re.compile("No matching record")): + # The first row contains headers. + trs = post_soup.find("table", {"class": "searchresults"}).findAll("tr")[1:] + + for tr in trs: + application = PlanningApplication() + + # We can fill the date received in straight away from the date we searched for. + application.date_received = search_date + + tds = tr.findAll("td") + + application.council_reference = tds[0].font.string.strip() + application.address = tds[2].font.string.strip() + application.postcode = getPostcodeFromText(application.address) + application.description = tds[3].font.string.strip() + + # Set the info url and the comment url to be the same - can't get to the comment + # one directly without javascript. + application.info_url = self.info_url %(application.council_reference) + application.comment_url = application.info_url + + self._results.addApplication(application) + + return self._results + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + +if __name__ == '__main__': + parser = WestDorsetParser() + print parser.getResults(1,10,2008) +