Add python parser for West Dorset, and remove the non-working perl one.

duncan.parkes 16 years ago
4 changed files with 90 additions and 150 deletions
  1. +1
  2. +1
  3. +0
  4. +88

+ 1
- 1
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -16,7 +16,6 @@
"", "420"
"Dacorum.cgi", "493"
"SouthSomerset.cgi", "493"
"WestDorset.cgi", "493"
"Christchurch.cgi", "493"
"", "420"
"", "420"
@@ -60,3 +59,4 @@
"", "420"
"", "420"
"", "420"
"", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -265,3 +265,4 @@
"Waltham Forest Council", "Waltham Forest", "", "PlanningExplorer", "WalthamForestParser"
"Caerphilly County Borough Council", "Caerphilly", "", "PublicAccess", "PublicAccessParser"
"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"

+ 0
- 149
trunk/python_scrapers/WestDorset.cgi View File

@@ -1,149 +0,0 @@

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the West Dorset planning search.
# NOTE(review): both values are empty in this copy of the file; the real
# endpoints must be filled in before the script can fetch anything.
our $SearchURL = "";
our $InfoURL = "";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch, formatted day/month/year for the search form
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent (honouring any proxy set in the environment)
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form; this gives us the hidden
# ASP.NET state fields that must be echoed back on the real search.
my $page = do_post();

# Do the search, restricted to applications raised on the requested day
$page = do_post($page, {
    "DetailedSearch\$TextBox_DateRaisedFrom" => $date,
    "DetailedSearch\$TextBox_DateRaisedTo"   => $date,
    "DetailedSearch\$Button_DetailedSearch"  => "Search",
});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data. XML::Writer insists on a single document
# element, so everything is wrapped in one root.
# NOTE(review): the wrapper element names are reconstructed; confirm them
# against whatever consumes this feed.
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "West Dorset District Council");
$Writer->dataElement("authority_short_name", "West Dorset");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages, for as long as the page offers
# a "Next" link
while ($page->look_down("_tag" => "a", "id" => "MatchingApplications_ResultsNavigationTop_LinkButton_Next")) {
    # Fetch this page...
    $page = do_post_back($page, 'MatchingApplications$ResultsNavigationTop$LinkButton_Next', '');

    # ...and output the applications from it
    output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Fake up what the doPostBack javascript function in the page does...
#
# Arguments:
#   $previous - parsed tree of the page the post-back originates from
#   $target   - control name to report as the event target
#   $argument - value to report as the event argument
#
# Returns whatever do_post returns for the resulting request.
sub do_post_back {
    my ($previous, $target, $argument) = @_;

    # The form field wants ':' where the control name uses '$'
    $target =~ s/\$/:/g;

    my $args = {
        "__EVENTTARGET"   => $target,
        "__EVENTARGUMENT" => $argument,
    };

    return do_post($previous, $args);
}

# Make a POST request to $SearchURL.
#
# Arguments:
#   $previous - (optional) parsed tree of the previously fetched page; when
#               given, its __VIEWSTATE and __EVENTVALIDATION hidden inputs
#               are copied into the new request
#   $args     - (optional) hash reference of form fields to send; note that
#               the state fields are added to this hash in place
#
# Returns an HTML::TreeBuilder tree built from the response body.
# Dies with the HTTP status line if the request is not successful.
sub do_post {
    my ($previous, $args) = @_;
    $args ||= {};

    if (defined($previous)) {
        # Only copy a state field when the page actually carries it, so a
        # missing input does not turn into a method call on undef.
        for my $field ("__VIEWSTATE", "__EVENTVALIDATION") {
            my $input = $previous->look_down("_tag" => "input", "name" => $field);

            $args->{$field} = $input->attr("value") if defined($input);
        }
    }

    my $response = $UA->post($SearchURL, $args);

    die $response->status_line unless $response->is_success;

    return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page.
#
# Arguments:
#   $page - parsed tree of a search results page
#
# Writes one group of elements per result row to the file-level $Writer.
# Writes nothing when the page has no "searchresults" table.
sub output_applications {
    my ($page) = @_;

    # Find the result table
    my $table = $page->look_down("_tag" => "table", "class" => "searchresults");

    # No results means no results table
    return unless defined($table);

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr")) {
        my $class = $row->attr("class") || "";

        next if $class eq "searchresultsheader";

        my @cells = $row->look_down("_tag" => "td");

        # Skip rows that do not have the four columns read below
        next if @cells < 4;

        my $reference   = $cells[0]->as_trimmed_text;
        my $date        = $cells[1]->as_trimmed_text;
        my $address     = $cells[2]->as_trimmed_text;
        my $description = $cells[3]->as_trimmed_text;

        # Default to an empty postcode so an address without one does not
        # hand undef to the writer.
        my $postcode = "";

        if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) {
            $postcode = $1;
        }

        # Group each application's fields so records stay distinguishable.
        # NOTE(review): wrapper element name reconstructed; confirm against
        # the consumer of this feed.
        $Writer->startTag("application");
        $Writer->dataElement("council_reference", $reference);
        $Writer->dataElement("address", $address);
        $Writer->dataElement("postcode", $postcode);
        $Writer->dataElement("description", $description);
        $Writer->dataElement("info_url", $InfoURL . $reference);
        $Writer->dataElement("comment_url", $InfoURL . $reference);
        $Writer->dataElement("date_received", $date);
        $Writer->endTag("application");
    }

    return;
}


+ 88
- 0
trunk/python_scrapers/WestDorset.py View File

@@ -0,0 +1,88 @@
import urllib2
import urllib
import datetime
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# strftime pattern (day/month/four-digit year) used for the date fields
# submitted in the search form.
date_format = "%d/%m/%Y"

class WestDorsetParser:
    """Scraper for planning applications from West Dorset District Council.

    Fetches the search form, posts a search for a single day and collects
    each row of the results table as a PlanningApplication.
    """

    def __init__(self, *args):
        """Set up the authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "West Dorset District Council"
        self.authority_short_name = "West Dorset"

        # NOTE(review): both URLs are empty in this copy of the file and
        # must be filled in. info_url is used as a %-format template that
        # receives the council reference, so it needs one %s placeholder.
        self.base_url = ""
        self.info_url = ""

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications raised on the given date.

        day, month and year are integers. Returns the results container
        held by this parser, with one entry added per result row.
        """
        search_date = datetime.date(year, month, day)

        # Fetch the blank form first: the hidden ASP.NET state fields it
        # carries have to be echoed back with the search.
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        post_data = (
            ("__VIEWSTATE", get_soup.find("input", id="__VIEWSTATE")["value"]),
#            ("QuickSearchApplicationNumber$TextBox_ApplicationNumber", ""),
#            ("QuickSearchThisWeek$DropDownList_PastWeek", ""),
#            ("DetailedSearch$TextBox_PropertyNameNumber", ""),
#            ("DetailedSearch$Textbox_StreetName", ""),
#            ("DetailedSearch$Textbox_TownVillage", ""),
#            ("DetailedSearch$Textbox_Postcode", ""),
#            ("DetailedSearch$Textbox_Parish", ""),
#            ("DetailedSearch$Textbox_ApplicantSurname", ""),
#            ("DetailedSearch$TextBox_AgentName", ""),
            ("DetailedSearch$TextBox_DateRaisedFrom", search_date.strftime(date_format)),
            ("DetailedSearch$TextBox_DateRaisedTo", search_date.strftime(date_format)),
#            ("DetailedSearch$TextBox_DecisionFrom", "dd%2Fmm%2Fyyyy"),
#            ("DetailedSearch$TextBox_DecisionTo", "dd%2Fmm%2Fyyyy"),
            ("DetailedSearch$Button_DetailedSearch", "Search"),
            ("__EVENTVALIDATION", get_soup.find("input", id="__EVENTVALIDATION")["value"]),
            )

        # The response to the GET is a redirect. We'll need to post to the new url.
        post_response = urllib2.urlopen(get_response.url, urllib.urlencode(post_data))
        post_soup = BeautifulSoup(post_response.read())

        if not post_soup.find(text = re.compile("No matching record")):
            # The first row contains headers.
            trs = post_soup.find("table", {"class": "searchresults"}).findAll("tr")[1:]

            for tr in trs:
                application = PlanningApplication()

                # We can fill the date received in straight away from the date we searched for.
                application.date_received = search_date

                tds = tr.findAll("td")

                application.council_reference = tds[0].font.string.strip()
                application.address = tds[2].font.string.strip()
                application.postcode = getPostcodeFromText(application.address)
                application.description = tds[3].font.string.strip()

                # Set the info url and the comment url to be the same - can't get to the comment
                # one directly without javascript.
                application.info_url = self.info_url %(application.council_reference)
                application.comment_url = application.info_url

                # Record the application, otherwise the returned results stay empty.
                # NOTE(review): addApplication is assumed from PlanningUtils - confirm.
                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date via displayXML().

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (1 October 2008) and print the
    # result when the module is run directly.
    scraper = WestDorsetParser()
    print(scraper.getResults(1, 10, 2008))
