Browse Source

Add perl scraper from Peter for Newport.

Add scrapers for Merton and Conwy (PlanningExplorer)
import/raw
duncan.parkes 16 years ago
parent
commit
65719ee6ad
5 changed files with 248 additions and 67 deletions
  1. +75
    -0
      trunk/python_scrapers/Newport.cgi
  2. +66
    -0
      trunk/python_scrapers/Newport.pl
  3. +2
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  4. +103
    -67
      trunk/python_scrapers/PlanningExplorer.py
  5. +2
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 75
- 0
trunk/python_scrapers/Newport.cgi View File

@@ -0,0 +1,75 @@
#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use File::Temp qw(tempfile);
use LWP::Simple;
use POSIX;
use Encode;
use CGI;
use CGI::Carp;

# Validate a CGI parameter that must be a plain non-negative integer.
#
# Argument: the raw parameter value (may be undef when the parameter is
# missing from the request).
# Returns 1 when the value is defined and consists solely of ASCII digits,
# 0 otherwise.
sub sanity_check {
    my ($var) = @_;

    return 0 unless defined $var;

    # \A and \z anchor to the true start and end of the string.  The earlier
    # ^...$ form let "$" match just before a trailing newline, so a value
    # such as "12\n" was wrongly accepted.
    return $var =~ /\A[0-9]+\z/ ? 1 : 0;
}

# Send an empty (but well-formed) Newport result document and abort.
#
# Arguments: year, month and day of the requested list, followed by a short
# reason string.
# Prints the "Content-type: text/xml" header and an XML document whose
# <applications> element is empty to STDOUT, then dies with the message
# "<year>/<month>/<day> failed: <reason>\n".  The trailing newline stops
# Perl from appending the file name and line number to the message.
sub no_results {
    my ($yyyy, $mm, $dd, $why) = @_;

    # The document contains nothing to interpolate, so a literal heredoc
    # yields exactly the same bytes.
    my $empty_document = <<'EMPTY_RESULT';
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>Newport City Council</authority_name>
<authority_short_name>Newport</authority_short_name>
<applications>
</applications>
</planning>
EMPTY_RESULT

    print $empty_document;

    die "$yyyy/$mm/$dd failed: $why\n";
}

# ---------------------------------------------------------------------------
# CGI entry point.
#
# Expects numeric "year", "month" and "day" request parameters.  Fetches the
# Newport planning index page, finds the paragraph announcing the list for
# that date, downloads the PDF it links to into a temporary file and hands
# that file to ./Newport.pl, which writes the result XML to STDOUT.
#
# When the date or its link cannot be found, no_results() sends an empty
# result document and dies with the reason.
# ---------------------------------------------------------------------------

my $cgi = CGI->new;

my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $tree = HTML::TreeBuilder->new;
# $tree->parse_file('weekly-planning-bw-lists.htm');
$tree->parse(decode_utf8(get('http://www.newport.gov.uk/_dc/index.cfm?fuseaction=planapps.applist') or die "couldn't fetch index page"));
$tree->eof;

# The index page labels each list "List(s) for dd/mm/yyyy".  The parameters
# are digits only (checked above), so interpolating them into the pattern
# is safe.
my $re = sprintf('Lists?\s+for %02d/%02d/%04d', $day, $month, $year);

my ($day_p) = $tree->look_down(
    "_tag", "p",
    sub { $_[0]->as_text =~ /$re/i }
);
$day_p or no_results($year, $month, $day, "Cannot find day paragraph");

my ($day_link) = $day_p->find_by_tag_name("a");
$day_link or no_results($year, $month, $day, "Cannot find day link");

# NOTE(review): the href is used as-is, so this assumes the page publishes
# absolute URLs - confirm against the live site.
my $day_absurl = $day_link->attr('href');
defined $day_absurl or no_results($year, $month, $day, "Day link has no href");

# Download before creating the temporary file, so that a failed fetch
# neither leaves a stray file behind nor feeds an empty PDF to the converter.
my $pdf = get($day_absurl);
defined $pdf or no_results($year, $month, $day, "Cannot fetch $day_absurl");

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # PDF is binary data
print {$fh} $pdf or die "cannot write temporary file $filename: $!";
close($fh) or die "cannot close temporary file $filename: $!";

print "Content-type: text/xml\n\n";
my $status = system "./Newport.pl", $filename, $day_absurl;

# Remove the temporary file before reporting a converter failure; dying
# first would leak the file.
unlink $filename or die "cannot unlink temporary file $filename: $!";

# system() returns the child's wait status, which is also what $? holds.
# The earlier message interpolated $| (the autoflush flag) by mistake.
$status == 0 or die "system failed: $status";

+ 66
- 0
trunk/python_scrapers/Newport.pl View File

@@ -0,0 +1,66 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;

# ---------------------------------------------------------------------------
# Convert a Newport planning-list PDF into the planning XML format.
#
# Usage: Newport.pl <pdf-file> <info-url>
#
# The PDF is run through "pdftotext -layout" and its text is scanned for
# records that start with an "App No:" line.  One <application> element is
# written to STDOUT per record; <info-url> is copied into every record's
# <info_url>.
# ---------------------------------------------------------------------------

@ARGV >= 2 or die "usage: $0 <pdf-file> <info-url>\n";

my $file     = $ARGV[0];
my $info_url = $ARGV[1];

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "Newport City Council");
$writer->dataElement("authority_short_name", "Newport");
$writer->startTag("applications");

# List-form pipe open: no shell is involved, so $file needs no quoting.
open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    if ($line =~ /^\s*App No:\s*(\S+)/) {
        my $refno = $1;
        my $address = "";
        my $proposal = "";
        my $date_received;
        my $cur_field;    # reference to the field that continuation lines extend
        my $near_end;     # set once the "Case Officer:" line has been read

        # Consume the lines of this record.  The first pass handles the
        # "App No:" line itself; each pass ends by reading the next line.
        while (1) {
            chomp $line;
            $line =~ s/^\s+//; $line =~ s/\s+$//;
            if ($line =~ s/^ApplicationSite://) {
                $cur_field = \$address;
            } elsif ($line =~ s/^Proposal://) {
                $cur_field = \$proposal;
            } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^App Type://)) {
                # Labels whose text is not collected.
                $cur_field = undef;
            } elsif ($line =~ /^Date Registered:\s*(\S+)/) {
                $date_received = $1;
                $cur_field = undef;
            }
            # Drop the whitespace that followed a label just removed above.
            $line =~ s/^\s+//;
            if (defined $cur_field) {
                # Join wrapped lines with a single space.
                $$cur_field .= " " if $$cur_field ne "" and $line ne "";
                $$cur_field .= $line;
            }
            last unless defined ($line = <$fh>);
            # The record ends at the first empty line (just "\n") after
            # the "Case Officer:" line.
            last if $near_end && length $line == 1;
            $near_end = 1 if $line =~ /^\s*Case Officer:/;
        }

        # First thing in the address that looks like a UK postcode, or the
        # literal string "None".
        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", 'planning@newport.gov.uk');
        # A record without a "Date Registered:" line gets an empty element
        # instead of passing undef to the writer.
        $writer->dataElement("date_received", defined $date_received ? $date_received : "");
        $writer->endTag;
    }
}

# Closing a pipe waits for the child and reports its exit status.  Only
# warn, so that the XML document is still completed and stays well-formed.
unless (close $fh) {
    warn $! ? "error closing pdftotext pipe: $!\n"
            : "pdftotext exited with status $?\n";
}

$writer->endTag;
$writer->endTag;
$writer->end;

+ 2
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -37,3 +37,5 @@
"Glasgow.cgi", "493"
"Highland.pl", "493"
"Highland.cgi", "493"
"Newport.pl", "493"
"Newport.cgi", "493"

+ 103
- 67
trunk/python_scrapers/PlanningExplorer.py View File

@@ -60,6 +60,10 @@ class PlanningExplorerParser:
address_td_no = 1
description_td_no = 2

# In some cases we won't be able to get the full address/description/postcode without getting the info page for each app.
# If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
fetch_info_page = False

def _modify_response(self, response):
"""For most sites, we have managed to get all the apps on a
single page by choosing the right parameters.
@@ -129,7 +133,20 @@ class PlanningExplorerParser:
return post_data

def _getPostCode(self):

def _getAddress(self, tds, info_soup):
# If this td contains a div, then the address is the
# string in there - otherwise, use the string in the td.
address_td = tds[self.address_td_no]
if address_td.div is not None:
address = address_td.div.string
else:
address = address_td.string
return address


def _getPostCode(self, info_soup):
"""In most cases, the postcode can be got from the address in
the results table. Some councils put the address there without the
postcode. In this case we will have to go to the info page to get
@@ -138,6 +155,21 @@ class PlanningExplorerParser:

return getPostcodeFromText(self._current_application.address)
def _getDescription(self, tds, info_soup):
description_td = tds[self.description_td_no]
if description_td.div is not None:
# Mostly this is in a div
# Use the empty string if the description is missing
description = description_td.div.string or ""
else:
# But sometimes (eg Crewe) it is directly in the td.
# Use the empty string if the description is missing
description = description_td.string or ""

return description


def __init__(self,
authority_name,
authority_short_name,
@@ -214,53 +246,39 @@ class PlanningExplorerParser:

tds = tr.findAll("td")

for td_no in range(len(tds)):
if td_no == self.reference_td_no:
# This td contains the reference number and a link to details
self._current_application.council_reference = tds[td_no].a.string

relative_info_url = self._sanitiseInfoUrl(tds[td_no].a['href'])

self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

self._current_application.council_reference = tds[self.reference_td_no].a.string

# What about a comment url?
# There doesn't seem to be one, so we'll use the email address
if self.comments_email_address is not None:
# We're using the email address, as there doesn't seem
# to be a web form for comments
self._current_application.comment_url = self.comments_email_address
else:
# This link contains a code which we need for the comments url
# (on those sites that use it)
application_code = app_code_regex.search(relative_info_url).groups()[0]
relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

relative_comments_url = self.comments_path %(application_code)
self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
# Fetch the info page if we need it, otherwise set it to None

elif td_no == self.address_td_no:
# If this td contains a div, then the address is the
# string in there - otherwise, use the string in the td.
if tds[td_no].div is not None:
address = tds[td_no].div.string
else:
address = tds[td_no].string
if self.fetch_info_page:
# We need to quote the spaces in the info url
info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
info_soup = BeautifulSoup(urllib2.urlopen(info_request))
else:
info_soup = None

self._current_application.address = address
# What about a comment url?
# There doesn't seem to be one, so we'll use the email address
if self.comments_email_address is not None:
# We're using the email address, as there doesn't seem
# to be a web form for comments
self._current_application.comment_url = self.comments_email_address
else:
# This link contains a code which we need for the comments url
# (on those sites that use it)
application_code = app_code_regex.search(relative_info_url).groups()[0]

self._current_application.postcode = self._getPostCode()
relative_comments_url = self.comments_path %(application_code)
self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)

elif td_no == self.description_td_no:
if tds[td_no].div is not None:
# Mostly this is in a div
# Use the empty string if the description is missing
description = tds[td_no].div.string or ""
else:
# But sometimes (eg Crewe) it is directly in the td.
# Use the empty string if the description is missing
description = tds[td_no].string or ""

self._current_application.description = description
self._current_application.address = self._getAddress(tds, info_soup)
self._current_application.postcode = self._getPostCode(info_soup)
self._current_application.description = self._getDescription(tds, info_soup)

self._results.addApplication(self._current_application)

@@ -470,11 +488,17 @@ class LiverpoolParser(PlanningExplorerParser):

return ''.join(ws_re.split(url))

# FIXME - Merton needs to be done here when it is back up.

class MertonParser(PlanningExplorerParser):
# Merton variant: the address and the description are read from each
# application's info page instead of from the results table.
use_firefox_user_agent = True
# Makes the base parser download every application's info page and pass
# it to the _get* hooks as info_soup.
fetch_info_page = True

def _getAddress(self, tds, info_soup):
# Returns the stripped string of the next td after the "Site Address"
# text on the info page; tds (the results-table cells) is not used.
return info_soup.find(text="Site Address").findNext("td").string.strip()

def _getDescription(self, tds, info_soup):
# Returns the stripped string of the next td after the
# "Development Proposal" text on the info page; tds is not used.
return info_soup.find(text="Development Proposal").findNext("td").string.strip()


class ShrewsburyParser(PlanningExplorerParser):
use_firefox_user_agent = True

@@ -574,6 +598,16 @@ class WalthamForestParser(PlanningExplorerParser):
print post_data
return post_data

class ConwyParser(BroadlandLike, PlanningExplorerParser):
# Conwy County Borough Council: BroadlandLike behaviour with the paths of
# Conwy's English-language Planning Explorer installation.
# NOTE(review): search_url_path and info_url_path are presumably joined to
# the site base URL by the base class - confirm there.
search_url_path = "Northgate/planningexplorerenglish/generalsearch.aspx"
info_url_path = "Northgate/PlanningExplorerEnglish/Generic/"
# The %s is filled with the application code taken from the info URL when
# the comment URL is built.
comments_path = "Northgate/PlanningExplorerEnglish/PLComments.aspx?pk=%s"

use_firefox_user_agent = True


#&txtApplicationNumber=&txtProposal=&txtSiteAddress=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=10%2F07%2F2008&dateEnd=20%2F07%2F2008&edrDateSelection=&csbtnSearch=Search


#txtApplicantName=
#txtAgentName=
@@ -597,29 +631,31 @@ if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
#parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
#parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
#parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
#parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
#parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
#parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
#parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
#parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
#parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
#parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
#parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
#parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
#parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
#parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
#parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
#parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
#parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
#parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
#parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
#parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
#parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
print parser.getResults(18, 4, 2008)
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
# parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
# parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
# parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
# parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
# parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
# parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
# parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
# parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
# parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
# parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(3, 7, 2008)

# To Do



+ 2
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -243,3 +243,5 @@
"London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
"Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/", "PlanningExplorer", "ConwyParser"
"London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"

Loading…
Cancel
Save