diff --git a/trunk/python_scrapers/PublicAccess.py b/trunk/python_scrapers/PublicAccess.py
index 823d0cf..19403df 100644
--- a/trunk/python_scrapers/PublicAccess.py
+++ b/trunk/python_scrapers/PublicAccess.py
@@ -1,9 +1,12 @@
#!/usr/local/bin/python
import urllib, urllib2
-import HTMLParser
+
import urlparse
-import datetime, time
+import datetime
+import re
+
+import BeautifulSoup
import cookielib
@@ -17,7 +20,13 @@ search_form_url_end = "DcApplication/application_searchform.aspx"
search_results_url_end = "DcApplication/application_searchresults.aspx"
comments_url_end = "DcApplication/application_comments_entryform.aspx"
-class PublicAccessParser(HTMLParser.HTMLParser):
+def index_or_none(a_list, item):
+    """
+    Returns the index of item in a_list, or None if it isn't in the list.
+    """
+    if item in a_list:
+        return a_list.index(item)
+    return None
+
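+# A rough sketch of the intended behaviour (hypothetical headings list,
+# purely for illustration):
+#
+#     >>> index_or_none(["Application Ref.", "Address", "Proposal"], "Address")
+#     1
+#     >>> index_or_none(["Application Ref.", "Address", "Proposal"], "Case Number") is None
+#     True
+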
+class PublicAccessParser:
"""This is the class which parses the PublicAccess search results page.
"""
@@ -27,183 +36,19 @@ class PublicAccessParser(HTMLParser.HTMLParser):
base_url,
debug=False):
- HTMLParser.HTMLParser.__init__(self)
-
self.authority_name = authority_name
self.authority_short_name = authority_short_name
self.base_url = base_url
self.debug = debug
- # this will change to True when we enter the table of results
- self._in_results_table = False
-
- # this will be set to True when we have passed the header row
- # in the results table
- self._past_header_row = False
-
- # this will be true when we are in a <td> in the results table
- self._in_td = False
-
- # For each row, this will say how many tds we have seen so far
- self._td_count = 0
-
# The object which stores our set of planning application results
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
- # This will store the planning application we are currently working on.
- self._current_application = None
-
- def handle_starttag(self, tag, attrs):
- if tag == "table":
- self.handle_start_table(attrs)
- # we are only interested in tr tags if we are in the results table
- elif self._in_results_table and tag == "tr":
- self.handle_start_tr(attrs)
- # we are only interested in td tags if we are in the results table
- elif self._in_results_table and tag == "td":
- self.handle_start_td(attrs)
- # we are only interested in <a> tags if we are in the 6th td in
- # the results table.
- # UPDATE: It seems that, in the case of Chiltern, we are interested in
- # td 5.
- elif self._in_td and (self._td_count == 5 or self._td_count == 6) and tag == "a":
- self.handle_start_a(attrs)
- # If the tag is not one of these then we aren't interested
-
- def handle_endtag(self, tag):
- # we only need to consider end tags if we are in the results table
- if self._in_results_table:
- if tag == "table":
- self.handle_end_table()
- if tag == "tr":
- self.handle_end_tr()
- if tag == "td":
- self.handle_end_td()
-
- def handle_start_table(self, attrs):
- for attr,value in attrs:
- if attr == "class":
- if value == "cResultsForm":
- self._in_results_table = True
- break
-
- def handle_end_table(self):
- # If we see an end table tag, then note that we have left the
- # results table. This method is only called if we are in that table.
- self._in_results_table = False
-
-
- def handle_start_tr(self, attrs):
- # The first tr we meet in the results table is just headers
- # We will set a flag at the end of that tr to avoid creating
- # a blank PlanningApplication
- if self._past_header_row:
- # Create a candidate result object
- self._current_application = PlanningApplication()
- self._td_count = 0
-
- def handle_end_tr(self):
- # If we are in the results table, and not finishing the header row
- # append the current result to the results list.
- if self._past_header_row:
- self._results.addApplication(self._current_application)
- else:
- # The first row of the results table is headers
- # We want to do nothing until after it
- self._past_header_row = True
-
- def handle_start_td(self, attrs):
- # increase the td count by one
- self._td_count += 1
-
- # note that we are now in a td
- self._in_td = True
-
- def handle_end_td(self):
- # note that we are now not in a td
- self._in_td = False
-
- def handle_start_a(self, attrs):
- # this method is only getting called if we are in the
- # 6th td of a non-header row of the results table.
-
- # go through the attributes of the <a> tag looking for one
- # named 'href'
- for attr,value in attrs:
- if attr == "href":
- # the value of this tag is a relative url.
- # parse it so we can get the query string from it
- parsed_info_url = urlparse.urlparse(value)
-
- # the 4th part of the tuple is the query string
- query_string = parsed_info_url[4]
-
- # join this query string to the search URL, and store this as
- # the info URL of the current planning application
- self._current_application.info_url = urlparse.urljoin(self.base_url, value)
-
- # Join this query string to the comments URL, and store this as
- # the comments URL of the current planning application
- comments_url = urlparse.urljoin(self.base_url, comments_url_end)
- self._current_application.comment_url = "?".join([comments_url, query_string])
-
- # while we're here, let's follow some links to find the postcode...
- # the postcode is in an input tag in the property page. This page
- # can be found by following the info url.
- # The newlines in the info page need fixing.
- info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read())
-
- info_file_parser = PublicAccessInfoPageParser()
- info_file_parser.feed(info_file_contents)
-
- property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url)
-
- # the newlines in this page need fixing
- property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())
-
- property_file_parser = PublicAccessPropertyPageParser()
- property_file_parser.feed(property_file_contents)
-
- # Set the postcode on the current planning application from the
- # one found on the property page
- if property_file_parser.postcode is not None:
- self._current_application.postcode = property_file_parser.postcode
- else:
- # If there is no postcode in here, then we'll have to make do with regexing one out of the address.
- self._current_application.postcode = getPostcodeFromText(self._current_application.address)
-
- # There is no need for us to look at any more attributes.
- break
-
-
- def handle_data(self, data):
- if self._in_td:
- # The first td contains the reference
- if self._td_count == 1:
- self._current_application.council_reference = data
-
- # The second td contains the date the application was received
- elif self._td_count == 2:
- year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
- received_date = datetime.date(year, month, day)
-
- self._current_application.date_received = received_date
-
- # The third td contains the address
- elif self._td_count == 3:
- #data = data.replace("^M","\n")
- self._current_application.address = data
-
- # The fourth td contains the description
- elif self._td_count == 4:
- self._current_application.description = data
- # 5 is status - we don't need it.
- # 6 is a button - this is where we will get our postcode,
- # comment_url, and info_url from (when handling the <a> tag).
-
def getResultsByDayMonthYear(self, day, month, year):
+ search_date = datetime.date(year, month, day)
+
# First download the search form (in order to get a session cookie)
search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
search_form_response = urllib2.urlopen(search_form_request)
@@ -267,91 +112,54 @@ class PublicAccessParser(HTMLParser.HTMLParser):
if self.debug:
print contents
- self.feed(contents)
-
- return self._results
-
-
- def getResults(self, day, month, year):
- return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+ soup = BeautifulSoup.BeautifulSoup(contents)
+ results_table = soup.find("table", {"class": "cResultsForm"})
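+ # The results page is assumed to look roughly like this (a sketch of the
+ # general shape, not markup copied from any particular council's site):
+ #
+ #   <table class="cResultsForm">
+ #     <tr><th>Application Ref.</th><th>Address</th><th>Proposal</th>...</tr>
+ #     <tr><td>08/00123/FUL</td><td>...</td><td>...</td><td><a href="...">...</a></td></tr>
+ #   </table>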
+ # First, we work out what column each thing of interest is in from the headings
+ headings = [x.string for x in results_table.findAll("th")]
+ # A valid column index of 0 is falsy, so these lookups can't simply be
+ # chained together with "or".
+ ref_col = index_or_none(headings, "Application Ref.")
+ if ref_col is None:
+     ref_col = index_or_none(headings, "Case Number")
+ if ref_col is None:
+     ref_col = index_or_none(headings, "Application Number")
-class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
- """A parser to get the URL for the property details page out of the
- info page (this url is needed in order to get the postcode of the
- application).
- """
+ address_col = headings.index("Address")
+ description_col = headings.index("Proposal")
- def __init__(self):
- HTMLParser.HTMLParser.__init__(self)
+ comments_url = urlparse.urljoin(self.base_url, comments_url_end)
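+ # e.g. (made-up host, purely for illustration): with
+ # base_url = "http://planning.example.gov.uk/publicaccess/tdc/", this gives
+ # ".../publicaccess/tdc/DcApplication/application_comments_entryform.aspx"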
- self.property_page_url = None
- def handle_starttag(self, tag, attrs):
- """The URL of the property details page is contained in an tag in
- an attribute with key 'A_btnPropertyDetails'. There is some garbage on
- either side of it which we will have to clear up before storing it...
+ for tr in results_table.findAll("tr")[1:]:
+ application = PlanningApplication()
- We go through the <a> tags looking for one with an attribute with
- key 'id' and value 'A_btnPropertyDetails'. When we find it we go through
- its attributes looking for one with key 'href' - the value of this attribute
- contains the URL we want, after a bit of tidying up.
+ application.date_received = search_date
- Once we have got the URL, there is no need for us to look at any more tags.
- """
- if tag == "a" and self.property_page_url is None:
-
- #print attrs
- if attrs.count(("id","A_btnPropertyDetails")) > 0:
- for attr,value in attrs:
- if attr == "href":
- the_link = value
+ tds = tr.findAll(re.compile("t[dh]"))
- # this may have some garbage on either side of it...
- # let's strip that off
+ application.council_reference = tds[ref_col].string.strip()
+ application.address = tds[address_col].string.strip()
+ application.description = tds[description_col].string.strip()
- # If the stripping fails, take the whole link
+ application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
- # the garbage on the left is separated by whitespace.
- # the garbage on the right is separated by a "'".
- try:
- self.property_page_url = the_link.split()[1].split("'")[0]
- except IndexError:
- self.property_page_url = the_link
+ # We need the query string from this url to make the comments_url
+ query_string = urlparse.urlsplit(application.info_url)[3]
+ # This is probably slightly naughty, but I'm just going to add the querystring
+ # on to the end manually
+ application.comment_url = "%s?%s" %(comments_url, query_string)
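+ # e.g. (hypothetical href, purely for illustration): if the row's link is
+ # "application_detailview.aspx?caseno=ABC123", then info_url resolves that
+ # against base_url, urlsplit(...)[3] recovers "caseno=ABC123", and
+ # comment_url becomes comments_url + "?caseno=ABC123".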
-class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
- """A parser to get the postcode out of the property details page."""
- def __init__(self):
- HTMLParser.HTMLParser.__init__(self)
+ self._results.addApplication(application)
- self.postcode = None
-
- def handle_starttag(self, tag, attrs):
- """The postcode is contained in an tag.
- This tag has an attribute 'name' with value postcode.
- It also has an attribute 'value' with value the postcode of this application.
-
- We go through the input tags looking for one with an attribute with
- key 'name' and value 'postcode'. When we find one,
- we look through its attributes for one with key 'value' - we store the value of this
- attribute as self.postcode.
+ return self._results
- Once we have the postcode, there is no need to look at any more input tags.
- """
-
- if tag == "input" and self.postcode is None:
- if attrs.count(("name","postcode")) > 0:
- for attr,value in attrs:
- if attr == "value":
- self.postcode = value
+ def getResults(self, day, month, year):
+ return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
day = 20
- month = 11
+ month = 12
year = 2008
#parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
@@ -360,6 +168,12 @@ if __name__ == '__main__':
#parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
#parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
# parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/")
- parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
+# parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
+# parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/")
+# parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/")
+# parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/")
+# parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/")
+# parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
+ parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/")
print parser.getResults(day, month, year)
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index aceefcf..8de830b 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -8,7 +8,7 @@
"Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
-"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://pa.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -69,7 +69,7 @@
"Denbighshire County Council", "Denbighshire", "http://planning.denbighshire.gov.uk/", "ApplicationSearchServletParser", "DenbighshireSearchParser"
"Wear Valley District Council", "Wear Valley", "http://planning.wearvalley.gov.uk/", "ApplicationSearchServletParser", "WearValleySearchParser"
"Chorley Borough Council", "Chorley", "http://planning.chorley.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
-"Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/TDC/", "PublicAccess", "PublicAccessParser"
+"Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"London Borough Of Newham", "Newham", "http://pacaps.newham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"North West Leicestershire District Council", "NW Leicestershire", "http://paccess.nwleics.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Redditch Borough Council", "Redditch", "http://access.redditchbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -103,7 +103,7 @@
"New Forest National Park", "New Forest NP", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BoltonLikeParser"
"Bridgnorth District Council", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BridgnorthParser"
"Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/PlanData/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
-"Newcastle City Council", "Newcastle", "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Newcastle City Council", "Newcastle", "http://gisccs013.newcastle.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"North Wiltshire District Council", "North Wiltshire", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser"
"Renfrewshire Council", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BoltonLikeParser"
@@ -131,7 +131,7 @@
"Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Hinckley and Bosworth Borough Council", "Hinckley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
-"Argyl And Bute Council", "Argyl and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Argyll And Bute Council", "Argyll and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Havant Borough Council", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Rochford District Council", "Rochford", "http://www.rochford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -275,3 +275,30 @@
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"
"Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser"
"Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser"
+"Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Bedford Borough Council", "Bedford", "http://www.publicaccess.bedford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Bradford Metropolitan District Council", "Bradford", "http://www.planning4bradford.com/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Cambridge City Council", "Cambridge", "http://www.cambridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Dartford Borough Council", "Dartford", "http://publicaccess.dartford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"East Cambridgeshire District Council", "East Cambridgeshire", "http://pa.eastcambs.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"East Riding of Yorkshire Council", "East Riding", "http://www.eastriding.gov.uk/PublicAccess731c/tdc/", "PublicAccess", "PublicAccessParser"
+"Gloucester City Council", "Gloucester", "http://www.glcstrplnng11.co.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Horsham District Council", "Horsham", "http://publicaccess.horsham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"London Borough of Lambeth", "Lambeth", "http://planning.lambeth.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Leeds City Council", "Leeds", "http://planningapplications.leeds.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Mid Sussex District Council", "Mid Sussex", "http://dc.midsussex.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"North East Derbyshire District Council", "North East Derbyshire", "http://planapps-online.ne-derbyshire.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Norwich City Council", "Norwich", "http://publicaccess.norwich.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Richmondshire District Council", "Richmondshire", "http://publicaccess.richmondshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Salford City Council", "Salford", "http://publicaccess.salford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Sandwell Metropolitan Borough Council", "Sandwell", "http://webcaps.sandwell.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Scottish Borders Council", "Scottish Borders", "http://eplanning.scotborders.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Stafford Borough Council", "Stafford", "http://www3.staffordbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Swindon Borough Council", "Swindon", "http://194.73.99.13/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Three Rivers District Council", "Three Rivers", "http://www2.threerivers.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Torridge District Council", "Torridge", "http://www.torridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Tunbridge Wells Borough Council", "Tunbridge Wells", "http://secure.tunbridgewells.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Wakefield Metropolitan District Council", "Wakefield", "http://planning.wakefield.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"West Wiltshire District Council", "West Wiltshire", "http://planning.westwiltshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Worthing Borough Council", "Worthing", "http://planning.worthing.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Wycombe District Council", "Wycombe", "http://planningpa.wycombe.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"