diff --git a/trunk/SitesToGenerate.csv b/trunk/SitesToGenerate.csv
index 8dc34fd..51a219e 100644
--- a/trunk/SitesToGenerate.csv
+++ b/trunk/SitesToGenerate.csv
@@ -129,7 +129,7 @@
"Cyngor Gwynedd Council","Gwynedd",,,,,,"http://www.gwynedd.gov.uk/swiftlg/apas/run/","SwiftLG","GwyneddParser",
"London Borough of Hackney","Hackney",,,,,,"http://www.hackney.gov.uk/servapps/","PlanningExplorer","HackneyParser",
"Halton Borough Council","Halton",,,,,,,"Halton","HaltonParser",
-"Hambleton District Council","Hambleton",,,,,,"http://planning.hambleton.gov.uk/publicaccess/tdc/","PublicAccess","PublicAccessParser",
+"Hambleton District Council","Hambleton",,,,,,"http://planning.hambleton.gov.uk/publicaccess/tdc/","PublicAccess","HambletonParser",
"London Borough of Hammersmith and Fulham","Hammersmith and Fulham",,,,,,"http://www.apps.lbhf.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",
"Hampshire County Council","Hampshire",,,,,,,"Hampshire","HampshireParser",
"Harborough District Council","Harborough",,,,,,"http://pa.harborough.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",
diff --git a/trunk/python_scrapers/PublicAccess.py b/trunk/python_scrapers/PublicAccess.py
index 5ba680d..15e6db6 100644
--- a/trunk/python_scrapers/PublicAccess.py
+++ b/trunk/python_scrapers/PublicAccess.py
@@ -16,20 +16,43 @@ cookie_jar = cookielib.CookieJar()
from PlanningUtils import fixNewlines, getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
-search_form_url_end = "DcApplication/application_searchform.aspx"
-search_results_url_end = "DcApplication/application_searchresults.aspx"
-comments_url_end = "DcApplication/application_comments_entryform.aspx"
-
def index_or_none(a_list, item):
"""
    Returns the index of item in a_list, or None if it isn't in the list.
    """
    return a_list.index(item) if item in a_list else None
-class PublicAccessParser:
+class PublicAccessParser(object):
"""This is the class which parses the PublicAccess search results page.
"""
+ search_form_url_end = "DcApplication/application_searchform.aspx"
+ search_results_url_end = "DcApplication/application_searchresults.aspx"
+ comments_url_end = "DcApplication/application_comments_entryform.aspx"
+
+ # For some sites (Hambleton, for example) we need to leave in the empty
+ # strings.
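+    # The %(...)d placeholders in the date fields are filled in per-request
+    # by get_data_1, using the dict built by get_replacement_dict.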
+ data_template = (
+ ("searchtype", "ADV"),
+ ("caseNo", ""),
+ ("PPReference", ""),
+ ("AltReference", ""),
+ ("srchtype", ""),
+ ("srchstatus", ""),
+ ("srchdecision", ""),
+ ("srchapstatus", ""),
+ ("srchappealdecision", ""),
+ ("srchwardcode", ""),
+ ("srchparishcode", ""),
+ ("srchagentdetails", ""),
+ ("srchDateReceivedStart", "%(day)02d/%(month)02d/%(year)04d"),
+ ("srchDateReceivedEnd", "%(day)02d/%(month)02d/%(year)04d"),
+ ("srchDateValidStart", ""),
+ ("srchDateValidEnd", ""),
+ ("srchDateCommitteeStart", ""),
+ ("srchDateCommitteeEnd", ""),
+ )
+
def __init__(self,
authority_name,
authority_short_name,
@@ -45,73 +68,70 @@ class PublicAccessParser:
# The object which stores our set of planning application results
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+ def fetch_setting_cookie(self, url, data=None):
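+        """Fetch url (POSTing data if it is given), sending any cookies we
+        already hold and storing any new ones in the shared cookie jar."""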
+ request = urllib2.Request(url, data)
+ cookie_jar.add_cookie_header(request)
+ response = urllib2.urlopen(request)
+ cookie_jar.extract_cookies(response, request)
+ return response
- def getResultsByDayMonthYear(self, day, month, year):
- search_date = datetime.date(year, month, day)
-
-        # First download the search form (in order to get a session cookie)
- search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
- search_form_response = urllib2.urlopen(search_form_request)
-
- cookie_jar.extract_cookies(search_form_response, search_form_request)
-
-
- # We are only doing this first search in order to get a cookie
- # The paging on the site doesn't work with cookies turned off.
-
- search_data1 = urllib.urlencode({"searchType":"ADV",
- "caseNo":"",
- "PPReference":"",
- "AltReference":"",
- "srchtype":"",
- "srchstatus":"",
- "srchdecision":"",
- "srchapstatus":"",
- "srchappealdecision":"",
- "srchwardcode":"",
- "srchparishcode":"",
- "srchagentdetails":"",
- "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year},
- "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} })
+ def get_search_page(self):
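+        """GET the advanced search form; the response sets our session cookie."""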
+ return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_form_url_end))
- if self.debug:
- print search_data1
+ def get_response_1(self, data):
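+        """POST the search data and return the results page."""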
+ return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_results_url_end), data)
+# def get_data_2(self, day, month, year):
- search_url = urlparse.urljoin(self.base_url, search_results_url_end)
- request1 = urllib2.Request(search_url, search_data1)
- cookie_jar.add_cookie_header(request1)
- response1 = urllib2.urlopen(request1)
+# search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
- # This search is the one we will actually use.
- # a maximum of 100 results are returned on this site,
- # hence setting "pagesize" to 100. I doubt there will ever
- # be more than 100 in one day in PublicAccess...
- # "currentpage" = 1 gets us to the first page of results
- # (there will only be one anyway, as we are asking for 100 results...)
+# if self.debug:
+# print search_data2
-#http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3
+# return search_data2
- search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
+# def get_response_2(self, data):
+# # This time we want to do a get request, so add the search data into the url
+# url = urlparse.urljoin(self.base_url, self.search_results_url_end + "?" + data)
+# return self.fetch_setting_cookie(url)
- if self.debug:
- print search_data2
+ def get_data_1(self, replacement_dict):
+        # urllib.urlencode needs a real sequence of pairs rather than a
+        # generator, so we'd best make it a tuple...
+        data_tuple = tuple((key, value % replacement_dict) for (key, value) in self.data_template)
- # This time we want to do a get request, so add the search data into the url
- request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)
+ data = urllib.urlencode(data_tuple)
+ return data
+
+ def get_replacement_dict(self, day, month, year, search_response):
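+        # search_response is unused here; subclasses (HambletonParser, for
+        # example) override this method and scrape extra values out of it.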
+ return {"day": day,
+ "month": month,
+ "year": year}
+
+ def get_useful_response(self, day, month, year):
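+        """Run the search for a single day and return the results-page response."""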
+        # We fetch the search form mainly for its session cookie; subclasses
+        # may also scrape values out of the response.
+ search_response = self.get_search_page()
- request2 = urllib2.Request(request2_url)
+ replacement_dict = self.get_replacement_dict(day, month, year, search_response)
+ data = self.get_data_1(replacement_dict)
- # add the cookie we stored from our first search
- cookie_jar.add_cookie_header(request2)
+ return self.get_response_1(data)
- response2 = urllib2.urlopen(request2)
+ def get_contents(self, day, month, year):
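+        """Fetch the results page and return its contents, passed through fixNewlines."""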
+ useful_response = self.get_useful_response(day, month, year)
- contents = fixNewlines(response2.read())
+ contents = fixNewlines(useful_response.read())
if self.debug:
print contents
+ return contents
+
+ def getResultsByDayMonthYear(self, day, month, year):
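+        # datetime.date raises ValueError here if we're handed an
+        # impossible day/month/year combination.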
+ search_date = datetime.date(year, month, day)
+
+ contents = self.get_contents(day, month, year)
+
soup = BeautifulSoup.BeautifulSoup(contents)
results_table = soup.find("table", {"class": "cResultsForm"})
@@ -126,7 +146,7 @@ class PublicAccessParser:
address_col = headings.index("Address")
description_col = headings.index("Proposal")
- comments_url = urlparse.urljoin(self.base_url, comments_url_end)
+ comments_url = urlparse.urljoin(self.base_url, self.comments_url_end)
for tr in results_table.findAll("tr")[1:]:
@@ -157,23 +177,41 @@ class PublicAccessParser:
def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+class HambletonParser(PublicAccessParser):
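+    # Hambleton's search form includes an extra hidden field, s8fid, whose
+    # value we have to scrape from the search form page itself.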
+ data_template = PublicAccessParser.data_template + (("s8fid", "%(s8fid)s"),)
+
+ def get_replacement_dict(self, day, month, year, search_response):
+ replacement_dict = super(HambletonParser, self).get_replacement_dict(day, month, year, search_response)
+
+        # We need the value of the hidden s8fid input from this page.
+        # BeautifulSoup doesn't cope with its markup, so we'll have to use a regex.
+        search_contents = search_response.read()
+
+        # e.g. <input type="hidden" name="s8fid" value="..." />
+        s8fid_re = re.compile(r'<input[^>]*name="s8fid" value="(\d*)" />')
+
+        replacement_dict["s8fid"] = s8fid_re.search(search_contents).group(1)
+
+ return replacement_dict
+
+
if __name__ == '__main__':
- day = 20
- month = 12
- year = 2008
-
- #parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
- parser = PublicAccessParser("Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/tdc/", False)
- #parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True)
- #parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
- #parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
+ day = 11
+ month = 6
+ year = 2009
+
+# parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
+# parser = PublicAccessParser("Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/tdc/", True)
+# parser = HambletonParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True)
+# parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
+# parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
# parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/")
# parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
# parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/")
# parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/")
# parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/")
# parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/")
-# parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
-# parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/")
+ parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
+# parser = PublicAccessParser("Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", debug=True)
print parser.getResults(day, month, year)