From ddc81f06ea762201298b2bf86227f361571b7d8c Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Tue, 14 Oct 2008 09:46:56 +0000 Subject: [PATCH] Add scraper for Gosport. Factor out CookieAddingHTTPRedirectHandler. --- trunk/python_scrapers/Gosport.py | 100 +++++++++++++++++++++ trunk/python_scrapers/HTTPHandlers.py | 18 ++++ trunk/python_scrapers/Ocella.py | 13 +-- trunk/python_scrapers/OtherFilesToCopy.csv | 2 + trunk/python_scrapers/SitesToGenerate.csv | 1 + 5 files changed, 124 insertions(+), 10 deletions(-) create mode 100644 trunk/python_scrapers/Gosport.py create mode 100644 trunk/python_scrapers/HTTPHandlers.py diff --git a/trunk/python_scrapers/Gosport.py b/trunk/python_scrapers/Gosport.py new file mode 100644 index 0000000..14d48a0 --- /dev/null +++ b/trunk/python_scrapers/Gosport.py @@ -0,0 +1,100 @@ +import urllib2 +import urllib +import urlparse + +import datetime, time +import cgi + +import re + +import cookielib + +cookie_jar = cookielib.CookieJar() + + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import PlanningApplication, \ + PlanningAuthorityResults, \ + getPostcodeFromText + + +from HTTPHandlers import CookieAddingHTTPRedirectHandler + +cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar)) + + +search_date_format = "%m/%d/%Y" #That's right, the search date is US style. 
info_page_date_format = "%d/%m/%Y" # and the info page is UK style


class GosportParser:
    """Scraper for planning applications listed by Gosport Borough Council.

    Submits the council's ApplicationSearch2.aspx form with a "date
    submitted" value and turns each row of the results table into a
    PlanningApplication, collected in a PlanningAuthorityResults.
    """

    def __init__(self, *args):
        """Set up the authority names, the site URLs and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Gosport Borough Council"
        self.authority_short_name = "Gosport"

        self.base_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationSearch2.aspx"
        self.info_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationDetails.aspx?ID=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the applications submitted on the given date.

        day, month and year are integers. The applications found are added
        to self._results, which is also the return value. Nothing raised by
        the HTTP requests or by the HTML parsing is caught here.
        """
        search_date = datetime.date(year, month, day)

        # Load the search form first: the POST below needs the hidden
        # __VIEWSTATE and pgid values from it, and the cookies it sets.
        get_request = urllib2.Request(self.base_url)
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)

        get_soup = BeautifulSoup(get_response.read())

        # Only the "date submitted" search is filled in. The form's other
        # fields (tbDevAddress, tbApplicantName, tbAgentName,
        # tbDateDetermined) are not sent.
        post_data = (
            ("__VIEWSTATE", get_soup.find("input", {"name": "__VIEWSTATE"})["value"]),
            ("pgid", get_soup.find("input", {"name": "pgid"})["value"]),
            ("action", "Search"),
            # Use the requested date (US style, see search_date_format)
            # rather than a fixed one.
            ("ApplicationSearch21:tbDateSubmitted", search_date.strftime(search_date_format)),
            ("ApplicationSearch21:btnDateSubmitted", "Search"),
        )

        post_request = urllib2.Request(self.base_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)
        # The opener re-adds the cookie when the site answers with a redirect.
        post_response = cookie_handling_opener.open(post_request)

        post_soup = BeautifulSoup(post_response.read())

        results_table = post_soup.find("table", id="SearchResults1_dgSearchResults")
        if results_table is None:
            # NOTE(review): presumably the table is simply absent when there
            # are no applications for the date - confirm against the live site.
            return self._results

        # Discard the first <tr>, which contains headers
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.council_reference = tds[0].string.strip()
            application.address = tds[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[2].string.strip()

            received_text = tds[3].string.strip()
            application.date_received = datetime.datetime(*(time.strptime(received_text, info_page_date_format)[0:6]))
            application.info_url = self.info_url % (application.council_reference)

            # The comment url must be accessed by a POST, so we'll just use the info url for that as well
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given date.

        day, month and year may be strings or numbers; they are passed
        through int() before searching.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
    parser = GosportParser()
    print(parser.getResults(1, 10, 2008))


# ---- trunk/python_scrapers/HTTPHandlers.py (a separate module in this patch) ----

from urllib2 import HTTPRedirectHandler


class CookieAddingHTTPRedirectHandler(HTTPRedirectHandler):
    """The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""

    def __init__(self, cookie_jar):
        """Remember the cookie jar whose cookies are added to redirected requests."""
        self.cookie_jar = cookie_jar

        # This really ought to call the superclasses init method, but there doesn't seem to be one.

    def redirect_request(self, *args):
        """Build the follow-up request as the base class does, then add the cookie header.

        The arguments are passed straight through to
        HTTPRedirectHandler.redirect_request, and its result is returned.
        """
        new_request = HTTPRedirectHandler.redirect_request(self, *args)
        # We need to add a cookie from the cookie_jar
        if new_request is not None:
            self.cookie_jar.add_cookie_header(new_request)

        return new_request
This handler does.""" - def redirect_request(self, req, fp, code, msg, headers, newurl): - new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl) - # We need to add a cookie from the cookie_jar - cookie_jar.add_cookie_header(new_request) +from HTTPHandlers import CookieAddingHTTPRedirectHandler - return new_request - -cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler()) +cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar)) class OcellaParser: @@ -206,7 +199,7 @@ if __name__ == '__main__': # parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly") # parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") - parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL") parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv index f5819a5..eae8963 100644 --- a/trunk/python_scrapers/OtherFilesToCopy.csv +++ b/trunk/python_scrapers/OtherFilesToCopy.csv @@ -1,6 +1,7 @@ "filename", "permissions" "PublicAccess.py", "420" "PlanningUtils.py", "420" +"HTTPHandlers.py", "420" "SouthOxfordshireParser.py", "420" "SouthOxfordshire.cgi", 
"493" "ApplicationSearchServletParser.py", "420" @@ -58,3 +59,4 @@ "Herefordshire.py", "420" "Exmoor.py", "420" "Eastbourne.py", "420" +"Gosport.py", "420" diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 95bac87..0661ca7 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -264,3 +264,4 @@ "Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser" "Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser" "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"