From e6082ec032aa6785fb671fd6cc5cc8ca25a00821 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Sun, 21 Jun 2009 11:58:20 +0000
Subject: [PATCH] Bit of a refactor of PublicAccess in order to fix Hambleton.

---
 SitesToGenerate.csv             |    2 +-
 python_scrapers/PublicAccess.py |  172 +++++++++++++++++++-------------
 2 files changed, 106 insertions(+), 68 deletions(-)

diff --git a/SitesToGenerate.csv b/SitesToGenerate.csv
index 8dc34fd..51a219e 100644
--- a/SitesToGenerate.csv
+++ b/SitesToGenerate.csv
@@ -129,7 +129,7 @@
 "Cyngor Gwynedd Council","Gwynedd",,,,,,"http://www.gwynedd.gov.uk/swiftlg/apas/run/","SwiftLG","GwyneddParser",
 "London Borough of Hackney","Hackney",,,,,,"http://www.hackney.gov.uk/servapps/","PlanningExplorer","HackneyParser",
 "Halton Borough Council","Halton",,,,,,,"Halton","HaltonParser",
-"Hambleton District Council","Hambleton",,,,,,"http://planning.hambleton.gov.uk/publicaccess/tdc/","PublicAccess","PublicAccessParser",
+"Hambleton District Council","Hambleton",,,,,,"http://planning.hambleton.gov.uk/publicaccess/tdc/","PublicAccess","HambletonParser",
 "London Borough of Hammersmith and Fulham","Hammersmith and Fulham",,,,,,"http://www.apps.lbhf.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",
 "Hampshire County Council","Hampshire",,,,,,,"Hampshire","HampshireParser",
 "Harborough District Council","Harborough",,,,,,"http://pa.harborough.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",
diff --git a/python_scrapers/PublicAccess.py b/python_scrapers/PublicAccess.py
index 5ba680d..15e6db6 100644
--- a/python_scrapers/PublicAccess.py
+++ b/python_scrapers/PublicAccess.py
@@ -16,20 +16,43 @@ cookie_jar = cookielib.CookieJar()
 
 from PlanningUtils import fixNewlines, getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
 
-search_form_url_end = "DcApplication/application_searchform.aspx"
-search_results_url_end = "DcApplication/application_searchresults.aspx"
-comments_url_end = "DcApplication/application_comments_entryform.aspx"
-
 def index_or_none(a_list, item):
     """
     Returns the index of item in a_list, or None, if it isn't in the list.
     """
     return a_list.count(item) and a_list.index(item)
 
-class PublicAccessParser:
+class PublicAccessParser(object):
     """This is the class which parses the PublicAccess search results page.
     """
 
+    search_form_url_end = "DcApplication/application_searchform.aspx"
+    search_results_url_end = "DcApplication/application_searchresults.aspx"
+    comments_url_end = "DcApplication/application_comments_entryform.aspx"
+
+    # For some sites (Hambleton, for example) we need to leave in the empty
+    # strings.
+    data_template = (
+        ("searchtype", "ADV"),
+        ("caseNo", ""),
+        ("PPReference", ""),
+        ("AltReference", ""),
+        ("srchtype", ""),
+        ("srchstatus", ""),
+        ("srchdecision", ""),
+        ("srchapstatus", ""),
+        ("srchappealdecision", ""),
+        ("srchwardcode", ""),
+        ("srchparishcode", ""),
+        ("srchagentdetails", ""),
+        ("srchDateReceivedStart", "%(day)02d/%(month)02d/%(year)04d"),
+        ("srchDateReceivedEnd", "%(day)02d/%(month)02d/%(year)04d"),
+        ("srchDateValidStart", ""),
+        ("srchDateValidEnd", ""),
+        ("srchDateCommitteeStart", ""),
+        ("srchDateCommitteeEnd", ""),
+        )
+
     def __init__(self,
                  authority_name,
                  authority_short_name,
@@ -45,73 +68,70 @@ class PublicAccessParser:
         # The object which stores our set of planning application results
         self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
 
+    def fetch_setting_cookie(self, url, data=None):
+        request = urllib2.Request(url, data)
+        cookie_jar.add_cookie_header(request)
+        response = urllib2.urlopen(request)
+        cookie_jar.extract_cookies(response, request)
+        return response
 
-    def getResultsByDayMonthYear(self, day, month, year):
-        search_date = datetime.date(year, month, day)
-
-        # First download the search form (in order to get a session cookie
-        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
-        search_form_response = urllib2.urlopen(search_form_request)
-
-        cookie_jar.extract_cookies(search_form_response, search_form_request)
-
-
-        # We are only doing this first search in order to get a cookie
-        # The paging on the site doesn't work with cookies turned off.
-
-        search_data1 = urllib.urlencode({"searchType":"ADV",
-                                         "caseNo":"",
-                                         "PPReference":"",
-                                         "AltReference":"",
-                                         "srchtype":"",
-                                         "srchstatus":"",
-                                         "srchdecision":"",
-                                         "srchapstatus":"",
-                                         "srchappealdecision":"",
-                                         "srchwardcode":"",
-                                         "srchparishcode":"",
-                                         "srchagentdetails":"",
-                                         "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year},
-                                         "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} })
+    def get_search_page(self):
+        return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_form_url_end))
 
-        if self.debug:
-            print search_data1
+    def get_response_1(self, data):
+        return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_results_url_end), data)
 
+#    def get_data_2(self, day, month, year):
-        search_url = urlparse.urljoin(self.base_url, search_results_url_end)
-        request1 = urllib2.Request(search_url, search_data1)
-        cookie_jar.add_cookie_header(request1)
-        response1 = urllib2.urlopen(request1)
+#        search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
 
-        # This search is the one we will actually use.
-        # a maximum of 100 results are returned on this site,
-        # hence setting "pagesize" to 100. I doubt there will ever
-        # be more than 100 in one day in PublicAccess...
-        # "currentpage" = 1 gets us to the first page of results
-        # (there will only be one anyway, as we are asking for 100 results...)
+
+#        if self.debug:
+#            print search_data2
 
-#http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3
 
+#        return search_data2
 
-        search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
 
+#    def get_response_2(self, data):
+#        # This time we want to do a get request, so add the search data into the url
+#        url = urlparse.urljoin(self.base_url, self.search_results_url_end + "?" + data)
+#        return self.fetch_setting_cookie(url)
 
-        if self.debug:
-            print search_data2
 
+    def get_data_1(self, replacement_dict):
+        # It seems urllib.urlencode isn't happy with the generator here,
+        # so we'd best make it a tuple...
+        data_tuple = tuple(((key, value %replacement_dict) for (key, value) in self.data_template))
 
-        # This time we want to do a get request, so add the search data into the url
-        request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)
 
+        data = urllib.urlencode(data_tuple)
+        return data
+
+    def get_replacement_dict(self, day, month, year, search_response):
+        return {"day": day,
+                "month": month,
+                "year": year}
+
+    def get_useful_response(self, day, month, year):
+        # We're only doing this to get a cookie
+        search_response = self.get_search_page()
 
-        request2 = urllib2.Request(request2_url)
+        replacement_dict = self.get_replacement_dict(day, month, year, search_response)
+        data = self.get_data_1(replacement_dict)
 
-        # add the cookie we stored from our first search
-        cookie_jar.add_cookie_header(request2)
+        return self.get_response_1(data)
 
-        response2 = urllib2.urlopen(request2)
+    def get_contents(self, day, month, year):
+        useful_response = self.get_useful_response(day, month, year)
 
-        contents = fixNewlines(response2.read())
+        contents = fixNewlines(useful_response.read())
 
         if self.debug:
             print contents
 
+        return contents
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        contents = self.get_contents(day, month, year)
+
         soup = BeautifulSoup.BeautifulSoup(contents)
 
         results_table = soup.find("table", {"class": "cResultsForm"})
 
@@ -126,7 +146,7 @@ class PublicAccessParser:
         address_col = headings.index("Address")
         description_col = headings.index("Proposal")
 
-        comments_url = urlparse.urljoin(self.base_url, comments_url_end)
+        comments_url = urlparse.urljoin(self.base_url, self.comments_url_end)
 
         for tr in results_table.findAll("tr")[1:]:
 
@@ -157,23 +177,41 @@ class PublicAccessParser:
     def getResults(self, day, month, year):
         return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
 
+class HambletonParser(PublicAccessParser):
+    data_template = PublicAccessParser.data_template + (("s8fid", "%(s8fid)s"),)
+
+    def get_replacement_dict(self, day, month, year, search_response):
+        replacement_dict = super(HambletonParser, self).get_replacement_dict(day, month, year, search_response)
+
+        # We need an input s8fid from this.
+        # BeautifulSoup doesn't like it, so we'll have to use a regex.
+        search_contents = search_response.read()
+
+        # <input type="hidden" name="s8fid" value="..." />
+        s8fid_re = re.compile('<input[^>]*name="s8fid" value="(\d*)" />')
+
+        replacement_dict["s8fid"] = s8fid_re.search(search_contents).groups()[0]
+
+        return replacement_dict
+
+
 if __name__ == '__main__':
-    day = 20
-    month = 12
-    year = 2008
-
-    #parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
-    parser = PublicAccessParser("Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/tdc/", False)
-    #parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True)
-    #parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
-    #parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
+    day = 11
+    month = 6
+    year = 2009
+
+# parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
+# parser = PublicAccessParser("Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/tdc/", True)
+# parser = HambletonParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True)
+# parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
+# parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
 # parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/")
 # parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
 # parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/")
 # parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/")
 # parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/")
 # parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/")
-# parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
-# parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/")
+    parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
+# parser = PublicAccessParser("Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", debug=True)
 
     print parser.getResults(day, month, year)
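A note on fetch_setting_cookie(): it hand-rolls what urllib2's HTTPCookieProcessor would otherwise do, namely attach the cookies we already hold to the outgoing request and then harvest any Set-Cookie headers from the response. A minimal standalone sketch, assuming only the Python 2 standard library (the URL below is a placeholder, not one of the scraped sites):

    import cookielib
    import urllib2

    cookie_jar = cookielib.CookieJar()

    def fetch_setting_cookie(url, data=None):
        # Attach any cookies we already hold, then harvest the
        # Set-Cookie headers from the response.
        request = urllib2.Request(url, data)
        cookie_jar.add_cookie_header(request)
        response = urllib2.urlopen(request)
        cookie_jar.extract_cookies(response, request)
        return response

    # The same effect, with the stdlib doing the bookkeeping:
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    response = opener.open("http://example.com/publicaccess/tdc/")  # placeholder URL

Keeping the manual version does make the cookie handling explicit at each call site, which matters here because, as the deleted comment notes, the site's paging doesn't work with cookies turned off.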
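The data_template/get_replacement_dict split is a small template-method pattern: the base class declares the POST fields as (name, format string) pairs, and a subclass can extend both the field list and the values substituted into it. A toy sketch of the mechanics, under illustrative names (Base and ExtraField are not from the codebase, and the s8fid value is made up):

    import urllib

    class Base(object):
        data_template = (
            ("searchtype", "ADV"),
            ("srchDateReceivedStart", "%(day)02d/%(month)02d/%(year)04d"),
            )

        def get_replacement_dict(self, day, month, year):
            return {"day": day, "month": month, "year": year}

        def get_data(self, day, month, year):
            replacement_dict = self.get_replacement_dict(day, month, year)
            # urllib.urlencode wants a real sequence, not a generator,
            # hence the tuple() -- the same point the commit's comment makes.
            return urllib.urlencode(tuple((key, value % replacement_dict)
                                          for (key, value) in self.data_template))

    class ExtraField(Base):
        # Mirrors HambletonParser: one extra field, one extra value.
        data_template = Base.data_template + (("s8fid", "%(s8fid)s"),)

        def get_replacement_dict(self, day, month, year):
            d = super(ExtraField, self).get_replacement_dict(day, month, year)
            d["s8fid"] = "12345"  # Hambleton scrapes this from the search page
            return d

    print ExtraField().get_data(11, 6, 2009)
    # searchtype=ADV&srchDateReceivedStart=11%2F06%2F2009&s8fid=12345

This is how HambletonParser adds s8fid without copying the whole field list, and it is also why the empty-string fields stay in the template: some sites (the comment names Hambleton) need them present in the POST data.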
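The s8fid fix itself just scrapes a session-specific hidden input out of the search form and posts it back with the query. A self-contained sketch against a canned fragment (the form fragment and value are made up; the live value varies per session):

    import re

    # A made-up fragment standing in for Hambleton's search form.
    search_contents = '''
    <form action="application_searchresults.aspx" method="post">
    <input type="hidden" name="s8fid" value="112233445566" />
    </form>
    '''

    s8fid_re = re.compile('<input[^>]*name="s8fid" value="(\d*)" />')
    print s8fid_re.search(search_contents).groups()[0]  # 112233445566

As the comment in the patch says, BeautifulSoup doesn't cope with the page, hence the regex rather than a proper parse.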