From bf9599d38c7d444cf28f2f76f346c3de327e2c80 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Sun, 8 Jun 2008 17:01:30 +0000
Subject: [PATCH] Adding Planet scraper (not quite finished, but this time I don't want to lose it!)

---
 python_scrapers/AcolnetParser.py |   2 +-
 python_scrapers/Planet.py        | 191 +++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 1 deletion(-)
 create mode 100644 python_scrapers/Planet.py

diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py
index 02e609b..8201213 100644
--- a/python_scrapers/AcolnetParser.py
+++ b/python_scrapers/AcolnetParser.py
@@ -359,6 +359,6 @@ if __name__ == '__main__':
     #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
     #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
     #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
-    #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
+    parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
 
     print parser.getResults(day, month, year)
diff --git a/python_scrapers/Planet.py b/python_scrapers/Planet.py
new file mode 100644
index 0000000..1c34ef8
--- /dev/null
+++ b/python_scrapers/Planet.py
@@ -0,0 +1,191 @@
+import urllib2
+import urllib
+import urlparse
+
+from cgi import parse_qs
+
+import datetime
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class PlanetParser:
+    """Scraper for the planning application search of a Planet site.
+
+    base_url is the url of the council's search form. Its query string
+    must carry the council's service key.
+    """
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.debug = debug
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def _getServiceKey(self):
+        """Return the service key from the query string of base_url.
+
+        The parameter name is matched case-insensitively, because the
+        councils do not agree on it: most use "serviceKey", but North
+        Lincolnshire uses "ServiceKey". Raises KeyError if it is missing.
+        """
+        query_string = urlparse.urlsplit(self.base_url)[3]
+
+        for name, values in parse_qs(query_string).items():
+            if name.lower() == "servicekey":
+                return values[0]
+
+        raise KeyError("serviceKey")
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Collect the applications received on the given date.
+
+        day, month and year are integers. Each application found is added
+        to this parser's PlanningAuthorityResults, which is returned.
+        """
+        # What is the serviceKey for this council?
+        # It's in our base url
+        service_key = self._getServiceKey()
+
+        # First get the form
+        get_request = urllib2.Request(self.base_url)
+        get_response = urllib2.urlopen(get_request)
+
+        cookie_jar.extract_cookies(get_response, get_request)
+
+        # We also need to get the security token
+        get_soup = BeautifulSoup(get_response.read())
+
+        security_token = get_soup.find('input', {'name': 'securityToken'})['value']
+
+        # Now post to it
+        search_date = datetime.date(year, month, day)
+
+        search_data = urllib.urlencode(
+            {
+                "serviceKey":service_key,
+                "securityToken": security_token,
+                "STEP":"Planet_SearchCriteria",
+                #X.resultCount=
+                "X.pageNumber": "0",
+                "X.searchCriteria_StartDate": search_date.strftime(date_format),
+                "X.searchCriteria_EndDate": search_date.strftime(date_format),
+                }
+            )
+
+        post_request = urllib2.Request(self.base_url, search_data)
+        cookie_jar.add_cookie_header(post_request)
+
+        post_response = urllib2.urlopen(post_request)
+
+        post_soup = BeautifulSoup(post_response.read())
+
+        # Now we need to find the results. We'll do this by searching for
+        # the text "Ref No" and then going forward from there. For some
+        # reason a search for the table gets the table without contents
+        ref_no_text = post_soup.find(text="Ref No")
+
+        if ref_no_text is None:
+            # There is no results table on the page (presumably nothing
+            # was received on this date), so there is nothing to add.
+            return self._results
+
+        first_tr = ref_no_text.findNext("tr")
+
+        if first_tr is None:
+            return self._results
+
+        other_trs = first_tr.findNextSiblings()
+
+        trs = [first_tr] + other_trs
+
+        for tr in trs:
+            tds = tr.findAll("td")
+
+            if len(tds) < 3 or tds[0].a is None:
+                # Not an application row (it lacks the reference link,
+                # address and description cells), so skip it.
+                continue
+
+            self._current_application = PlanningApplication()
+
+            # We don't need to get the date, it's the one we searched for.
+            self._current_application.date_received = search_date
+
+            self._current_application.council_reference = tds[0].a.string.strip()
+            self._current_application.address = tds[1].string.strip()
+            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
+
+            self._current_application.description = tds[2].string.strip()
+
+            # There is no good info url, so we just give the search page.
+            self._current_application.info_url = self.base_url
+
+            # Similarly for the comment url
+            self._current_application.comment_url = self.base_url
+
+            self._results.addApplication(self._current_application)
+
+        return self._results
+
+
+# post data for worcester
+# hopefully we can ignore almost all of this...
+
+#ACTION=NEXT
+#serviceKey=SysDoc-PlanetApplicationEnquiry
+#serviceGeneration=
+#securityToken=NTgxMjE3OTExMjA4OQ%3D%3D
+#enquiry=
+#STEP=Planet_SearchCriteria
+#RECEIVED=
+#COMMENTS=
+#LAST_UPDATED=
+#status=
+#X.endEnquiry=
+#X.resultCount=
+#X.applicationNotFound=
+#X.pageNumber=0
+#X.searchCriteria_ApplicationReference=
+#X.searchCriteria_StartDate=20%2F04%2F2008
+#X.searchCriteria_EndDate=20%2F04%2F2008
+#X.searchCriteria_Ward=
+#X.searchCriteria_Parish=
+#X.searchCriteria_Address=
+#X.searchCriteria_Postcode=
+#X.searchCriteria_ApplicantName=
+#X.searchCriteria_AgentName=
+#X.searchCriteria_UndecidedApplications=
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the given date.
+
+        day, month and year are converted with int() before the search.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+if __name__ == '__main__':
+#    parser = PlanetParser("Worcester City Council", "Worcester", "http://www.worcester.gov.uk:8080/planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry", debug=True)
+#    parser = PlanetParser("Elmbridge Borough Council", "Elmbridge", "http://www2.elmbridge.gov.uk/Planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry")
+#    parser = PlanetParser("North Lincolnshire Council", "North Lincolnshire", "http://www.planning.northlincs.gov.uk/planet/ispforms.asp?ServiceKey=SysDoc-PlanetApplicationEnquiry")
+#    parser = PlanetParser("Rydale District Council", "Rydale", "http://www.ryedale.gov.uk/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry")
+    parser = PlanetParser("Tewkesbury Borough Council", "Tewkesbury", "http://planning.tewkesbury.gov.uk/Planet/ispforms.asp?serviceKey=07WCC04163103430")
+    print parser.getResults(21,5,2008)