From 906d3856954d6e43da874b64c57d88df10f38b31 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Sun, 2 Dec 2007 15:44:20 +0000
Subject: [PATCH] Add scrapers for the SwiftLG sites (apart from Daventry,
 which is not responding today).

We already have scrapers for Islington and East Herts, but I'm adding the
new ones as well - we can then see if we get the same results :-)
---
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/PlanningExplorer.py  |   5 +-
 python_scrapers/SitesToGenerate.csv  |  14 +-
 python_scrapers/SwiftLG.py           | 225 +++++++++++++++++++++++++++
 4 files changed, 242 insertions(+), 3 deletions(-)
 create mode 100644 python_scrapers/SwiftLG.py

diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 3b6cbc9..a34556b 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -14,3 +14,4 @@
 "RutlandLike.py", "420"
 "AtriumePlanning.py", "420"
 "PlanningExplorer.py", "420"
+"SwiftLG.py", "420"
diff --git a/python_scrapers/PlanningExplorer.py b/python_scrapers/PlanningExplorer.py
index b96bd6e..11cb6a0 100644
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -180,6 +180,7 @@ class PlanningExplorerParser:
         headers = self._getHeaders()
 
         request = urllib2.Request(self.search_url, post_data, headers)
+
         post_response = urllib2.urlopen(request)
 
         # We have actually been returned here by an http302 object
@@ -546,14 +547,14 @@ if __name__ == '__main__':
     #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
     #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
     #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
-    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+    #parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
     #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
     #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
     #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
     #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
     #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
     #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
-    #parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
+    parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
     #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
     #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
     #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 1c3400e..39c1aa1 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -166,4 +166,16 @@
 "Tamworth Borough Council", "Tamworth", "http://80.1.64.77/", "PlanningExplorer", "TamworthParser"
 "Trafford Council", "Trafford", "http://planning.trafford.gov.uk/", "PlanningExplorer", "TraffordParser"
 "West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/", "PlanningExplorer", "WestOxfordshireParser"
-
+"Dudley Metropolitan Borough Council", "Dudley", "http://www2.dudley.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
+"East Hertfordshire District Council", "East Herts", "http://e-services.eastherts.gov.uk/swiftlg/apas/run/", "SwiftLG", "EastHertsParser"
+"London Borough of Islington", "Islington", "https://www.islington.gov.uk/onlineplanning/apas/run/", "SwiftLG", "IslingtonParser"
+"Cyngor Gwynedd Council", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/", "SwiftLG", "GwyneddParser"
+"Lake District National Park Authority", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/", "SwiftLG", "LakeDistrictParser"
+"Macclesfield Borough Council", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/", "SwiftLG", "MacclesfieldParser"
+"Maidstone Borough Council", "Maidstone", "http://digitalmaidstone.co.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
+"Mole Valley District Council", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/", "SwiftLG", "MoleValleyParser"
+"Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
+"Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
+"Slough Borough Council", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/", "SwiftLG", "SloughParser"
+"Snowdonia National Park Authority", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
+"St Edmundsbury Borough Council", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser"
diff --git a/python_scrapers/SwiftLG.py b/python_scrapers/SwiftLG.py
new file mode 100644
index 0000000..59fddcb
--- /dev/null
+++ b/python_scrapers/SwiftLG.py
@@ -0,0 +1,225 @@
+
+import urllib2
+import urllib
+import urlparse
+import cgi
+import re
+import datetime
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import getPostcodeFromText, \
+    PlanningAuthorityResults, \
+    PlanningApplication
+
+# - Browser request: --------------------------
+# {POST http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0} {Host: digitalmaidstone.co.uk
+# Accept: text/html, text/plain, text/css, text/sgml, */*;q=0.01
+# Accept-Encoding: gzip
+# Accept-Language: en
+# Pragma: no-cache
+# Cache-Control: no-cache
+# User-Agent: Lynx/2.8.6rel.4 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.6.3
+# Content-type: application/x-www-form-urlencoded
+# Content-length: 638
+# } %25.MAINBODY.WPACIS.1.=&APNID.MAINBODY.WPACIS.1.=&JUSTLOCATION.MAINBODY.WPACIS.1.=&JUSTDEVDESC.MAINBODY.WPACIS.1.=&DEVDESC.MAINBODY.WPACIS.1.=&SURNAME.MAINBODY.WPACIS.1.=&REGFROMDATE.MAINBODY.WPACIS.1.=01%2F11%2F2007&REGTODATE.MAINBODY.WPACIS.1.=02%2F11%2F2007&DECFROMDATE.MAINBODY.WPACIS.1.=&DECTODATE.MAINBODY.WPACIS.1.=&FINALGRANTFROM.MAINBODY.WPACIS.1.=&FINALGRANTTO.MAINBODY.WPACIS.1.=&APELDGDATFROM.MAINBODY.WPACIS.1.=&APELDGDATTO.MAINBODY.WPACIS.1.=&APEDECDATFROM.MAINBODY.WPACIS.1.=&APEDECDATTO.MAINBODY.WPACIS.1.=&AREA.MAINBODY.WPACIS.1.=&WARD.MAINBODY.WPACIS.1.=&PARISH.MAINBODY.WPACIS.1.=&SEARCHBUTTON.MAINBODY.WPACIS.1.=Search
+# server=[digitalmaidstone.co.uk] , port=[80], script=[/swiftlg/apas/run/WPHAPPCRITERIA]
+# request_line=[POST /swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0]
+
+# second page
+#http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=243941&
+#StartIndex=11&
+#SortOrder=APNID:asc&
+#DispResultsAs=WPHAPPSEARCHRES&
+#BackURL=Search%20Criteria
+
+# Date format to enter into search boxes
+date_format = "%d/%m/%Y"
+
+class SwiftLGParser:
+    search_path = "WPHAPPCRITERIA"
+    info_path = "WPHAPPDETAIL.DisplayUrl?theApnID=%s"
+    comment_path = "wphmakerep.displayURL?ApnID=%s"
+
+    def _findResultsTable(self, soup):
+        """Return the results table. Unless the results table is the only
+        table on the page, override this in a subclass."""
+        return soup.table
+
+    def _findTRs(self, results_table):
+        """The usual situation is for the results table to contain
+        one row of headers, followed by a row per app.
+        If this is not the case, override this in a subclass."""
+        return results_table.findAll("tr")[1:]
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.search_url = urlparse.urljoin(base_url, self.search_path)
+        self.info_url = urlparse.urljoin(base_url, self.info_path)
+        self.comment_url = urlparse.urljoin(base_url, self.comment_path)
+
+        self.debug = debug
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        post_data = urllib.urlencode((
+            ("REGFROMDATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
+            ("REGTODATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
+            ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
+            ))
+
+        response = urllib2.urlopen(self.search_url, post_data)
+        contents = response.read()
+
+        # Check for the no results warning
+        if not contents.count("No Matching Applications Found"):
+            soup = BeautifulSoup(contents)
+
+            # Get the links to later pages of results.
+            later_pages = soup.findAll("a", {"href": re.compile("WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")})
+
+            for a in ["initial_search"] + later_pages:
+                if a != "initial_search":
+                    url = a['href']
+
+                    # Example url
+
+                    #http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=Search Criteria
+
+                    # urllib2 doesn't like this url; to make it happy, we'll
+                    # get rid of the BackURL parameter, which we don't need.
+
+                    split_url = urlparse.urlsplit(url)
+                    qs = split_url[3]
+
+                    # This gets us a list of (key, value) pairs
+                    qsl = cgi.parse_qsl(qs)
+
+                    # Get rid of BackURL, which is the last parameter
+                    qsl.pop(-1)
+
+                    # I think this is safe, as there are no repeats of parameters
+                    new_qs = urllib.urlencode(qsl)
+
+                    url = urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])
+
+                    this_page_url = urlparse.urljoin(self.base_url, url)
+                    response = urllib2.urlopen(this_page_url)
+                    contents = response.read()
+                    soup = BeautifulSoup(contents)
+
+                results_table = self._findResultsTable(soup)#.body.find("table", {"class": "apas_tbl"})
+
+                trs = self._findTRs(results_table)
+
+                for tr in trs:
+                    self._current_application = PlanningApplication()
+
+                    tds = tr.findAll("td")
+
+                    # The first td
+
+                    #Search Criteria > Search Criteria'>Search Results">07/1884
+
+                    # The html here is a bit of a mess, and doesn't all get into
+                    # the soup.
+                    # We can get the reference from the first <a> in td 0.
+                    first_link = tds[0].a['href']
+
+                    app_id = cgi.parse_qs(urlparse.urlsplit(first_link)[3])['theApnID'][0]
+
+                    self._current_application.date_received = search_date
+                    self._current_application.council_reference = app_id
+                    self._current_application.info_url = self.info_url %(app_id)
+                    self._current_application.comment_url = self.comment_url %(app_id)
+                    self._current_application.description = tds[1].string.strip()
+
+                    # the second td
+
+                    #
+                    #LAND ADJ. BRAMBLING, HAWKENBURY ROAD, HAWKENBURY, TN120EA
+                    #
+
+                    # For some reason, this doesn't work:
+                    #address = tds[2].string
+
+                    # But this does
+                    address = tds[2].input.next.strip()
+
+                    self._current_application.address = address
+                    self._current_application.postcode = getPostcodeFromText(address)
+
+                    self._results.addApplication(self._current_application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+class EastHertsParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.findAll("table")[3]
+
+class GwyneddParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.find("table", {"class": "thinBox"})
+
+class IslingtonParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.table.table
+
+class LakeDistrictParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.table.table
+
+class MacclesfieldParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.findAll("table")[6]
+
+class MoleValleyParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.findAll("table")[5]
+
+class SloughParser(SwiftLGParser):
+    def _findResultsTable(self, soup):
+        return soup.findAll("table")[1]
+
+    def _findTRs(self, results_table):
+        return results_table.findAll("tr")[2:]
+
+if __name__ == '__main__':
+#    parser = SwiftLGParser("Dudley", "Dudley", "http://www2.dudley.gov.uk/swiftlg/apas/run/")
+    parser = EastHertsParser("East Hertfordshire", "East Herts", "http://e-services.eastherts.gov.uk/swiftlg/apas/run/")
+#    parser = GwyneddParser("Gwynedd", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/")
+#    parser = IslingtonParser("Islington", "Islington", "https://www.islington.gov.uk/onlineplanning/apas/run/")
+#    parser = LakeDistrictParser("Lake District", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/")
+#    parser = SwiftLGParser("Maidstone Borough Council", "Maidstone", "http://digitalmaidstone.co.uk/swiftlg/apas/run/")
+#    parser = MoleValleyParser("Mole Valley", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/")
+#    parser = SwiftLGParser("Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/")
+#    parser = SwiftLGParser("Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/")
+#    parser = SloughParser("Slough", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/")
+#    parser = SwiftLGParser("Snowdonia National Park", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/")
+#    parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
+#    parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
+    print parser.getResults(22,11,2007)


# To Do:

#1) Check out comment url on Maidstone

#2) Daventry, when it is back up.

#3) Work out what goes wrong with Gwynedd on 06/11/2007
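
Note (not part of the patch): the BackURL-stripping step in getResultsByDayMonthYear can be
tried on its own. A minimal Python 2 sketch, using the example results URL quoted in the
comments above; the specific ResultID value is only illustrative:

# Sketch only: drop the BackURL parameter from a SwiftLG results URL,
# mirroring the query-string handling in SwiftLGParser.getResultsByDayMonthYear.
import cgi
import urllib
import urlparse

url = ("WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11"
       "&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=Search Criteria")

split_url = urlparse.urlsplit(url)
qsl = cgi.parse_qsl(split_url[3])  # list of (key, value) pairs
qsl.pop(-1)                        # BackURL is the last parameter
new_qs = urllib.urlencode(qsl)
print urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])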