diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 3b6cbc9..a34556b 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -14,3 +14,4 @@ "RutlandLike.py", "420" "AtriumePlanning.py", "420" "PlanningExplorer.py", "420" +"SwiftLG.py", "420" diff --git a/python_scrapers/PlanningExplorer.py b/python_scrapers/PlanningExplorer.py index b96bd6e..11cb6a0 100644 --- a/python_scrapers/PlanningExplorer.py +++ b/python_scrapers/PlanningExplorer.py @@ -180,6 +180,7 @@ class PlanningExplorerParser: headers = self._getHeaders() request = urllib2.Request(self.search_url, post_data, headers) + post_response = urllib2.urlopen(request) # We have actually been returned here by an http302 object @@ -546,14 +547,14 @@ if __name__ == '__main__': #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") - parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") + #parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/") #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/") #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/") - 
#parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/") + parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/") #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/") #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/") #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/") diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 1c3400e..39c1aa1 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -166,4 +166,16 @@ "Tamworth Borough Council", "Tamworth", "http://80.1.64.77/", "PlanningExplorer", "TamworthParser" "Trafford Council", "Trafford", "http://planning.trafford.gov.uk/", "PlanningExplorer", "TraffordParser" "West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/", "PlanningExplorer", "WestOxfordshireParser" - +"Dudley Metropolitan Borough Council", "Dudley", "http://www2.dudley.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"East Hertfordshire District Council", "East Herts", "http://e-services.eastherts.gov.uk/swiftlg/apas/run/", "SwiftLG", "EastHertsParser" +"London Borough of Islington", "Islington", "https://www.islington.gov.uk/onlineplanning/apas/run/", "SwiftLG", "IslingtonParser" +"Cyngor Gwynedd Council", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/", "SwiftLG", "GwyneddParser" +"Lake District National Park Authority", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/", "SwiftLG", "LakeDistrictParser" +"Macclesfield Borough Council", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/", "SwiftLG", "MacclesfieldParser" +"Maidstone Borough Council", "Maidstone", 
"http://digitalmaidstone.co.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Mole Valley District Council", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/", "SwiftLG", "MoleValleyParser" +"Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Slough Borough Council", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/", "SwiftLG", "SloughParser" +"Snowdonia National Park Authority", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"St Edmundsbury Borough Council", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" diff --git a/python_scrapers/SwiftLG.py b/python_scrapers/SwiftLG.py new file mode 100644 index 0000000..59fddcb --- /dev/null +++ b/python_scrapers/SwiftLG.py @@ -0,0 +1,225 @@ + +import urllib2 +import urllib +import urlparse +import cgi +import re +import datetime + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import getPostcodeFromText, \ + PlanningAuthorityResults, \ + PlanningApplication + +# - Browser request: -------------------------- +# {POST http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0} {Host: digitalmaidstone.co.uk +# Accept: text/html, text/plain, text/css, text/sgml, */*;q=0.01 +# Accept-Encoding: gzip +# Accept-Language: en +# Pragma: no-cache +# Cache-Control: no-cache +# User-Agent: Lynx/2.8.6rel.4 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.6.3 +# Content-type: application/x-www-form-urlencoded +# Content-length: 638 +# } 
# Example POST body (abbreviated) sent to WPHAPPCRITERIA — the form has many
# fields, but only the registration-date range and the search button need
# values; everything else may be left empty:
#   REGFROMDATE.MAINBODY.WPACIS.1.=01%2F11%2F2007&
#   REGTODATE.MAINBODY.WPACIS.1.=02%2F11%2F2007&
#   SEARCHBUTTON.MAINBODY.WPACIS.1.=Search

# Later pages of results are fetched from urls like:
# http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?
#     ResultID=243941&StartIndex=11&SortOrder=APNID:asc&
#     DispResultsAs=WPHAPPSEARCHRES&BackURL=Search%20Criteria

# Date format to enter into the search boxes
date_format = "%d/%m/%Y"

class SwiftLGParser:
    """Scraper for SwiftLG-based planning application sites.

    Fetches the applications registered on a given day and returns them
    as a PlanningAuthorityResults object.  Councils differ only in page
    layout, so subclasses override _findResultsTable (and occasionally
    _findTRs) to locate the results table.
    """

    search_path = "WPHAPPCRITERIA"
    info_path = "WPHAPPDETAIL.DisplayUrl?theApnID=%s"
    comment_path = "wphmakerep.displayURL?ApnID=%s"

    def _findResultsTable(self, soup):
        """Unless there is just one table in the page (the results table),
        override this in a subclass."""
        return soup.table

    def _findTRs(self, results_table):
        """The usual situation is for the results table to contain
        one row of headers, followed by a row per app.
        If this is not the case, override this in a subclass."""
        return results_table.findAll("tr")[1:]

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        # Absolute urls for the search form, the per-application detail
        # page, and the per-application comment page.
        self.search_url = urlparse.urljoin(base_url, self.search_path)
        self.info_url = urlparse.urljoin(base_url, self.info_path)
        self.comment_url = urlparse.urljoin(base_url, self.comment_path)

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name,
                                                 self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """POST a single-day registration-date search and scrape every
        page of results.  Returns a PlanningAuthorityResults object."""
        search_date = datetime.date(year, month, day)

        post_data = urllib.urlencode((
            ("REGFROMDATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
            ("REGTODATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
            ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
            ))

        response = urllib2.urlopen(self.search_url, post_data)
        contents = response.read()

        # Check for the no results warning
        if not contents.count("No Matching Applications Found"):
            soup = BeautifulSoup(contents)

            # Get the links to later pages of results.
            later_pages = soup.findAll(
                "a",
                {"href": re.compile(r"WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")})

            # The first iteration scrapes the page we already have; each
            # later iteration fetches one "next page" link first.
            for a in ["initial_search"] + later_pages:
                if a != "initial_search":
                    url = a['href']

                    # Example url:
                    # http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=Search Criteria

                    # urllib2 doesn't like this url (BackURL contains an
                    # unescaped space); we don't need BackURL, so strip it
                    # from the query string.
                    split_url = urlparse.urlsplit(url)
                    qs = split_url[3]

                    # A list of (key, value) pairs.
                    qsl = cgi.parse_qsl(qs)

                    # Drop BackURL wherever it appears rather than assuming
                    # it is the last parameter (popping the last entry
                    # would remove the wrong parameter if the site ever
                    # reorders the query string).
                    qsl = [(key, value) for (key, value) in qsl
                           if key != "BackURL"]

                    # This is safe, as there are no repeats of parameters
                    new_qs = urllib.urlencode(qsl)

                    url = urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])

                    this_page_url = urlparse.urljoin(self.base_url, url)
                    response = urllib2.urlopen(this_page_url)
                    contents = response.read()
                    soup = BeautifulSoup(contents)

                results_table = self._findResultsTable(soup)

                trs = self._findTRs(results_table)

                for tr in trs:
                    self._current_application = PlanningApplication()

                    tds = tr.findAll("td")

                    # The html in the first td is a bit of a mess, and
                    # doesn't all get into the soup.  We can get the
                    # council reference from the theApnID parameter of the
                    # first link in td 0.
                    first_link = tds[0].a['href']

                    app_id = cgi.parse_qs(urlparse.urlsplit(first_link)[3])['theApnID'][0]

                    self._current_application.date_received = search_date
                    self._current_application.council_reference = app_id
                    self._current_application.info_url = self.info_url % (app_id)
                    self._current_application.comment_url = self.comment_url % (app_id)
                    self._current_application.description = tds[1].string.strip()

                    # For some reason, tds[2].string doesn't work, but
                    # navigating via the <input> inside the td does.
                    address = tds[2].input.next.strip()

                    self._current_application.address = address
                    self._current_application.postcode = getPostcodeFromText(address)

                    self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: scrape one day's results and return XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class EastHertsParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.findAll("table")[3]

class GwyneddParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.find("table", {"class": "thinBox"})

class IslingtonParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.table.table

class LakeDistrictParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.table.table

class MacclesfieldParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.findAll("table")[6]

class MoleValleyParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.findAll("table")[5]

class SloughParser(SwiftLGParser):
    def _findResultsTable(self, soup):
        return soup.findAll("table")[1]

    def _findTRs(self, results_table):
        # Slough's table has two header rows before the data rows.
        return results_table.findAll("tr")[2:]

if __name__ == '__main__':
#    parser = SwiftLGParser("Dudley", "Dudley", "http://www2.dudley.gov.uk/swiftlg/apas/run/")
    parser = EastHertsParser("East Hertfordshire", "East Herts", "http://e-services.eastherts.gov.uk/swiftlg/apas/run/")
#    parser = GwyneddParser("Gwynedd", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/")
#    parser = IslingtonParser("Islington", "Islington", "https://www.islington.gov.uk/onlineplanning/apas/run/")
#    parser = LakeDistrictParser("Lake District", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/")
#    parser = SwiftLGParser("Maidstone Borough Council", "Maidstone", "http://digitalmaidstone.co.uk/swiftlg/apas/run/")
#    parser = MoleValleyParser("Mole Valley", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/")
#    parser = SwiftLGParser("Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/")
#    parser = SwiftLGParser("Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/")
#    parser = SloughParser("Slough", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/")
#    parser = SwiftLGParser("Snowdonia National Park", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/")
#    parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
#    parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
    print(parser.getResults(22, 11, 2007))


# To Do:

#1) Check out comment url on Maidstone

#2) Daventry, when it is back up.

#3) Work out what goes wrong with Gwynedd on 06/11/2007