diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 3b6cbc9..a34556b 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -14,3 +14,4 @@
 "RutlandLike.py", "420"
 "AtriumePlanning.py", "420"
 "PlanningExplorer.py", "420"
+"SwiftLG.py", "420"
diff --git a/python_scrapers/PlanningExplorer.py b/python_scrapers/PlanningExplorer.py
index b96bd6e..11cb6a0 100644
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -180,6 +180,7 @@ class PlanningExplorerParser:
         headers = self._getHeaders()
 
         request = urllib2.Request(self.search_url, post_data, headers)
+        post_response = urllib2.urlopen(request)
 
         # We have actually been returned here by an http302 object
@@ -546,14 +547,14 @@ if __name__ == '__main__':
     #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
     #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
     #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
-    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+    #parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
     #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
     #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
     #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
     #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
     #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
     #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
-    #parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
+    parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
     #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
     #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
     #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
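A note on the PlanningExplorer.py fix above: urllib2.Request(...) only constructs a request object, and nothing goes over the wire until it is handed to urllib2.urlopen(), which appears to be what the added line supplies. A minimal sketch of the pattern, with a placeholder URL, form fields, and headers rather than the scraper's real values:

import urllib
import urllib2

# Placeholder values, for illustration only.
search_url = "http://www.example.gov.uk/planning/search"
post_data = urllib.urlencode({"dateFrom": "01/11/2007", "dateTo": "02/11/2007"})
headers = {"User-Agent": "Mozilla/5.0"}

# Request() only builds the request; urlopen() actually sends it.
request = urllib2.Request(search_url, post_data, headers)
post_response = urllib2.urlopen(request)

# urllib2 follows 302 redirects itself, so by the time read() returns we
# already have the body of the redirect target (hence the scraper's comment
# about having "been returned here by an http302 object").
contents = post_response.read()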
Council", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/", "SwiftLG", "GwyneddParser" +"Lake District National Park Authority", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/", "SwiftLG", "LakeDistrictParser" +"Macclesfield Borough Council", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/", "SwiftLG", "MacclesfieldParser" +"Maidstone Borough Council", "Maidstone", "http://digitalmaidstone.co.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Mole Valley District Council", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/", "SwiftLG", "MoleValleyParser" +"Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"Slough Borough Council", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/", "SwiftLG", "SloughParser" +"Snowdonia National Park Authority", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" +"St Edmundsbury Borough Council", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/", "SwiftLG", "SwiftLGParser" diff --git a/python_scrapers/SwiftLG.py b/python_scrapers/SwiftLG.py new file mode 100644 index 0000000..59fddcb --- /dev/null +++ b/python_scrapers/SwiftLG.py @@ -0,0 +1,225 @@ + +import urllib2 +import urllib +import urlparse +import cgi +import re +import datetime + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import getPostcodeFromText, \ + PlanningAuthorityResults, \ + PlanningApplication + +# - Browser request: -------------------------- +# {POST http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0} {Host: digitalmaidstone.co.uk +# Accept: text/html, text/plain, text/css, text/sgml, */*;q=0.01 +# Accept-Encoding: gzip +# Accept-Language: en +# Pragma: no-cache +# Cache-Control: no-cache +# User-Agent: Lynx/2.8.6rel.4 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.6.3 +# Content-type: application/x-www-form-urlencoded +# Content-length: 638 +# } %25.MAINBODY.WPACIS.1.=&APNID.MAINBODY.WPACIS.1.=&JUSTLOCATION.MAINBODY.WPACIS.1.=&JUSTDEVDESC.MAINBODY.WPACIS.1.=&DEVDESC.MAINBODY.WPACIS.1.=&SURNAME.MAINBODY.WPACIS.1.=®FROMDATE.MAINBODY.WPACIS.1.=01%2F11%2F2007®TODATE.MAINBODY.WPACIS.1.=02%2F11%2F2007&DECFROMDATE.MAINBODY.WPACIS.1.=&DECTODATE.MAINBODY.WPACIS.1.=&FINALGRANTFROM.MAINBODY.WPACIS.1.=&FINALGRANTTO.MAINBODY.WPACIS.1.=&APELDGDATFROM.MAINBODY.WPACIS.1.=&APELDGDATTO.MAINBODY.WPACIS.1.=&APEDECDATFROM.MAINBODY.WPACIS.1.=&APEDECDATTO.MAINBODY.WPACIS.1.=&AREA.MAINBODY.WPACIS.1.=&WARD.MAINBODY.WPACIS.1.=&PARISH.MAINBODY.WPACIS.1.=&SEARCHBUTTON.MAINBODY.WPACIS.1.=Search +# server=[digitalmaidstone.co.uk] , port=[80], script=[/swiftlg/apas/run/WPHAPPCRITERIA] +# request_line=[POST /swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0] + +# second page +#http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=243941& +#StartIndex=11& +#SortOrder=APNID:asc& +#DispResultsAs=WPHAPPSEARCHRES& +#BackURL=Search%20Criteria + +# Date format to enter into search boxes +date_format = "%d/%m/%Y" + +class SwiftLGParser: + search_path = "WPHAPPCRITERIA" + info_path = "WPHAPPDETAIL.DisplayUrl?theApnID=%s" + comment_path ="wphmakerep.displayURL?ApnID=%s" + + def _findResultsTable(self, soup): + """Unless there is just one table in the page, the resuts table, + override this in a subclass.""" + return soup.table 
diff --git a/python_scrapers/SwiftLG.py b/python_scrapers/SwiftLG.py
new file mode 100644
index 0000000..59fddcb
--- /dev/null
+++ b/python_scrapers/SwiftLG.py
@@ -0,0 +1,225 @@
+
+import urllib2
+import urllib
+import urlparse
+import cgi
+import re
+import datetime
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import getPostcodeFromText, \
+    PlanningAuthorityResults, \
+    PlanningApplication
+
+# - Browser request: --------------------------
+# {POST http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0} {Host: digitalmaidstone.co.uk
+# Accept: text/html, text/plain, text/css, text/sgml, */*;q=0.01
+# Accept-Encoding: gzip
+# Accept-Language: en
+# Pragma: no-cache
+# Cache-Control: no-cache
+# User-Agent: Lynx/2.8.6rel.4 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.6.3
+# Content-type: application/x-www-form-urlencoded
+# Content-length: 638
+# } %25.MAINBODY.WPACIS.1.=&APNID.MAINBODY.WPACIS.1.=&JUSTLOCATION.MAINBODY.WPACIS.1.=&JUSTDEVDESC.MAINBODY.WPACIS.1.=&DEVDESC.MAINBODY.WPACIS.1.=&SURNAME.MAINBODY.WPACIS.1.=&REGFROMDATE.MAINBODY.WPACIS.1.=01%2F11%2F2007&REGTODATE.MAINBODY.WPACIS.1.=02%2F11%2F2007&DECFROMDATE.MAINBODY.WPACIS.1.=&DECTODATE.MAINBODY.WPACIS.1.=&FINALGRANTFROM.MAINBODY.WPACIS.1.=&FINALGRANTTO.MAINBODY.WPACIS.1.=&APELDGDATFROM.MAINBODY.WPACIS.1.=&APELDGDATTO.MAINBODY.WPACIS.1.=&APEDECDATFROM.MAINBODY.WPACIS.1.=&APEDECDATTO.MAINBODY.WPACIS.1.=&AREA.MAINBODY.WPACIS.1.=&WARD.MAINBODY.WPACIS.1.=&PARISH.MAINBODY.WPACIS.1.=&SEARCHBUTTON.MAINBODY.WPACIS.1.=Search
+# server=[digitalmaidstone.co.uk] , port=[80], script=[/swiftlg/apas/run/WPHAPPCRITERIA]
+# request_line=[POST /swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0]
+
+# second page:
+# http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=243941&
+# StartIndex=11&
+# SortOrder=APNID:asc&
+# DispResultsAs=WPHAPPSEARCHRES&
+# BackURL=Search%20Criteria
+
+# Date format to enter into search boxes
+date_format = "%d/%m/%Y"
+
+class SwiftLGParser:
+    search_path = "WPHAPPCRITERIA"
+    info_path = "WPHAPPDETAIL.DisplayUrl?theApnID=%s"
+    comment_path = "wphmakerep.displayURL?ApnID=%s"
+
+    def _findResultsTable(self, soup):
+        """Return the results table. The default works when the results
+        table is the first (usually the only) table on the page;
+        otherwise, override this in a subclass."""
+        return soup.table
+
+    def _findTRs(self, results_table):
+        """The usual situation is for the results table to contain
+        one row of headers, followed by a row per app.
+        If this is not the case, override this in a subclass."""
+        return results_table.findAll("tr")[1:]
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.search_url = urlparse.urljoin(base_url, self.search_path)
+        self.info_url = urlparse.urljoin(base_url, self.info_path)
+        self.comment_url = urlparse.urljoin(base_url, self.comment_path)
+
+        self.debug = debug
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        post_data = urllib.urlencode((
+            ("REGFROMDATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
+            ("REGTODATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
+            ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
+        ))
+
+        response = urllib2.urlopen(self.search_url, post_data)
+        contents = response.read()
+
+        # Check for the no results warning
+        if not contents.count("No Matching Applications Found"):
+            soup = BeautifulSoup(contents)
+
+            # Get the links to later pages of results.
+            later_pages = soup.findAll("a", {"href": re.compile("WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")})
+
+            for a in ["initial_search"] + later_pages:
+                if a != "initial_search":
+                    url = a['href']
+
+                    # Example url:
+                    # http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=Search Criteria
+
+                    # urllib2 doesn't like this url; to make it happy, we'll
+                    # get rid of the BackURL parameter, which we don't need.
+
+                    split_url = urlparse.urlsplit(url)
+                    qs = split_url[3]
+
+                    # This gets us a list of (key, value) pairs, in
+                    # query-string order.
+                    qsl = cgi.parse_qsl(qs)
+
+                    # Get rid of BackURL, which is the last parameter.
+                    qsl.pop(-1)
+
+                    # I think this is safe, as there are no repeats of parameters
+                    new_qs = urllib.urlencode(qsl)
+
+                    url = urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])
+
+                    this_page_url = urlparse.urljoin(self.base_url, url)
+                    response = urllib2.urlopen(this_page_url)
+                    contents = response.read()
+                    soup = BeautifulSoup(contents)
+
+                results_table = self._findResultsTable(soup)  #.body.find("table", {"class": "apas_tbl"})
+
+                trs = self._findTRs(results_table)
+
+                for tr in trs:
+                    self._current_application = PlanningApplication()
+
+                    tds = tr.findAll("td")
+
+                    # The first td
+
+                    #
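The fiddliest part of getResultsByDayMonthYear is the rewrite of the later-page URLs, whose unencoded "BackURL=Search Criteria" parameter (with a literal space) is what upsets urllib2. Here is a standalone sketch of that rewrite, using the example URL from the comments in the new file; it differs from the scraper only in dropping BackURL by name rather than with qsl.pop(-1), which assumes BackURL is always the last parameter:

import cgi
import urllib
import urlparse

url = ("http://digitalmaidstone.co.uk/swiftlg/apas/run/"
       "WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&"
       "SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=Search Criteria")

split_url = urlparse.urlsplit(url)

# parse_qsl returns a list of (key, value) pairs in query-string order.
qsl = cgi.parse_qsl(split_url[3])

# Drop BackURL by name and re-encode the rest.
new_qs = urllib.urlencode([(k, v) for (k, v) in qsl if k != "BackURL"])

print urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])
# http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID%3Aasc&DispResultsAs=WPHAPPSEARCHRES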