diff --git a/docs/templates/getinvolved.tpl b/docs/templates/getinvolved.tpl index 8210eab..3af748f 100644 --- a/docs/templates/getinvolved.tpl +++ b/docs/templates/getinvolved.tpl @@ -7,13 +7,13 @@ You can help by writing a screen scraper for your local authority that we can import into planningalerts.com. There are only 2 criteria for the screen scraper:

    -
  1. That it can output data in the following format: http://www.planningalerts.com/lambeth.xml
  2. -
  3. That it can accept a query string in the format day=X&month=Y&year=Z
  4. +
  5. That it can output data in the following format: http://www.planningalerts.com/lambeth.xml
  6. +
  7. That it can accept a query string in the format day=X&month=Y&year=Z

Other than that it's up to you. It can be in any language. You can host them yourself or we can host it for you.

-

You can grab the code for this site and view some developent tickets here and join our developer mailing list here. +

You can grab the code for this site and view some development tickets here and join our developer mailing list here.

I work for a local authority and would like to make our data available

diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py index 9825b75..d73bd57 100644 --- a/python_scrapers/AcolnetParser.py +++ b/python_scrapers/AcolnetParser.py @@ -34,6 +34,7 @@ end_head_regex = re.compile("?", re.IGNORECASE) class AcolnetParser: + received_date_label = "Registration Date:" received_date_format = "%d/%m/%Y" comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s" @@ -55,7 +56,7 @@ class AcolnetParser: return app_table.a.string.strip() def _getDateReceived(self, app_table): - date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split()) + date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split()) day, month, year = date_str.split('/') return date(int(year), int(month), int(day)) @@ -205,6 +206,15 @@ class BridgnorthParser(AcolnetParser): #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958 return self._current_application.info_url.replace("NewPages", "PgeCommentForm") +class BlackpoolParser(AcolnetParser): + received_date_label = "Application Date:" + + def _getResultsSections(self, soup): + return soup.findAll("table", {"class": "acolnet-results-table"}) + + def _getCommentUrl(self, app_table): + ref = self._getCouncilReference(app_table) + return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/','%2F') class CanterburyParser(AcolnetParser): """Here the apps are one row each in a big table.""" @@ -227,14 +237,17 @@ class CanterburyParser(AcolnetParser): def _getDescription(self, app_table): return app_table.findAll("td")[2].string.strip() -#Kensington and chelsea is sufficiently different, it may as well be handled separately +class GreenwichParser(AcolnetParser): + received_date_label = "Registration date:" + comment_qs_template = 
"ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s" -# Mid Bedfordshire - there is an acolnet here, but you have to have a username -# and password to access it! + def _getInfoUrl(self, app_table): + return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1) + +#Kensington and chelsea is sufficiently different, it may as well be handled separately class MidBedsParser(AcolnetParser): def _getCouncilReference(self, app_table): -# return app_table.findAll("a")[1].string.strip() return app_table.findAll("a")[1].string.strip() class OldhamParser(AcolnetParser): @@ -361,11 +374,11 @@ if __name__ == '__main__': #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") # parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - # parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") # parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - +# parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") +# parser = AcolnetParser("Stockport 
Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") +# parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") print parser.getResults(day, month, year) diff --git a/python_scrapers/Ocella.py b/python_scrapers/Ocella.py index 082c6a5..eef26b6 100644 --- a/python_scrapers/Ocella.py +++ b/python_scrapers/Ocella.py @@ -50,6 +50,7 @@ class OcellaParser: # These will be used to store the column numbers of the appropriate items in the results table self.reference_col = None self.address_col = None + self.applicant_col = None self.description_col = None self.received_date_col = None self.accepted_date_col = None @@ -59,6 +60,7 @@ class OcellaParser: # First get the search page get_request = urllib2.Request(self.base_url) + get_request.add_header('Accept', 'text/html') get_response = urllib2.urlopen(get_request) cookie_jar.extract_cookies(get_response, get_request) @@ -75,6 +77,14 @@ class OcellaParser: # but it seems we don't need it... 
session_id = None + # Unless we retrieve the correct form name, we will simply get the last week's applications + submit_tag = get_soup.find('input', {'value': 'Search'}) or get_soup.find('input', {'value': 'Search for Applications'}) or get_soup.find('input', {'value': 'Submit'}) + try: + submit_name = submit_tag['name'] + form_name = submit_name.split('.')[0] + except TypeError: + form_name = 'FRM_PLANNING_LIST' + # # From Breckland # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01 @@ -88,21 +98,23 @@ class OcellaParser: # FRM_WEEKLY_LIST.DEFAULT.PARISH.01= post_data = urllib.urlencode( - [('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'), + [('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'), ('p_instance', '1'), ('p_event_type', 'ON_CLICK'), ('p_user_args', ''), ('p_session_id', session_id), ('p_page_url', self.base_url), - ('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)), - ('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)), - ('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''), + (form_name + '.DEFAULT.AGENT.01', ''), + (form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)), + (form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)), + (form_name + '.DEFAULT.PARISH.01', ''), ] ) post_request = urllib2.Request(action, post_data) cookie_jar.add_cookie_header(post_request) + post_request.add_header('Accept', 'text/html') post_request.add_header('Referer', self.base_url) post_response = cookie_handling_opener.open(post_request) @@ -119,10 +131,12 @@ class OcellaParser: th_index = 0 for th in ths: th_content = th.font.string.strip() - if th_content == 'Reference' or th_content == 'Application Ref': + if th_content == 'Reference' or th_content == 'Application Ref' or th_content == 'Application Number': self.reference_col = th_index elif th_content == 'Location': self.address_col = th_index + elif th_content == 'Applicant Details': + 
self.applicant_col = th_index elif th_content == 'Proposal': self.description_col = th_index elif th_content == 'Development Description': @@ -159,8 +173,12 @@ class OcellaParser: self._current_application.address = tds[self.address_col].font.string.strip() self._current_application.postcode = getPostcodeFromText(self._current_application.address) + if self._current_application.postcode is None and self.applicant_col is not None: + # won't always be accurate to do this but better than nothing (needed for Havering) + self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip()) self._current_application.description = tds[self.description_col].font.string.strip() - self._current_application.info_url = tds[self.reference_col].a['href'] + # seems to be dependent on the implementation whether the URL is encoded (e.g. Great Yarmouth does this), so we cannot do anything more "standard" + self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;','&')) # This is what a comment url looks like # It seems to be no problem to remove the sessionid (which is in any case blank...) 
@@ -184,20 +202,13 @@ if __name__ == '__main__': # parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL") - - - # Bad status line? Try changing browser id string? +# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly") # parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") - -# Post never comes back -# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") - - # Can't find the URL similar to the others, even though it is clearly Ocella - # We get a 406 at the moment. Try browser id string? 
- parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search") + parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") print parser.getResults(21,5,2008) diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 4f3cf0d..2146514 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -192,7 +192,6 @@ "Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser" "Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser" -"Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" "Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser" "East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" "North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser" @@ -213,9 +212,9 @@ "Ellesmere Port and Neston Borough Council", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" "Fareham Borough Council", "Fareham", 
"http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" "London Borough of Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" -"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" -"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" -"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser" "Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "MidBedsParser" "Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser" @@ -237,3 +236,10 @@ "Lewes District Council", "Lewes", "http://planning.lewes.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", 
"SwiftLGParser" "Olympic Delivery Authority", "Olympics", "http://planning.london2012.com/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser" +"Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BlackpoolParser" +"London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "GreenwichParser" +"Bridgend County Borough Council", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser" diff --git a/python_scrapers/SwiftLG.py b/python_scrapers/SwiftLG.py index e002183..d5e743b 100644 --- a/python_scrapers/SwiftLG.py +++ b/python_scrapers/SwiftLG.py @@ -215,7 +215,8 @@ if __name__ == '__main__': # parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/") # parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/") # parser = SwiftLGParser("Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display") - parser = SwiftLGParser("Warrington Borough Council", "Warrington", 
"http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display") +# parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display") + parser = SwiftLGParser("Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display") print parser.getResults(26,6,2008)