Explorar el Código

Fix Stockport (which is now acolnet, not PlanningExplorer).

Add Blackpool and Greenwich (acolnet sites - patch from Peter Collingbourne).
Fixes to Ocella parser (also from Peter). Adds Bridgend, Castle Point, Great Yarmouth, Havering.
Fix typo and escaping in getinvolved (patch from David Sheldon).
master
duncan.parkes hace 16 años
padre
commit
884e8f97ec
Se han modificado 5 ficheros con 67 adiciones y 36 borrados
  1. +3
    -3
      docs/templates/getinvolved.tpl
  2. +22
    -9
      python_scrapers/AcolnetParser.py
  3. +30
    -19
      python_scrapers/Ocella.py
  4. +10
    -4
      python_scrapers/SitesToGenerate.csv
  5. +2
    -1
      python_scrapers/SwiftLG.py

+ 3
- 3
docs/templates/getinvolved.tpl Ver fichero

@@ -7,13 +7,13 @@
You can help by writing a <a href="http://en.wikipedia.org/wiki/Screen_scraping">screen scraper</a> for your local authority that was can import into planningalerts.com. There are only 2 criteria for the screen scraper:
</p>
<ol>
<li>That it can output data in the following format: <a href="http://www.planningalerts.com/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
<li>That it can accept a query sting in the format day=X&month=Y&year=Z</li>
<li>That it can output data in the following format: <a href="/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
<li>That it can accept a query sting in the format day=X&amp;month=Y&amp;year=Z</li>
</ol>
<p>
Other than that it's up to you. It can be in any language. You can host them yourself or we can host it for you.
</p>
<p><span class="highlight">You can grab the code for this site and view some developent tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
<p><span class="highlight">You can grab the code for this site and view some development tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
</p>
<h3>I work for a local authority and would like to make our data available</h3>
<p>


+ 22
- 9
python_scrapers/AcolnetParser.py Ver fichero

@@ -34,6 +34,7 @@ end_head_regex = re.compile("</head>?", re.IGNORECASE)


class AcolnetParser:
received_date_label = "Registration Date:"
received_date_format = "%d/%m/%Y"

comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
@@ -55,7 +56,7 @@ class AcolnetParser:
return app_table.a.string.strip()

def _getDateReceived(self, app_table):
date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split())
date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
day, month, year = date_str.split('/')
return date(int(year), int(month), int(day))

@@ -205,6 +206,15 @@ class BridgnorthParser(AcolnetParser):
#http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
return self._current_application.info_url.replace("NewPages", "PgeCommentForm")

class BlackpoolParser(AcolnetParser):
received_date_label = "Application Date:"

def _getResultsSections(self, soup):
return soup.findAll("table", {"class": "acolnet-results-table"})

def _getCommentUrl(self, app_table):
ref = self._getCouncilReference(app_table)
return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/','%2F')

class CanterburyParser(AcolnetParser):
"""Here the apps are one row each in a big table."""
@@ -227,14 +237,17 @@ class CanterburyParser(AcolnetParser):
def _getDescription(self, app_table):
return app_table.findAll("td")[2].string.strip()

#Kensington and chelsea is sufficiently different, it may as well be handled separately
class GreenwichParser(AcolnetParser):
received_date_label = "Registration date:"
comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"

# Mid Bedfordshire - there is an acolnet here, but you have to have a username
# and password to access it!
def _getInfoUrl(self, app_table):
return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1)

#Kensington and chelsea is sufficiently different, it may as well be handled separately

class MidBedsParser(AcolnetParser):
def _getCouncilReference(self, app_table):
# return app_table.findAll("a")[1].string.strip()
return app_table.findAll("a")[1].string.strip()
class OldhamParser(AcolnetParser):
@@ -361,11 +374,11 @@ if __name__ == '__main__':
#parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

#parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
print parser.getResults(day, month, year)

+ 30
- 19
python_scrapers/Ocella.py Ver fichero

@@ -50,6 +50,7 @@ class OcellaParser:
# These will be used to store the column numbers of the appropriate items in the results table
self.reference_col = None
self.address_col = None
self.applicant_col = None
self.description_col = None
self.received_date_col = None
self.accepted_date_col = None
@@ -59,6 +60,7 @@ class OcellaParser:

# First get the search page
get_request = urllib2.Request(self.base_url)
get_request.add_header('Accept', 'text/html')
get_response = urllib2.urlopen(get_request)

cookie_jar.extract_cookies(get_response, get_request)
@@ -75,6 +77,14 @@ class OcellaParser:
# but it seems we don't need it...
session_id = None

# Unless we retrieve the correct form name, we will simply get the last week's applications
submit_tag = get_soup.find('input', {'value': 'Search'}) or get_soup.find('input', {'value': 'Search for Applications'}) or get_soup.find('input', {'value': 'Submit'})
try:
submit_name = submit_tag['name']
form_name = submit_name.split('.')[0]
except TypeError:
form_name = 'FRM_PLANNING_LIST'

# # From Breckland

# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
@@ -88,21 +98,23 @@ class OcellaParser:
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

post_data = urllib.urlencode(
[('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
[('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'),
('p_instance', '1'),
('p_event_type', 'ON_CLICK'),
('p_user_args', ''),
('p_session_id', session_id),
('p_page_url', self.base_url),
('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
(form_name + '.DEFAULT.AGENT.01', ''),
(form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
(form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
(form_name + '.DEFAULT.PARISH.01', ''),
]
)
post_request = urllib2.Request(action, post_data)
cookie_jar.add_cookie_header(post_request)

post_request.add_header('Accept', 'text/html')
post_request.add_header('Referer', self.base_url)

post_response = cookie_handling_opener.open(post_request)
@@ -119,10 +131,12 @@ class OcellaParser:
th_index = 0
for th in ths:
th_content = th.font.string.strip()
if th_content == 'Reference' or th_content == 'Application Ref':
if th_content == 'Reference' or th_content == 'Application Ref' or th_content == 'Application Number':
self.reference_col = th_index
elif th_content == 'Location':
self.address_col = th_index
elif th_content == 'Applicant Details':
self.applicant_col = th_index
elif th_content == 'Proposal':
self.description_col = th_index
elif th_content == 'Development Description':
@@ -159,8 +173,12 @@ class OcellaParser:

self._current_application.address = tds[self.address_col].font.string.strip()
self._current_application.postcode = getPostcodeFromText(self._current_application.address)
if self._current_application.postcode is None and self.applicant_col is not None:
# won't always be accurate to do this but better than nothing (needed for Havering)
self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip())
self._current_application.description = tds[self.description_col].font.string.strip()
self._current_application.info_url = tds[self.reference_col].a['href']
# seems to be dependent on the implementation whether the URL is encoded (e.g. Great Yarmouth does this), so we cannot do anything more "standard"
self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;','&'))

# This is what a comment url looks like
# It seems to be no problem to remove the sessionid (which is in any case blank...)
@@ -184,20 +202,13 @@ if __name__ == '__main__':
# parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")


# Bad status line? Try changing browser id string?
# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")

# Post never comes back
# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

# Can't find the URL similar to the others, even though it is clearly Ocella
# We get a 406 at the moment. Try browser id string?
parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search")
parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")


print parser.getResults(21,5,2008)


+ 10
- 4
python_scrapers/SitesToGenerate.csv Ver fichero

@@ -192,7 +192,6 @@
"Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser"
"Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser"
"East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser"
@@ -213,9 +212,9 @@
"Ellesmere Port and Neston Borough Council", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Fareham Borough Council", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"London Borough of Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
"Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "MidBedsParser"
"Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
@@ -237,3 +236,10 @@
"Lewes District Council", "Lewes", "http://planning.lewes.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Olympic Delivery Authority", "Olympics", "http://planning.london2012.com/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BlackpoolParser"
"London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "GreenwichParser"
"Bridgend County Borough Council", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"

+ 2
- 1
python_scrapers/SwiftLG.py Ver fichero

@@ -215,7 +215,8 @@ if __name__ == '__main__':
# parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
# parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
# parser = SwiftLGParser("Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display")
parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
# parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
parser = SwiftLGParser("Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display")
print parser.getResults(26,6,2008)




Cargando…
Cancelar
Guardar