Explorar el Código

Fix Stockport (which is now acolnet, not PlanningExplorer).

Add Blackpool and Greenwich (acolnet sites - patch from Peter Collingbourne).
Fixes to Ocella parser (also from Peter). Adds Bridgend, Castle Point, Great Yarmouth, Havering.
Fix typo and escaping in getinvolved (patch from David Sheldon).
import/raw
duncan.parkes hace 16 años
padre
commit
29076cb994
Se han modificado 5 ficheros con 67 adiciones y 36 borrados
  1. +3
    -3
      trunk/docs/templates/getinvolved.tpl
  2. +22
    -9
      trunk/python_scrapers/AcolnetParser.py
  3. +30
    -19
      trunk/python_scrapers/Ocella.py
  4. +10
    -4
      trunk/python_scrapers/SitesToGenerate.csv
  5. +2
    -1
      trunk/python_scrapers/SwiftLG.py

+ 3
- 3
trunk/docs/templates/getinvolved.tpl Ver fichero

@@ -7,13 +7,13 @@
You can help by writing a <a href="http://en.wikipedia.org/wiki/Screen_scraping">screen scraper</a> for your local authority that we can import into planningalerts.com. There are only 2 criteria for the screen scraper:
</p>
<ol>
<li>That it can output data in the following format: <a href="http://www.planningalerts.com/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
<li>That it can accept a query string in the format day=X&month=Y&year=Z</li>
<li>That it can output data in the following format: <a href="/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
<li>That it can accept a query string in the format day=X&amp;month=Y&amp;year=Z</li>
</ol>
<p>
Other than that it's up to you. It can be in any language. You can host them yourself or we can host it for you.
</p>
<p><span class="highlight">You can grab the code for this site and view some developent tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
<p><span class="highlight">You can grab the code for this site and view some development tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
</p>
<h3>I work for a local authority and would like to make our data available</h3>
<p>


+ 22
- 9
trunk/python_scrapers/AcolnetParser.py Ver fichero

@@ -34,6 +34,7 @@ end_head_regex = re.compile("</head>?", re.IGNORECASE)


class AcolnetParser:
received_date_label = "Registration Date:"
received_date_format = "%d/%m/%Y"

comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
@@ -55,7 +56,7 @@ class AcolnetParser:
return app_table.a.string.strip()

def _getDateReceived(self, app_table):
date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split())
date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
day, month, year = date_str.split('/')
return date(int(year), int(month), int(day))

@@ -205,6 +206,15 @@ class BridgnorthParser(AcolnetParser):
#http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
return self._current_application.info_url.replace("NewPages", "PgeCommentForm")

class BlackpoolParser(AcolnetParser):
received_date_label = "Application Date:"

def _getResultsSections(self, soup):
return soup.findAll("table", {"class": "acolnet-results-table"})

def _getCommentUrl(self, app_table):
ref = self._getCouncilReference(app_table)
return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/','%2F')

class CanterburyParser(AcolnetParser):
"""Here the apps are one row each in a big table."""
@@ -227,14 +237,17 @@ class CanterburyParser(AcolnetParser):
def _getDescription(self, app_table):
return app_table.findAll("td")[2].string.strip()

#Kensington and chelsea is sufficiently different, it may as well be handled separately
class GreenwichParser(AcolnetParser):
received_date_label = "Registration date:"
comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"

# Mid Bedfordshire - there is an acolnet here, but you have to have a username
# and password to access it!
def _getInfoUrl(self, app_table):
return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1)

#Kensington and chelsea is sufficiently different, it may as well be handled separately

class MidBedsParser(AcolnetParser):
def _getCouncilReference(self, app_table):
# return app_table.findAll("a")[1].string.strip()
return app_table.findAll("a")[1].string.strip()
class OldhamParser(AcolnetParser):
@@ -361,11 +374,11 @@ if __name__ == '__main__':
#parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

#parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
print parser.getResults(day, month, year)

+ 30
- 19
trunk/python_scrapers/Ocella.py Ver fichero

@@ -50,6 +50,7 @@ class OcellaParser:
# These will be used to store the column numbers of the appropriate items in the results table
self.reference_col = None
self.address_col = None
self.applicant_col = None
self.description_col = None
self.received_date_col = None
self.accepted_date_col = None
@@ -59,6 +60,7 @@ class OcellaParser:

# First get the search page
get_request = urllib2.Request(self.base_url)
get_request.add_header('Accept', 'text/html')
get_response = urllib2.urlopen(get_request)

cookie_jar.extract_cookies(get_response, get_request)
@@ -75,6 +77,14 @@ class OcellaParser:
# but it seems we don't need it...
session_id = None

# Unless we retrieve the correct form name, we will simply get the last week's applications
submit_tag = get_soup.find('input', {'value': 'Search'}) or get_soup.find('input', {'value': 'Search for Applications'}) or get_soup.find('input', {'value': 'Submit'})
try:
submit_name = submit_tag['name']
form_name = submit_name.split('.')[0]
except TypeError:
form_name = 'FRM_PLANNING_LIST'

# # From Breckland

# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
@@ -88,21 +98,23 @@ class OcellaParser:
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

post_data = urllib.urlencode(
[('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
[('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'),
('p_instance', '1'),
('p_event_type', 'ON_CLICK'),
('p_user_args', ''),
('p_session_id', session_id),
('p_page_url', self.base_url),
('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
(form_name + '.DEFAULT.AGENT.01', ''),
(form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
(form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
(form_name + '.DEFAULT.PARISH.01', ''),
]
)
post_request = urllib2.Request(action, post_data)
cookie_jar.add_cookie_header(post_request)

post_request.add_header('Accept', 'text/html')
post_request.add_header('Referer', self.base_url)

post_response = cookie_handling_opener.open(post_request)
@@ -119,10 +131,12 @@ class OcellaParser:
th_index = 0
for th in ths:
th_content = th.font.string.strip()
if th_content == 'Reference' or th_content == 'Application Ref':
if th_content == 'Reference' or th_content == 'Application Ref' or th_content == 'Application Number':
self.reference_col = th_index
elif th_content == 'Location':
self.address_col = th_index
elif th_content == 'Applicant Details':
self.applicant_col = th_index
elif th_content == 'Proposal':
self.description_col = th_index
elif th_content == 'Development Description':
@@ -159,8 +173,12 @@ class OcellaParser:

self._current_application.address = tds[self.address_col].font.string.strip()
self._current_application.postcode = getPostcodeFromText(self._current_application.address)
if self._current_application.postcode is None and self.applicant_col is not None:
# won't always be accurate to do this but better than nothing (needed for Havering)
self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip())
self._current_application.description = tds[self.description_col].font.string.strip()
self._current_application.info_url = tds[self.reference_col].a['href']
# seems to be dependent on the implementation whether the URL is encoded (e.g. Great Yarmouth does this), so we cannot do anything more "standard"
self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;','&'))

# This is what a comment url looks like
# It seems to be no problem to remove the sessionid (which is in any case blank...)
@@ -184,20 +202,13 @@ if __name__ == '__main__':
# parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")


# Bad status line? Try changing browser id string?
# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")

# Post never comes back
# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

# Can't find the URL similar to the others, even though it is clearly Ocella
# We get a 406 at the moment. Try browser id string?
parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search")
parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")


print parser.getResults(21,5,2008)


+ 10
- 4
trunk/python_scrapers/SitesToGenerate.csv Ver fichero

@@ -192,7 +192,6 @@
"Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser"
"Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser"
"East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser"
@@ -213,9 +212,9 @@
"Ellesmere Port and Neston Borough Council", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Fareham Borough Council", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"London Borough of Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
"Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "MidBedsParser"
"Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
@@ -237,3 +236,10 @@
"Lewes District Council", "Lewes", "http://planning.lewes.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Olympic Delivery Authority", "Olympics", "http://planning.london2012.com/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BlackpoolParser"
"London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "GreenwichParser"
"Bridgend County Borough Council", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
"Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"

+ 2
- 1
trunk/python_scrapers/SwiftLG.py Ver fichero

@@ -215,7 +215,8 @@ if __name__ == '__main__':
# parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
# parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
# parser = SwiftLGParser("Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display")
parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
# parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
parser = SwiftLGParser("Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display")
print parser.getResults(26,6,2008)




Cargando…
Cancelar
Guardar