Explorar el Código

Add Ocella scraper that works for seven councils. Woop!

import/raw
duncan.parkes hace 16 años
padre
commit
6c75c672a6
Se han modificado 1 ficheros con 130 adiciones y 34 borrados
  1. +130
    -34
      trunk/python_scrapers/Ocella.py

+ 130
- 34
trunk/python_scrapers/Ocella.py Ver fichero

@@ -2,7 +2,7 @@ import urllib2
import urllib
import urlparse

import datetime
import datetime, time

import cookielib

@@ -14,9 +14,25 @@ from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d-%m-%Y"
search_date_format = "%d-%m-%Y" # Format used for the accepted date when searching

possible_date_formats = [search_date_format, "%d/%m/%Y"]

class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
"""The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
def redirect_request(self, req, fp, code, msg, headers, newurl):
new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
# We need to add a cookie from the cookie_jar
cookie_jar.add_cookie_header(new_request)

return new_request

cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())


class OcellaParser:
received_date_format = search_date_format

def __init__(self,
authority_name,
authority_short_name,
@@ -31,6 +47,12 @@ class OcellaParser:

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

# These will be used to store the column numbers of the appropriate items in the results table
self.reference_col = None
self.address_col = None
self.description_col = None
self.received_date_col = None
self.accepted_date_col = None

def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)
@@ -47,16 +69,28 @@ class OcellaParser:
action = get_soup.form['action']
session_id = get_soup.find('input', {'name': 'p_session_id'})['value']

# # From Breckland

# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
# p_instance=1
# p_event_type=ON_CLICK
# p_user_args=
# p_session_id=53573
# p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
# FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
# FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

post_data = urllib.urlencode(
[#('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
#('p_instance', '1'),
#('p_event_type', 'ON_CLICK'),
#('p_user_args', ''),
[('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
('p_instance', '1'),
('p_event_type', 'ON_CLICK'),
('p_user_args', ''),
('p_session_id', session_id),
#('p_page_url', self.base_url),
('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', '02-06-2008'), #search_date.strftime(date_format),
('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', '09-06-2008'),#search_date.strftime(date_format),
#('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
('p_page_url', self.base_url),
('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
]
)
@@ -65,39 +99,101 @@ class OcellaParser:

post_request.add_header('Referer', self.base_url)

post_response = urllib2.urlopen(post_request)
post_response = cookie_handling_opener.open(post_request)

import pdb;pdb.set_trace()
post_soup = BeautifulSoup(post_response.read())

# # From Breckland
results_table = post_soup.find("table", summary="Printing Table Headers")

# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
# p_instance=1
# p_event_type=ON_CLICK
# p_user_args=
# p_session_id=53573
# p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
# FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
# FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
trs = results_table.findAll("tr")

# # Mine
# p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
# p_user_args=
# FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=21-05-2008
# FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=21-05-2008
# p_session_id=53576
# p_instance=1
# p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
# FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
# p_event_type=ON_CLICK
# We'll use the headings in the first tr to find out what columns the address, description, etc are in.
ths = trs[0].findAll("th")

th_index = 0
for th in ths:
th_content = th.font.string.strip()
if th_content == 'Reference' or th_content == 'Application Ref':
self.reference_col = th_index
elif th_content == 'Location':
self.address_col = th_index
elif th_content == 'Proposal':
self.description_col = th_index
elif th_content == 'Development Description':
self.description_col = th_index
elif th_content == 'Received Date' or th_content == 'Date Received':
self.received_date_col = th_index
elif th_content == 'Accepted Date':
self.accepted_date_col = th_index

th_index += 1
# If there is a received date, we'll use that, otherwise, we'll have to settle for the accepted date.
self.received_date_col = self.received_date_col or self.accepted_date_col

# We want all the trs except the first one, which is just headers,
# and the last, which is empty
trs = trs[1:-1]

for tr in trs:
self._current_application = PlanningApplication()

tds = tr.findAll("td")

self._current_application.council_reference = (tds[self.reference_col].font.a or tds[self.reference_col].a.font).string.strip()

date_string = tds[self.received_date_col]

for possible_format in possible_date_formats:
try:
self._current_application.date_received = datetime.datetime(*(time.strptime(tds[self.received_date_col].font.string.strip(), possible_format)[0:6]))
except ValueError:
pass

self._current_application.address = tds[self.address_col].font.string.strip()
self._current_application.postcode = getPostcodeFromText(self._current_application.address)
self._current_application.description = tds[self.description_col].font.string.strip()
self._current_application.info_url = tds[self.reference_col].a['href']

# This is what a comment url looks like
# It seems to be no problem to remove the sessionid (which is in any case blank...)
# I can't see a good way to avoid having to go to the info page to find the moduleid though.

#http://wplan01.intranet.breckland.gov.uk:7778/pls/portal/PORTAL.wwa_app_module.link?p_arg_names=_moduleid&p_arg_values=8941787057&p_arg_names=_sessionid&p_arg_values=&p_arg_names=APPLICATION_REFERENCE&p_arg_values=3PL%2F2008%2F0877%2FF

# For the moment, we'll just use the info url, as that seems to work.
self._current_application.comment_url = self._current_application.info_url
self._results.addApplication(self._current_application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Arun", "Arun", "http://www.arun.gov.uk/iplanning/portal/page?_pageid=33,4139&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")

# Bad status line?
# parser = BrecklandParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
# parser = ArunParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

# Can't find the URL similar to the others, even though it is clearly Ocella
parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://www.great-yarmouth.gov.uk/wmplan_application_search-6.htm")



print parser.getResults(21,5,2008)

#TODO

# 1) Sort out proper comment url?
# 2) Check for pagination
# 3) Check no results case

Cargando…
Cancelar
Guardar