import urllib2
import urllib
import urlparse
import cgi
import re
import datetime
import BeautifulSoup
import cookielib
cookie_jar = cookielib.CookieJar()
from BeautifulSoup import BeautifulSoup
__auth__ = None
import re
date_format = "%d/%m/%Y"
def fixNewlines(text):
    """Normalise Windows-style (CRLF) line endings to Unix newlines."""
    # Splitting on CRLF and rejoining with LF is equivalent to
    # text.replace("\r\n", "\n").
    return "\n".join(text.split("\r\n"))
# So what can a postcode look like then?
# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
# AN NAA    M1 1AA
# ANN NAA   M60 1NW
# AAN NAA   CR2 6XH
# AANN NAA  DN55 1PT
# ANA NAA   W1A 1HP
# AANA NAA  EC1A 1BB
# Raw string so the backslash escapes (\d) reach the regex engine untouched.
postcode_regex = re.compile(r"[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text, default_postcode="No Postcode"):
    """Return the first bit of `text` that looks like a UK postcode.

    If nothing postcode-shaped is found, return `default_postcode`.
    """
    postcode_match = postcode_regex.search(text)
    return postcode_match.group() if postcode_match else default_postcode
class PlanningAuthorityResults:
    """This class represents a set of results of a planning search.

    This should probably be separated out so that it can be used for
    authorities other than Cherwell.
    """
    def __init__(self, authority_name, authority_short_name):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        # this will be a list of PlanningApplication objects
        self.planning_applications = []

    def addApplication(self, application):
        """Append a PlanningApplication to this result set."""
        self.planning_applications.append(application)

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Return the contents of this object in the planningalerts format,
        i.e. in the same format as this one:
        http://www.planningalerts.com/lambeth.xml

        NOTE(review): the element tags below were reconstructed from the
        planningalerts feed format; the string literals in the original
        file had been stripped of their markup.
        """
        applications_bit = "".join([x.displayXML() for x in self.planning_applications])
        return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
            u"<planning>\n" + \
            u"<authority_name>%s</authority_name>\n" % self.authority_name + \
            u"<authority_short_name>%s</authority_short_name>\n" % self.authority_short_name + \
            u"<applications>\n" + applications_bit + \
            u"</applications>\n" + \
            u"</planning>\n"
class PlanningApplication:
    """A single planning application, rendered as planningalerts XML."""
    def __init__(self):
        self.council_reference = None
        self.address = None
        self.postcode = None
        self.description = None
        self.info_url = None
        self.comment_url = None
        # expecting this as a datetime.date object
        self.date_received = None
        # If we can get them, we may as well include OSGB.
        # These will be the entirely numeric version.
        self.osgb_x = None
        self.osgb_y = None

    def __repr__(self):
        return self.displayXML()

    def is_ready(self):
        """Tell us if the application is complete.

        Because of the postcode default, we can't really check the
        postcode - make sure it is filled in when you do the address.
        """
        return self.council_reference \
            and self.address \
            and self.description \
            and self.info_url \
            and self.comment_url \
            and self.date_received

    def displayXML(self):
        """Return this application as an <application> XML fragment.

        NOTE(review): the element tags below were reconstructed from the
        planningalerts feed format; the original literals had been stripped
        of their markup (one was even split across two lines, which was a
        syntax error).
        """
        # Fall back to scraping a postcode out of the address if we
        # weren't given one explicitly.
        if not self.postcode:
            self.postcode = getPostcodeFromText(self.address)
        contents = [
            u"<council_reference>%s</council_reference>" % (self.council_reference),
            u"<address>%s</address>" % (self.address),
            u"<postcode>%s</postcode>" % self.postcode,
            u"<description>%s</description>" % (self.description),
            u"<info_url>%s</info_url>" % (self.info_url),
            u"<comment_url>%s</comment_url>" % (self.comment_url),
            u"<date_received>%s</date_received>" % self.date_received.strftime(date_format),
        ]
        # OSGB grid coordinates are optional.
        if self.osgb_x:
            contents.append(u"<osgb_x>%s</osgb_x>" % (self.osgb_x))
        if self.osgb_y:
            contents.append(u"<osgb_y>%s</osgb_y>" % (self.osgb_y))
        return u"<application>\n%s\n</application>\n" % ('\n'.join(contents))
# Date format to enter into search boxes
date_format = "%d/%m/%Y"

# Regex for getting the application code
# (needed for the comments url, when it exists).
# Raw string so \d is passed through to the regex engine untouched.
app_code_regex = re.compile(r"PARAM0=(\d*)")
class PlanningExplorerParser:
    """Base scraper for councils running the MVM / PlanningExplorer system.

    Subclasses customise behaviour via the class attributes below and by
    overriding the hook methods (_getPostData, _sanitiseInfoUrl, etc.) to
    cope with per-authority quirks.
    """
    # If this authority doesn't have a comments page,
    # then set this email_address to an address for the
    # planning department, and it will be used in lieu of
    # a comments url.
    comments_email_address = None

    # These are the directories where the info urls, and search urls,
    # usually live underneath the base_url.
    # If these are different for a particular
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

    # This is the most common place for comments urls to live
    # The %s will be filled in with an application code
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"

    # Most authorities don't need the referer header on the post
    # request. If one does, override this in the subclass
    use_referer = False

    # Some authorities won't give us anything back if we use the
    # python urllib2 useragent string. In that case, override this
    # in a subclass to pretend to be firefox.
    use_firefox_user_agent = False

    # This is the most common css class of the table containing the
    # the search results. If it is different for a particular authority
    # it can be overridden in a subclass
    results_table_attrs = {"class": "ResultsTable"}

    # These are the most common column positions for the
    # council reference, the address, and the description
    # in the results table.
    # They should be overridden in subclasses if they are different
    # for a particular authority.
    reference_td_no = 0
    address_td_no = 1
    description_td_no = 2

    # In some cases we won't be able to get the full address/description/postcode without getting the info page for each app.
    # If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
    fetch_info_page = False

    # Extracts the hidden ASP form parameters (name, value) pairs such as
    # __VIEWSTATE from the search page.
    # NOTE(review): this pattern looks truncated — it appears to be missing
    # a leading '<input[^>' (markup seems to have been stripped from this
    # file); confirm against the original source.
    asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

    def _modify_response(self, response):
        """For most sites, we have managed to get all the apps on a
        single page by choosing the right parameters.
        If that hasn't been possible, override this method to get a
        new response object which has all the apps in one page.
        (See, for example, Hackney).
        """
        return response

    def _find_trs(self, results_table):
        """Normally, we just want a list of all the trs except the first one
        (which is usually a header).
        If the authority requires a different list of trs, override this method.
        """
        return results_table.findAll("tr")[1:]

    def _sanitisePostHtml(self, html):
        """This method can be overriden in subclasses if the
        html that comes back from the post request is bad, and
        needs tidying up before giving it to BeautifulSoup."""
        return html

    def _sanitiseInfoUrl(self, url):
        """If an authority has info urls which are for some reason full
        of crap (like Broadland does), then this method should be overridden
        in order to tidy them up."""
        # Default behaviour: remove all whitespace from the url.
        return ''.join(url.split())

    def _getHeaders(self):
        """If the authority requires any headers for the post request,
        override this method returning a dictionary of header key to
        header value."""
        headers = {}
        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"
        if self.use_referer:
            headers["Referer"] = self.search_url
        return headers

    def _getPostData(self, asp_args, search_date):
        """Accepts asp_args (a tuple of key value pairs of the pesky ASP
        parameters, and search_date, a datetime.date object for the day
        we are searching for.
        This seems to be the most common set of post data which is needed
        for PlanningExplorer sites. It won't work for all of them, so
        will sometimes need to be overridden in a subclass.
        The parameter edrDateSelection is often not needed.
        It is needed by Charnwood though, so I've left it in
        to keep things simple.
        """
        year_month_day = search_date.timetuple()[:3]
        # NOTE(review): the '' format strings below look garbled — formatting
        # an empty string with a 3-tuple raises TypeError at runtime. They
        # presumably once contained markup with %-placeholders for
        # year/month/day that was stripped from this file — TODO confirm
        # against the original source.
        post_data = urllib.urlencode(asp_args + (
            ("_ctl0", "DATE_REGISTERED"),
            ("rbGroup", "_ctl5"),
            ("_ctl7_hidden", urllib.quote('' %year_month_day)),
            ("_ctl8_hidden", urllib.quote('' %year_month_day)),
            ("edrDateSelection", "1"),
            ("csbtnSearch", "Search"),
            ("cboNumRecs", "99999"),
        ))
        return post_data

    def _getAddress(self, tds, info_soup):
        """Extract the address string from the results-table cells."""
        # If this td contains a div, then the address is the
        # string in there - otherwise, use the string in the td.
        address_td = tds[self.address_td_no]
        if address_td.div is not None:
            address = address_td.div.string
        else:
            address = address_td.string
        return address

    def _getPostCode(self, info_soup):
        """In most cases, the postcode can be got from the address in
        the results table. Some councils put the address there without the
        postcode. In this case we will have to go to the info page to get
        the postcode. This should be done by overriding this method with
        one that parses the info page."""
        return getPostcodeFromText(self._current_application.address)

    def _getDescription(self, tds, info_soup):
        """Extract the application description from the results-table cells."""
        description_td = tds[self.description_td_no]
        if description_td.div is not None:
            # Mostly this is in a div
            # Use the empty string if the description is missing
            description = description_td.div.string or ""
        else:
            # But sometimes (eg Crewe) it is directly in the td.
            # Use the empty string if the description is missing
            description = description_td.string or ""
        return description

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Set up a parser for one authority.

        authority_name -- the full display name of the authority
        authority_short_name -- the short identifier used in the XML output
        base_url -- the root url of the authority's PlanningExplorer site;
            the search and info urls are joined onto this
        debug -- stored on the instance; not otherwise used in this class
        """
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
        self.debug = debug
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse all applications received on the given date.

        Performs a GET to pick up cookies and ASP form state, POSTs the
        search, then scrapes the results table. Returns the accumulated
        PlanningAuthorityResults object.
        """
        search_date = datetime.date(year, month, day)
        # First do a get, to get some state
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)
        html = get_response.read()
        # We need to find those ASP parameters such as __VIEWSTATE
        # so we can use them in the next POST
        # re.findall gets us a list of key value pairs.
        # We want to concatenate it with a tuple, so we must
        # make it a tuple
        asp_args = tuple(re.findall(self.asp_args_regex, html))
        # The post data needs to be different for different councils
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)
        headers = self._getHeaders()
        request = urllib2.Request(self.search_url, post_data, headers)
        cookie_jar.add_cookie_header(request)
        post_response = urllib2.urlopen(request)
        # We have actually been returned here by an http302 object
        # moved, and the response called post_response is really a get.
        # In some cases, we can't get the page size set high
        # until now. In that case, override _modify_response
        # so that we get back a response with all the apps on one page.
        # We pass in headers so that any
        # NOTE(review): the comment above trails off in the original file.
        post_response = self._modify_response(post_response)
        html = self._sanitisePostHtml(post_response.read())
        soup = BeautifulSoup(html)
        results_table = soup.find("table", attrs=self.results_table_attrs)
        # If there is no results table, then there were no apps on that day.
        if results_table:
            trs = self._find_trs(results_table)
            self._current_application = None
            # The first tr is just titles, cycle through the trs after that
            for tr in trs:
                self._current_application = PlanningApplication()
                # There is no need to search for the date_received, it's what
                # we searched for
                self._current_application.date_received = search_date
                tds = tr.findAll("td")
                self._current_application.council_reference = tds[self.reference_td_no].a.string
                relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
                self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
                # Fetch the info page if we need it, otherwise set it to None
                if self.fetch_info_page:
                    # We need to quote the spaces in the info url
                    info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
                    info_soup = BeautifulSoup(urllib2.urlopen(info_request))
                else:
                    info_soup = None
                # What about a comment url?
                # There doesn't seem to be one, so we'll use the email address
                if self.comments_email_address is not None:
                    # We're using the email address, as there doesn't seem
                    # to be a web form for comments
                    self._current_application.comment_url = self.comments_email_address
                else:
                    # This link contains a code which we need for the comments url
                    # (on those sites that use it)
                    application_code = app_code_regex.search(relative_info_url).groups()[0]
                    relative_comments_url = self.comments_path %(application_code)
                    self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
                self._current_application.address = self._getAddress(tds, info_soup)
                self._current_application.postcode = self._getPostCode(info_soup)
                self._current_application.description = self._getDescription(tds, info_soup)
                self._results.addApplication(self._current_application)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: coerce day/month/year to ints, run the
        search, and return the results as a planningalerts XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BroadlandLike:
    """Mixin with the overrides shared by Broadland-style Northgate
    PlanningExplorer sites (different paths, headers and post data)."""
    # FIXME - BroadlandLike authorities don't have postcodes on their site, but
    # they do have grid references. We should use these.

    results_table_attrs = {"class": "display_table"}
    info_url_path = "Northgate/PlanningExplorer/Generic/"
    search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"
    use_firefox_user_agent = True
    use_referer = True

    def _getPostData(self, asp_args, search_date):
        """Build the POST body for the Northgate general-search form,
        searching a date range of exactly the one requested day."""
        post_data = urllib.urlencode(asp_args + (
            ("cboSelectDateValue", "DATE_RECEIVED"),
            ("rbGroup", "rbRange"),
            ("dateStart", search_date.strftime(date_format)),
            ("dateEnd", search_date.strftime(date_format)),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
        ))
        return post_data

    def _sanitiseInfoUrl(self, url):
        """The broadland info urls arrive full of rubbish. This method tidies
        them up."""
        # We need to
        # 1) Remove whitespace
        # 2) Remove stray entity remnants (a word character followed by ";").
        # NOTE(review): the original comment and pattern here looked garbled
        # (an orphaned "and" line, markup apparently stripped) — the regex is
        # kept as found; confirm against the original source.
        ws_re = re.compile(r"(?:(?:\s)|(?:\w;))*")
        return ''.join(ws_re.split(url))
class BlackburnParser(PlanningExplorerParser):
    """Scraper for Blackburn."""
    # Presumably Blackburn is one of the sites that won't respond to the
    # default python urllib2 user-agent string (see the note on
    # use_firefox_user_agent in PlanningExplorerParser) — confirm.
    use_firefox_user_agent = True
class BroadlandParser(BroadlandLike, PlanningExplorerParser):
# FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
# a better url for Broadland?
def _sanitisePostHtml(self, html):
"""The page that comes back from the post for the broadland site
has a broken doctype declaration. We need to tidy that up before
giving it to BeautifulSoup."""
# This is what it looks like - note the missing close doublequote
#'.join(html.split('