import urllib2
import urllib
import urlparse
import cgi
import re
import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
                          PlanningAuthorityResults, \
                          getPostcodeFromText

# Date format to enter into search boxes
date_format = "%d/%m/%Y"
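# e.g. datetime.date(2007, 11, 1).strftime(date_format) gives "01/11/2007".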

# Regex for getting the application code
# (needed for the comments url, when it exists)
app_code_regex = re.compile(r"PARAM0=(\d*)")
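# For illustration, with a made-up href:
#   app_code_regex.search("PLDetails.aspx?PARAM0=12345&PARAM1=0").group(1)
# would give "12345".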


class PlanningExplorerParser:
    # If this authority doesn't have a comments page,
    # then set this email_address to an address for the
    # planning department, and it will be used in lieu of
    # a comments url.
    comments_email_address = None

    # These are the directories where the info urls and search urls
    # usually live underneath the base_url.
    # If these are different for a particular
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

    # This is the most common place for comments urls to live.
    # The %s will be filled in with an application code.
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
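    # e.g. comments_path % "12345" gives "MVM/Online/PL/PLComments.aspx?pk=12345".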

    # Most authorities don't need the referer header on the post
    # request. If one does, override this in the subclass.
    use_referer = False

    # Some authorities won't give us anything back if we use the
    # python urllib2 useragent string. In that case, override this
    # in a subclass to pretend to be firefox.
    use_firefox_user_agent = False

    # This is the most common css class of the table containing the
    # search results. If it is different for a particular authority,
    # it can be overridden in a subclass.
    results_table_attrs = {"class": "ResultsTable"}

    # These are the most common column positions for the
    # council reference, the address, and the description
    # in the results table.
    # They should be overridden in subclasses if they are different
    # for a particular authority.
    reference_td_no = 0
    address_td_no = 1
    description_td_no = 2

    def _modify_response(self, response):
        """For most sites, we have managed to get all the apps on a
        single page by choosing the right parameters.
        If that hasn't been possible, override this method to get a
        new response object which has all the apps on one page.
        (See, for example, Hackney.)
        """
        return response

    def _find_trs(self, results_table):
        """Normally, we just want a list of all the trs except the first one
        (which is usually a header).
        If the authority requires a different list of trs, override this method.
        """
        return results_table.findAll("tr")[1:]

    def _sanitisePostHtml(self, html):
        """This method can be overridden in subclasses if the
        html that comes back from the post request is bad and
        needs tidying up before giving it to BeautifulSoup."""
        return html

    def _sanitiseInfoUrl(self, url):
        """If an authority has info urls which are for some reason full
        of crap (like Broadland does), then this method should be overridden
        in order to tidy them up."""
        return url

    def _getHeaders(self):
        """If the authority requires any headers for the post request,
        override this method, returning a dictionary of header key to
        header value."""
        headers = {}

        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"

        if self.use_referer:
            headers["Referer"] = self.search_url

        return headers
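    # Illustrative example: with both flags set, the returned dict would look
    # like {"User-Agent": "Mozilla/5.0 (... Firefox/2.0.0.10)", "Referer": self.search_url}.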

    def _getPostData(self, asp_args, search_date):
        """Accepts asp_args (a tuple of key value pairs of the pesky ASP
        parameters) and search_date, a datetime.date object for the day
        we are searching for.

        This seems to be the most common set of post data which is needed
        for PlanningExplorer sites. It won't work for all of them, so it
        will sometimes need to be overridden in a subclass.

        The parameter edrDateSelection is often not needed.
        It is needed by Charnwood though, so I've left it in
        to keep things simple.
        """
        year_month_day = search_date.timetuple()[:3]

        post_data = urllib.urlencode(asp_args + (
            ("_ctl0", "DATE_RECEIVED"),
            ("rbGroup", "_ctl5"),
            ("_ctl7_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' % year_month_day)),
            ("_ctl8_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' % year_month_day)),
            ("edrDateSelection", "1"),
            ("csbtnSearch", "Search"),
            ("cboNumRecs", "99999"),
            ))

        return post_data

    def _getPostCode(self):
        """In most cases, the postcode can be got from the address in
        the results table. Some councils put the address there without the
        postcode. In this case we will have to go to the info page to get
        the postcode. This should be done by overriding this method with
        one that parses the info page."""
        return getPostcodeFromText(self._current_application.address)

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # First do a GET, to pick up some state
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)

        html = get_response.read()

        # We need to find the ASP parameters such as __VIEWSTATE
        # so we can use them in the next POST.
        asp_args_regex = re.compile(r'<input[^>]*name="(__[A-Z]*)"[^>]*value="([^"]*)"[^>]*>')

        # re.findall gets us a list of key value pairs.
        # We want to concatenate it with a tuple, so we must
        # make it a tuple.
        asp_args = tuple(re.findall(asp_args_regex, html))
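        # Illustrative shape (values shortened and made up): asp_args might
        # look like (("__VIEWSTATE", "dDwtMTI3..."), ("__EVENTTARGET", ""), ...)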

        # The post data needs to be different for different councils,
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)

        headers = self._getHeaders()

        request = urllib2.Request(self.search_url, post_data, headers)
        post_response = urllib2.urlopen(request)

        # We have actually been returned here by an HTTP 302 (object
        # moved), so the response called post_response is really the
        # response to a GET.

        # In some cases, we can't get the page size set high
        # until now. In that case, override _modify_response
        # so that we get back a response with all the apps on one page.
        # The override can fetch again with self._getHeaders(), so that
        # any user-agent or referer requirements still apply (see Hackney).
        post_response = self._modify_response(post_response)

        html = self._sanitisePostHtml(post_response.read())

        soup = BeautifulSoup(html)

        results_table = soup.find("table", attrs=self.results_table_attrs)

        # If there is no results table, then there were no apps on that day.
        if results_table:
            trs = self._find_trs(results_table)

            self._current_application = None

            # Cycle through the trs (_find_trs has already dropped any
            # header row).
            for tr in trs:
                self._current_application = PlanningApplication()

                # There is no need to search for the date_received; it's what
                # we searched for.
                self._current_application.date_received = search_date

                tds = tr.findAll("td")

                for td_no in range(len(tds)):
                    if td_no == self.reference_td_no:
                        # This td contains the reference number and a link to details
                        self._current_application.council_reference = tds[td_no].a.string

                        relative_info_url = self._sanitiseInfoUrl(tds[td_no].a['href'])

                        self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

                        # What about a comment url?
                        if self.comments_email_address is not None:
                            # There doesn't seem to be a web form for comments,
                            # so we use the planning department's email address.
                            self._current_application.comment_url = self.comments_email_address
                        else:
                            # This link contains a code which we need for the
                            # comments url (on those sites that use it).
                            application_code = app_code_regex.search(relative_info_url).groups()[0]

                            relative_comments_url = self.comments_path % (application_code)
                            self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)

                    elif td_no == self.address_td_no:
                        # If this td contains a div, then the address is the
                        # string in there - otherwise, use the string in the td.
                        if tds[td_no].div is not None:
                            address = tds[td_no].div.string
                        else:
                            address = tds[td_no].string

                        self._current_application.address = address

                        self._current_application.postcode = self._getPostCode()

                    elif td_no == self.description_td_no:
                        if tds[td_no].div is not None:
                            # Mostly this is in a div.
                            # Use the empty string if the description is missing.
                            description = tds[td_no].div.string or ""
                        else:
                            # But sometimes (e.g. Crewe) it is directly in the td.
                            # Use the empty string if the description is missing.
                            description = tds[td_no].string or ""

                        self._current_application.description = description

                self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class BroadlandLike:
    # FIXME - BroadlandLike authorities don't have postcodes on their site,
    # but they do have grid references. We should use these.

    results_table_attrs = {"class": "display_table"}

    info_url_path = "Northgate/PlanningExplorer/Generic/"
    search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"

    use_firefox_user_agent = True
    use_referer = True

    def _getPostData(self, asp_args, search_date):
        post_data = urllib.urlencode(asp_args + (
            ("cboSelectDateValue", "DATE_RECEIVED"),
            ("rbGroup", "rbRange"),
            ("dateStart", search_date.strftime(date_format)),
            ("dateEnd", search_date.strftime(date_format)),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            ))

        return post_data

    def _sanitiseInfoUrl(self, url):
        """The broadland info urls arrive full of rubbish. This method tidies
        them up."""

        # We need to
        # 1) Remove whitespace
        # 2) Remove &#xA; and &#xD; entities

        ws_re = re.compile(r"(?:(?:\s)|(?:&#x\w;))*")

        return ''.join(ws_re.split(url))
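    # For illustration (made-up href): splitting and rejoining turns
    #   "Generic/ \r\n StdDetails.aspx?PT=Pl&#xA;anning"
    # into
    #   "Generic/StdDetails.aspx?PT=Planning"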


class BlackburnParser(PlanningExplorerParser):
    use_firefox_user_agent = True


class BroadlandParser(BroadlandLike, PlanningExplorerParser):
    # FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
    # a better url for Broadland?

    def _sanitisePostHtml(self, html):
        """The page that comes back from the post for the broadland site
        has a broken doctype declaration. We need to tidy that up before
        giving it to BeautifulSoup."""

        # This is what it looks like - note the missing closing doublequote:
        # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>

        # Split on the broken doctype and join with the doctype with
        # the closing quote.
        html = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.join(html.split('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>'))

        return html


class CamdenParser(BroadlandLike, PlanningExplorerParser):
    comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"


class CharnwoodParser(PlanningExplorerParser):
    use_firefox_user_agent = True


class CreweParser(PlanningExplorerParser):
    use_firefox_user_agent = True
    address_td_no = 4

    def _getPostData(self, asp_args, search_date):
        year_month_day = search_date.timetuple()[:3]

        post_data = urllib.urlencode(asp_args + (
            ("drDateReceived:_ctl0_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' % year_month_day)),
            ("drDateReceivedxxctl0_input", search_date.strftime(date_format)),
            ("drDateReceived:_ctl1_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' % year_month_day)),
            ("drDateReceivedxxctl1_input", search_date.strftime(date_format)),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            ))

        return post_data


class EastStaffsParser(PlanningExplorerParser):
    use_firefox_user_agent = True

    address_td_no = 4
    description_td_no = 1


class EppingForestParser(PlanningExplorerParser):
    use_firefox_user_agent = True

    address_td_no = 3
    description_td_no = 1


class ForestHeathParser(BroadlandLike, PlanningExplorerParser):
    pass


class HackneyParser(PlanningExplorerParser):
    # FIXME - This will only get the first ten records on this
    # day. Need to deal with paging.

    use_firefox_user_agent = True

    address_td_no = 6
    description_td_no = 5

    def _modify_response(self, response):
        # In order to make sure we don't have to worry about any paging,
        # we'll fetch this url again with PS=99999.
        real_url_tuple = urlparse.urlsplit(response.geturl())

        query_string = real_url_tuple[3]

        # Get the query as a list of key, value pairs
        parsed_query_list = list(cgi.parse_qsl(query_string))

        # Go through the query string replacing any PS parameters
        # with PS=99999.
        for i in range(len(parsed_query_list)):
            key, value = parsed_query_list[i]

            if key == "PS":
                value = "99999"

            parsed_query_list[i] = (key, value)

        new_query_string = urllib.urlencode(parsed_query_list)

        new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]

        new_url = urlparse.urlunsplit(new_url_tuple)
        new_request = urllib2.Request(new_url, None, self._getHeaders())
        new_response = urllib2.urlopen(new_request)

        return new_response
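    # Illustrative rewrite (made-up query parameters): a results url ending
    # "...Results.aspx?SC=...&PS=10&PG=1" would be refetched as
    # "...Results.aspx?SC=...&PS=99999&PG=1".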

    def _getPostData(self, asp_args, search_date):
        post_data = urllib.urlencode(asp_args + (
            ("ctl00", "DATE_RECEIVED"),
            ("rbGroup", "ctl05"),
            ("ctl07_input", search_date.strftime(date_format)),
            ("ctl08_input", search_date.strftime(date_format)),
            ("edrDateSelection", "1"),
            ("csbtnSearch", "Search"),
            ))

        return post_data


class KennetParser(PlanningExplorerParser):
    use_firefox_user_agent = True

    address_td_no = 3


class LincolnParser(PlanningExplorerParser):
    use_firefox_user_agent = True


class LiverpoolParser(PlanningExplorerParser):
    comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
    use_firefox_user_agent = True
    use_referer = True

    results_table_attrs = {"xmlns:mvm": "http://www.mvm.co.uk"}

    info_url_path = "mvm/"
    search_url_path = "mvm/planningsearch.aspx"

    def _find_trs(self, results_table):
        """In this case we are after the trs which have a class attribute of
        row0 or row1, skipping the first three of them."""
        return results_table.findAll("tr", {"class": ["row0", "row1"]})[3:]

    def _getPostData(self, asp_args, search_date):
        post_data = urllib.urlencode(asp_args + (
            ("dummy", "dummy field\tused for custom\tvalidator"),
            ("drReceived$txtStart", search_date.strftime(date_format)),
            ("drReceived$txtEnd", search_date.strftime(date_format)),
            ("cboNumRecs", "99999"),
            ("cmdSearch", "Search"),
            ))

        return post_data

    def _sanitiseInfoUrl(self, url):
        """The liverpool info urls arrive full of rubbish. This method tidies
        them up."""

        # We need to
        # 1) Remove whitespace
        # 2) Remove &#xA; and &#xD; entities

        ws_re = re.compile(r"(?:(?:\s)|(?:&#x\w;))*")

        return ''.join(ws_re.split(url))


# FIXME - Merton, Shrewsbury, and South Norfolk need to be done here.
# All are down today...

class SouthNorfolkParser(PlanningExplorerParser):
    use_firefox_user_agent = True


class SouthShropshireParser(PlanningExplorerParser):
    comments_email_address = "planning@southshropshire.gov.uk"
    use_firefox_user_agent = True
    info_url_path = "MVM/Online/PL/"

    def _getPostData(self, asp_args, search_date):
        local_date_format = "%d-%m-%Y"
        year, month, day = search_date.timetuple()[:3]

        post_data = urllib.urlencode(asp_args + (
            ("edrDateSelection:htxtRange", "radRangeBetween"),
            ("cboDateList", "DATE_RECEIVED"),
            ("edrDateSelection:txtStart", search_date.strftime(local_date_format)),
            ("edrDateSelection:txtEnd", search_date.strftime(local_date_format)),
            ("edrDateSelection:txtDateReceived", "%(day)d-%(month)d-%(year)d~%(day)d-%(month)d-%(year)d" % {"day": day, "month": month, "year": year}),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            ))

        return post_data


class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
    # Unlike the other BroadlandLike sites, there are postcodes :-)
    pass


class StockportParser(PlanningExplorerParser):
    comments_email_address = "admin.dc@stockport.gov.uk"
    info_url_path = "MVM/Online/PL/"

    def _getPostData(self, asp_args, search_date):
        post_data = urllib.urlencode(asp_args + (
            ("drDateReceived:txtStart", search_date.strftime(date_format)),
            ("drDateReceived:txtEnd", search_date.strftime(date_format)),
            ("cboNumRecs", "99999"),
            ("csbtnSearch", "Search"),
            ))

        return post_data


# FIXME - should add Swansea, but it is currently down.

class TamworthParser(PlanningExplorerParser):
    comments_email_address = "planningadmin@tamworth.gov.uk"
    use_firefox_user_agent = True
    info_url_path = "MVM/Online/PL/"


class TraffordParser(PlanningExplorerParser):
    # There are no postcodes on the Trafford site.
    use_firefox_user_agent = True
    address_td_no = 3


class WestOxfordshireParser(PlanningExplorerParser):
    address_td_no = 3
    description_td_no = 1

    use_firefox_user_agent = True


if __name__ == '__main__':
    # NOTE - 04/11/2007 is a Sunday.
    # I'm using it to test that the scrapers behave on days with no apps.

    #parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
    #parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
    #parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
    #parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
    #parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
    #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
    #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
    #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
    #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
    #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
    #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
    #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
    #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
    #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
    #parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
    #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
    #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
    #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
    print parser.getResults(1, 11, 2007)

# To Do

# Sort out paging:
# South Shropshire - pages on 6

# Investigate catching unavailable message:
# Charnwood