diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py
index 5c3e2ec..ebdef7e 100644
--- a/trunk/python_scrapers/PlanningExplorer.py
+++ b/trunk/python_scrapers/PlanningExplorer.py
@@ -5,6 +5,12 @@ import cgi
import re
import datetime
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()
+
+
from BeautifulSoup import BeautifulSoup
from PlanningUtils import PlanningApplication, \
@@ -64,6 +70,8 @@ class PlanningExplorerParser:
# If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
fetch_info_page = False
+ asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')
+
def _modify_response(self, response):
"""For most sites, we have managed to get all the apps on a
single page by choosing the right parameters.
@@ -90,7 +98,7 @@ class PlanningExplorerParser:
"""If an authority has info urls which are for some reason full
of crap (like Broadland does), then this method should be overridden
in order to tidy them up."""
- return url
+ return ''.join(url.split())
def _getHeaders(self):
"""If the authority requires any headers for the post request,
@@ -194,16 +202,17 @@ class PlanningExplorerParser:
get_request = urllib2.Request(self.search_url)
get_response = urllib2.urlopen(get_request)
+ cookie_jar.extract_cookies(get_response, get_request)
+
html = get_response.read()
# We need to find those ASP parameters such as __VIEWSTATE
# so we can use them in the next POST
- asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')
# re.findall gets us a list of key value pairs.
# We want to concatenate it with a tuple, so we must
# make it a tuple
- asp_args = tuple(re.findall(asp_args_regex, html))
+ asp_args = tuple(re.findall(self.asp_args_regex, html))
# The post data needs to be different for different councils
# so we have a method on each council's scraper to make it.
@@ -212,7 +221,7 @@ class PlanningExplorerParser:
headers = self._getHeaders()
request = urllib2.Request(self.search_url, post_data, headers)
-
+ cookie_jar.add_cookie_header(request)
post_response = urllib2.urlopen(request)
# We have actually been returned here by an http302 object
@@ -358,19 +367,36 @@ class CharnwoodParser(PlanningExplorerParser):
class CreweParser(PlanningExplorerParser):
use_firefox_user_agent = True
- address_td_no = 4
+ use_referer = True
+
+ info_url_path = "Northgate/PlanningExplorer/Generic/"
+ search_url_path = "northgate/planningexplorer/generalsearch.aspx"
+
+ results_table_attrs = {"class": "display_table"}
def _getPostData(self, asp_args, search_date):
year_month_day = search_date.timetuple()[:3]
post_data = urllib.urlencode(asp_args + (
- ("drDateReceived:_ctl0_hidden", urllib.quote('' %year_month_day)),
- ("drDateReceivedxxctl0_input", search_date.strftime(date_format)),
- ("drDateReceived:_ctl1_hidden", urllib.quote('' %year_month_day)),
- ("drDateReceivedxxctl1_input", search_date.strftime(date_format)),
- ("cboNumRecs", "99999"),
+ ("txtApplicantName", ""),
+ ("txtAgentName", ""),
+ ("cboStreetReferenceNumber", ""),
+ ("txtProposal", ""),
+ ("cboWardCode", ""),
+ ("cboParishCode", ""),
+ ("cboApplicationTypeCode", ""),
+ ("cboDevelopmentTypeCode", ""),
+ ("cboStatusCode", ""),
+ ("cboSelectDateValue", "DATE_RECEIVED"),
+ ("cboMonths", "1"),
+ ("cboDays", "1"),
+ ("rbGroup", "rbRange"),
+ ("dateStart", search_date.strftime(date_format)),
+ ("dateEnd", search_date.strftime(date_format)),
+ ("edrDateSelection", ""),
("csbtnSearch", "Search"),
- ))
+ )
+ )
return post_data
@@ -430,26 +456,72 @@ class HackneyParser(PlanningExplorerParser):
return new_response
-
+#txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search&
def _getPostData(self, asp_args, search_date):
+ """Note - using date registered here, not date received. There is too much time taken
+ between the council 'receiving' an app and 'registering' it for the latter to be useful."""
post_data = urllib.urlencode(asp_args + (
- ("ctl00", "DATE_RECEIVED"),
+ ("txtApplicationNumber", ""),
+ ("ctl00", "DATE_REGISTERED"),
+ ("ctl01", "1"),
+ ("ctl02", "1"),
("rbGroup", "ctl05"),
+ ("ctl07_hidden", ""),
("ctl07_input", search_date.strftime(date_format)),
+ ("ctl08_hidden", ""),
("ctl08_input", search_date.strftime(date_format)),
("edrDateSelection", "1"),
+ ("cboApplicationTypeCode", ""),
+ ("txtLocality", ""),
+ ("txtPostCode", ""),
+ ("txtPropertyName", ""),
+ ("txtPropertyNumber", ""),
+ ("txtSiteAddress", ""),
+ ("txtStreetName", ""),
("csbtnSearch", "Search"),
- ))
-
+ )
+ )
return post_data
-
+
class KennetParser(BroadlandLike, PlanningExplorerParser):
comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
class LincolnParser(PlanningExplorerParser):
use_firefox_user_agent = True
- results_table_attrs = {"class": "resultstable"}
+ use_referer = True
+
+ results_table_attrs = {"class": "display_table"}
+ search_url_path = "northgate/planningexplorer/generalsearch.aspx"
+ info_url_path = "Northgate/PlanningExplorer/Generic/"
+
+
+ def _getPostData(self, asp_args, search_date):
+ post_data = urllib.urlencode(asp_args + (
+ ("txtApplicationNumber", ""),
+ ("txtApplicantName", ""),
+ ("txtAgentName", ""),
+ ("cboApplicationTypeCode", ""),
+ ("cboStatusCode", ""),
+ ("txtPropertyName", ""),
+ ("txtPropertyNumber", ""),
+ ("cboStreetReferenceNumber", ""),
+ ("txtPostCode", ""),
+ ("cboLocality", ""),
+ ("txtProposal", ""),
+ ("cboSelectDateValue", "DATE_REGISTERED"),
+ ("cboMonths", "1"),
+ ("rbGroup", "rbDay"),
+ ("cboDays", "10"),
+ ("dateStart", search_date.strftime(date_format)),
+ ("dateEnd", search_date.strftime(date_format)),
+ ("edrDateSelection", ""),
+ ("csbtnSearch", "Search"),
+ )
+ )
+ return post_data
+
+
class LiverpoolParser(PlanningExplorerParser):
comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
use_firefox_user_agent = True
@@ -639,9 +711,9 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
- parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
-# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
+ parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
# parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
# parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
@@ -655,7 +727,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
- print parser.getResults(4, 9, 2008)
+ print parser.getResults(9, 9, 2008)
# To Do
@@ -666,3 +738,7 @@ if __name__ == '__main__':
# Charnwood
# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...
+
+# Notes:
+
+# Since they changed, Liverpool and Crewe look rather similar. They are also a little Broadlandlike. Maybe we can do some consolidation.