diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py
index 82bf7fe..800c2cb 100644
--- a/trunk/python_scrapers/PlanningExplorer.py
+++ b/trunk/python_scrapers/PlanningExplorer.py
@@ -4,7 +4,7 @@ import urlparse
import cgi
import re
import datetime
-import BeautifulSoup
+
import cookielib
@@ -13,130 +13,9 @@ cookie_jar = cookielib.CookieJar()
from BeautifulSoup import BeautifulSoup
-__auth__ = None
-
-import re
-
-date_format = "%d/%m/%Y"
-
-def fixNewlines(text):
- # This can be used to sort out windows newlines
- return text.replace("\r\n","\n")
-
-# So what can a postcode look like then?
-# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
-#AN NAA M1 1AA
-#ANN NAA M60 1NW
-#AAN NAA CR2 6XH
-#AANN NAA DN55 1PT
-#ANA NAA W1A 1HP
-#AANA NAA EC1A 1BB
-
-postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")
-
-def getPostcodeFromText(text, default_postcode="No Postcode"):
- """This function takes a piece of text and returns the first
- bit of it that looks like a postcode."""
-
- postcode_match = postcode_regex.search(text)
-
- return postcode_match.group() if postcode_match else default_postcode
-
-
-class PlanningAuthorityResults:
- """This class represents a set of results of a planning search.
-
- This should probably be separated out so that it can be used for
- authorities other than Cherwell.
- """
-
- def __init__(self, authority_name, authority_short_name):
- self.authority_name = authority_name
- self.authority_short_name = authority_short_name
-
- # this will be a list of PlanningApplication objects
- self.planning_applications = []
-
-
- def addApplication(self, application):
- self.planning_applications.append(application)
-
- def __repr__(self):
- return self.displayXML()
-
- def displayXML(self):
- """This should display the contents of this object in the planningalerts format.
- i.e. in the same format as this one:
- http://www.planningalerts.com/lambeth.xml
- """
-
- applications_bit = "".join([x.displayXML() for x in self.planning_applications])
-
- return u"""\n""" + \
- u"\n" +\
- u"%s\n" %self.authority_name +\
- u"%s\n" %self.authority_short_name +\
- u"\n" + applications_bit +\
- u"\n" +\
- u"\n"
-
-
-
-class PlanningApplication:
- def __init__(self):
- self.council_reference = None
- self.address = None
- self.postcode = None
- self.description = None
- self.info_url = None
- self.comment_url = None
-
- # expecting this as a datetime.date object
- self.date_received = None
-
- # If we can get them, we may as well include OSGB.
- # These will be the entirely numeric version.
- self.osgb_x = None
- self.osgb_y = None
-
- def __repr__(self):
- return self.displayXML()
-
- def is_ready(self):
- # This method tells us if the application is complete
- # Because of the postcode default, we can't really
- # check the postcode - make sure it is filled in when
- # you do the address.
- return self.council_reference \
- and self.address \
- and self.description \
- and self.info_url \
- and self.comment_url \
- and self.date_received
-
-
- def displayXML(self):
- #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
-
- if not self.postcode:
- self.postcode = getPostcodeFromText(self.address)
-
- contents = [
- u"" %(self.council_reference),
- u"
" %(self.address),
- u"" %self.postcode,
- u"" %(self.description),
- u"" %(self.info_url),
- u"" %(self.comment_url),
- u"" %self.date_received.strftime(date_format),
- ]
- if self.osgb_x:
- contents.append(u"%s" %(self.osgb_x))
- if self.osgb_y:
- contents.append(u"%s" %(self.osgb_y))
-
- return u"\n%s\n" %('\n'.join(contents))
-
+from PlanningUtils import PlanningApplication, \
+ PlanningAuthorityResults, \
+ getPostcodeFromText
# Date format to enter into search boxes
date_format = "%d/%m/%Y"
@@ -159,7 +38,7 @@ class PlanningExplorerParser:
# authority, then they can be overridden in a subclass.
info_url_path = "MVM/Online/Generic/"
search_url_path = "MVM/Online/PL/GeneralSearch.aspx"
-
+
# This is the most common place for comments urls to live
# The %s will be filled in with an application code
comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
@@ -226,7 +105,7 @@ class PlanningExplorerParser:
override this method returning a dictionary of header key to
header value."""
headers = {}
-
+
if self.use_firefox_user_agent:
headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"
@@ -259,7 +138,7 @@ class PlanningExplorerParser:
("csbtnSearch", "Search"),
("cboNumRecs", "99999"),
))
-
+
return post_data
@@ -271,7 +150,7 @@ class PlanningExplorerParser:
address = address_td.div.string
else:
address = address_td.string
-
+
return address
@@ -283,10 +162,10 @@ class PlanningExplorerParser:
one that parses the info page."""
return getPostcodeFromText(self._current_application.address)
-
+
def _getDescription(self, tds, info_soup):
description_td = tds[self.description_td_no]
-
+
if description_td.div is not None:
# Mostly this is in a div
# Use the empty string if the description is missing
@@ -311,7 +190,7 @@ class PlanningExplorerParser:
self.search_url = urlparse.urljoin(base_url, self.search_url_path)
self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
-
+
self.debug = debug
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
@@ -323,6 +202,7 @@ class PlanningExplorerParser:
get_request = urllib2.Request(self.search_url)
get_response = urllib2.urlopen(get_request)
+
cookie_jar.extract_cookies(get_response, get_request)
html = get_response.read()
@@ -338,7 +218,7 @@ class PlanningExplorerParser:
# The post data needs to be different for different councils
# so we have a method on each council's scraper to make it.
post_data = self._getPostData(asp_args, search_date)
-
+
headers = self._getHeaders()
request = urllib2.Request(self.search_url, post_data, headers)
@@ -371,7 +251,7 @@ class PlanningExplorerParser:
self._current_application = PlanningApplication()
# There is no need to search for the date_received, it's what
- # we searched for
+ # we searched for
self._current_application.date_received = search_date
tds = tr.findAll("td")
@@ -386,7 +266,7 @@ class PlanningExplorerParser:
if self.fetch_info_page:
# We need to quote the spaces in the info url
info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
-
+
info_soup = BeautifulSoup(urllib2.urlopen(info_request))
else:
info_soup = None
@@ -493,7 +373,7 @@ class CreweParser(PlanningExplorerParser):
info_url_path = "Northgate/PlanningExplorer/Generic/"
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
-
+
results_table_attrs = {"class": "display_table"}
def _getPostData(self, asp_args, search_date):
@@ -554,13 +434,13 @@ class HackneyParser(PlanningExplorerParser):
real_url_tuple = urlparse.urlsplit(response.geturl())
query_string = real_url_tuple[3]
-
+
# Get the query as a list of key, value pairs
parsed_query_list = list(cgi.parse_qsl(query_string))
# Go through the query string replacing any PS parameters
# with PS=99999
-
+
for i in range(len(parsed_query_list)):
key, value = parsed_query_list[i]
@@ -569,10 +449,10 @@ class HackneyParser(PlanningExplorerParser):
parsed_query_list[i] = (key, value)
new_query_string = urllib.urlencode(parsed_query_list)
-
+
new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]
-
- new_url = urlparse.urlunsplit(new_url_tuple)
+
+ new_url = urlparse.urlunsplit(new_url_tuple)
new_request = urllib2.Request(new_url, None, self._getHeaders())
new_response = urllib2.urlopen(new_request)
@@ -607,13 +487,13 @@ class HackneyParser(PlanningExplorerParser):
class KennetParser(BroadlandLike, PlanningExplorerParser):
comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
-
+
class LincolnParser(PlanningExplorerParser):
use_firefox_user_agent = True
use_referer = True
results_table_attrs = {"class": "display_table"}
-
+
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
info_url_path = "Northgate/PlanningExplorer/Generic/"
@@ -751,7 +631,7 @@ class SouthShropshireParser(PlanningExplorerParser):
("cboNumRecs", "99999"),
("csbtnSearch", "Search"),
))
-
+
return post_data
class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
@@ -759,6 +639,7 @@ class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
pass
+
class StockportParser(PlanningExplorerParser):
comments_email_address = "admin.dc@stockport.gov.uk"
info_url_path = "MVM/Online/PL/"
@@ -868,11 +749,11 @@ class MendipParser(BroadlandLike, PlanningExplorerParser):
if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
-
+
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
-# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
+ parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
# parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
@@ -895,7 +776,8 @@ if __name__ == '__main__':
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
# parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/")
parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/")
- print parser.getResults(27, 4, 2010)
+
+ print parser.getResults(12, 6, 2009)
# To Do