diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py
index 82bf7fe..800c2cb 100644
--- a/trunk/python_scrapers/PlanningExplorer.py
+++ b/trunk/python_scrapers/PlanningExplorer.py
@@ -4,7 +4,7 @@ import urlparse
 import cgi
 import re
 import datetime
-import BeautifulSoup
+
 
 import cookielib
 
@@ -13,130 +13,9 @@ cookie_jar = cookielib.CookieJar()
 from BeautifulSoup import BeautifulSoup
 
-__auth__ = None
-
-import re
-
-date_format = "%d/%m/%Y"
-
-def fixNewlines(text):
-    # This can be used to sort out windows newlines
-    return text.replace("\r\n","\n")
-
-# So what can a postcode look like then?
-# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
-#AN NAA      M1 1AA
-#ANN NAA     M60 1NW
-#AAN NAA     CR2 6XH
-#AANN NAA    DN55 1PT
-#ANA NAA     W1A 1HP
-#AANA NAA    EC1A 1BB
-
-postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")
-
-def getPostcodeFromText(text, default_postcode="No Postcode"):
-    """This function takes a piece of text and returns the first
-    bit of it that looks like a postcode."""
-
-    postcode_match = postcode_regex.search(text)
-
-    return postcode_match.group() if postcode_match else default_postcode
-
-
-class PlanningAuthorityResults:
-    """This class represents a set of results of a planning search.
-
-    This should probably be separated out so that it can be used for
-    authorities other than Cherwell.
-    """
-
-    def __init__(self, authority_name, authority_short_name):
-        self.authority_name = authority_name
-        self.authority_short_name = authority_short_name
-
-        # this will be a list of PlanningApplication objects
-        self.planning_applications = []
-
-
-    def addApplication(self, application):
-        self.planning_applications.append(application)
-
-    def __repr__(self):
-        return self.displayXML()
-
-    def displayXML(self):
-        """This should display the contents of this object in the planningalerts format.
-        i.e. in the same format as this one:
-        http://www.planningalerts.com/lambeth.xml
-        """
-
-        applications_bit = "".join([x.displayXML() for x in self.planning_applications])
-
-        return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
-            u"<planning>\n" +\
-            u"<authority_name>%s</authority_name>\n" %self.authority_name +\
-            u"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
-            u"<applications>\n" + applications_bit +\
-            u"</applications>\n" +\
-            u"</planning>\n"
-
-
-
-class PlanningApplication:
-    def __init__(self):
-        self.council_reference = None
-        self.address = None
-        self.postcode = None
-        self.description = None
-        self.info_url = None
-        self.comment_url = None
-
-        # expecting this as a datetime.date object
-        self.date_received = None
-
-        # If we can get them, we may as well include OSGB.
-        # These will be the entirely numeric version.
-        self.osgb_x = None
-        self.osgb_y = None
-
-    def __repr__(self):
-        return self.displayXML()
-
-    def is_ready(self):
-        # This method tells us if the application is complete
-        # Because of the postcode default, we can't really
-        # check the postcode - make sure it is filled in when
-        # you do the address.
-        return self.council_reference \
-           and self.address \
-           and self.description \
-           and self.info_url \
-           and self.comment_url \
-           and self.date_received
-
-
-    def displayXML(self):
-        #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
-
-        if not self.postcode:
-            self.postcode = getPostcodeFromText(self.address)
-
-        contents = [
-            u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
-            u"<address><![CDATA[%s]]></address>" %(self.address),
-            u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
-            u"<description><![CDATA[%s]]></description>" %(self.description),
-            u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
-            u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
-            u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
-            ]
-        if self.osgb_x:
-            contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
-        if self.osgb_y:
-            contents.append(u"<osgb_y>%s</osgb_y>" %(self.osgb_y))
-
-        return u"<application>\n%s\n</application>" %('\n'.join(contents))
-
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
 
 # Date format to enter into search boxes
 date_format = "%d/%m/%Y"
 
@@ -159,7 +38,7 @@ class PlanningExplorerParser:
     # authority, then they can be overridden in a subclass.
     info_url_path = "MVM/Online/Generic/"
     search_url_path = "MVM/Online/PL/GeneralSearch.aspx"
-    
+
     # This is the most common place for comments urls to live
     # The %s will be filled in with an application code
     comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
@@ -226,7 +105,7 @@ class PlanningExplorerParser:
         override this method returning a dictionary of header key to
        header value."""
         headers = {}
-        
+
         if self.use_firefox_user_agent:
             headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"
 
@@ -259,7 +138,7 @@ class PlanningExplorerParser:
             ("csbtnSearch", "Search"),
             ("cboNumRecs", "99999"),
             ))
-        
+
         return post_data
 
@@ -271,7 +150,7 @@ class PlanningExplorerParser:
             address = address_td.div.string
         else:
             address = address_td.string
-        
+
         return address
 
@@ -283,10 +162,10 @@ class PlanningExplorerParser:
         one that parses the info page."""
 
         return getPostcodeFromText(self._current_application.address)
-        
+
     def _getDescription(self, tds, info_soup):
        description_td = tds[self.description_td_no]
-        
+
        if description_td.div is not None:
             # Mostly this is in a div
             # Use the empty string if the description is missing
@@ -311,7 +190,7 @@
         self.search_url = urlparse.urljoin(base_url, self.search_url_path)
         self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
-        
+
         self.debug = debug
 
         self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
@@ -323,6 +202,7 @@ class PlanningExplorerParser:
         get_request = urllib2.Request(self.search_url)
         get_response = urllib2.urlopen(get_request)
 
+        cookie_jar.extract_cookies(get_response, get_request)
 
         html = get_response.read()
 
@@ -338,7 +218,7 @@
         # The post data needs to be different for different councils
         # so we have a method on each council's scraper to make it.
         post_data = self._getPostData(asp_args, search_date)
-        
+
         headers = self._getHeaders()
 
         request = urllib2.Request(self.search_url, post_data, headers)
@@ -371,7 +251,7 @@ class PlanningExplorerParser:
             self._current_application = PlanningApplication()
 
             # There is no need to search for the date_received, it's what
-            # we searched for 
+            # we searched for
             self._current_application.date_received = search_date
 
             tds = tr.findAll("td")
@@ -386,7 +266,7 @@
             if self.fetch_info_page:
                 # We need to quote the spaces in the info url
                 info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
-                
+
                 info_soup = BeautifulSoup(urllib2.urlopen(info_request))
             else:
                 info_soup = None
@@ -493,7 +373,7 @@ class CreweParser(PlanningExplorerParser):
     info_url_path = "Northgate/PlanningExplorer/Generic/"
     search_url_path = "northgate/planningexplorer/generalsearch.aspx"
-    
+
     results_table_attrs = {"class": "display_table"}
 
     def _getPostData(self, asp_args, search_date):
@@ -554,13 +434,13 @@ class HackneyParser(PlanningExplorerParser):
         real_url_tuple = urlparse.urlsplit(response.geturl())
 
         query_string = real_url_tuple[3]
-        
+
         # Get the query as a list of key, value pairs
         parsed_query_list = list(cgi.parse_qsl(query_string))
 
         # Go through the query string replacing any PS parameters
         # with PS=99999
-        
+
         for i in range(len(parsed_query_list)):
             key, value = parsed_query_list[i]
 
@@ -569,10 +449,10 @@
                 parsed_query_list[i] = (key, value)
 
         new_query_string = urllib.urlencode(parsed_query_list)
-        
+
         new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]
-        
-        new_url = urlparse.urlunsplit(new_url_tuple) 
+
+        new_url = urlparse.urlunsplit(new_url_tuple)
 
         new_request = urllib2.Request(new_url, None, self._getHeaders())
         new_response = urllib2.urlopen(new_request)
@@ -607,13 +487,13 @@ class HackneyParser(PlanningExplorerParser):
 
 class KennetParser(BroadlandLike, PlanningExplorerParser):
     comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
-    
+
 class LincolnParser(PlanningExplorerParser):
     use_firefox_user_agent = True
     use_referer = True
 
     results_table_attrs = {"class": "display_table"}
-    
+
     search_url_path = "northgate/planningexplorer/generalsearch.aspx"
     info_url_path = "Northgate/PlanningExplorer/Generic/"
 
@@ -751,7 +631,7 @@ class SouthShropshireParser(PlanningExplorerParser):
             ("cboNumRecs", "99999"),
             ("csbtnSearch", "Search"),
             ))
-        
+
         return post_data
 
 class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
@@ -759,6 +639,7 @@
 
     pass
 
+
 class StockportParser(PlanningExplorerParser):
     comments_email_address = "admin.dc@stockport.gov.uk"
     info_url_path = "MVM/Online/PL/"
 
@@ -868,11 +749,11 @@ class MendipParser(BroadlandLike, PlanningExplorerParser):
 if __name__ == '__main__':
     # NOTE - 04/11/2007 is a sunday
     # I'm using it to test that the scrapers behave on days with no apps.
-    
+
     # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
     # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
     # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
-    # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
+    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
     # parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
     # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
     # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
@@ -895,7 +776,8 @@
     # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
     # parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/")
     parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/")
-    print parser.getResults(27, 4, 2010)
+
+    print parser.getResults(12, 6, 2009)
 
 # To Do
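
Note for reviewers: the refactor only replaces the inlined copies of PlanningApplication,
PlanningAuthorityResults and getPostcodeFromText with imports from PlanningUtils, so the
module's public surface is unchanged. A minimal sketch of how a scraper is driven, assuming
Python 2 (the codebase uses urllib2 and print statements) and the trunk layout where
PlanningExplorer.py and PlanningUtils.py sit side by side; the parser name, constructor
arguments and the getResults(day, month, year) call are taken from the __main__ block above:

    # Python 2: build one council's parser and print a day's applications.
    from PlanningExplorer import CharnwoodParser

    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood",
                             "http://portal.charnwoodbc.gov.uk/")

    # getResults takes day, month, year (UK date order). Printing the result
    # emits the planningalerts-format XML: even if a PlanningAuthorityResults
    # object were returned rather than a string, its __repr__ delegates to
    # displayXML().
    print parser.getResults(12, 6, 2009)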