| @@ -4,7 +4,7 @@ import urlparse | |||||
| import cgi | import cgi | ||||
| import re | import re | ||||
| import datetime | import datetime | ||||
| import BeautifulSoup | |||||
| import cookielib | import cookielib | ||||
| @@ -13,130 +13,9 @@ cookie_jar = cookielib.CookieJar() | |||||
| from BeautifulSoup import BeautifulSoup | from BeautifulSoup import BeautifulSoup | ||||
| __auth__ = None | |||||
| import re | |||||
| date_format = "%d/%m/%Y" | |||||
def fixNewlines(text):
    """Normalize Windows CRLF line endings in *text* to plain LF."""
    return "\n".join(text.split("\r\n"))
# So what can a postcode look like then?
# This list of formats comes from
# http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
# AN NAA    M1 1AA
# ANN NAA   M60 1NW
# AAN NAA   CR2 6XH
# AANN NAA  DN55 1PT
# ANA NAA   W1A 1HP
# AANA NAA  EC1A 1BB
# Raw string so the \d escapes reach the regex engine untouched; a plain
# string literal with \d is an invalid escape sequence on modern Pythons.
postcode_regex = re.compile(r"[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text, default_postcode="No Postcode"):
    """Return the first substring of *text* that looks like a UK postcode.

    text -- the text to search.
    default_postcode -- value returned when nothing postcode-like is
        found, so callers always get a printable string back.
    """
    postcode_match = postcode_regex.search(text)
    return postcode_match.group() if postcode_match else default_postcode
class PlanningAuthorityResults:
    """The set of results of one planning search for one authority.

    Holds the authority's long and short names plus the list of
    PlanningApplication objects found, and renders the whole set in the
    planningalerts XML format.  This should probably be separated out so
    that it can be used for authorities other than Cherwell.
    """

    def __init__(self, authority_name, authority_short_name):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        # List of PlanningApplication objects collected so far.
        self.planning_applications = []

    def addApplication(self, application):
        # Record one more application in this result set.
        self.planning_applications.append(application)

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Render these results as planningalerts-format XML.

        i.e. in the same format as this one:
        http://www.planningalerts.com/lambeth.xml
        """
        applications_bit = u"".join(
            app.displayXML() for app in self.planning_applications)
        pieces = [
            u"""<?xml version="1.0" encoding="UTF-8"?>\n""",
            u"<planning>\n",
            u"<authority_name>%s</authority_name>\n" % self.authority_name,
            u"<authority_short_name>%s</authority_short_name>\n" % self.authority_short_name,
            u"<applications>\n",
            applications_bit,
            u"</applications>\n",
            u"</planning>\n",
        ]
        return u"".join(pieces)
class PlanningApplication:
    """One planning application scraped from a council's site."""

    def __init__(self):
        # The council's own reference code for this application.
        self.council_reference = None
        self.address = None
        self.postcode = None
        self.description = None
        self.info_url = None
        self.comment_url = None
        # Expecting this as a datetime.date object.
        self.date_received = None
        # If we can get them, we may as well include OSGB.
        # These will be the entirely numeric version.
        self.osgb_x = None
        self.osgb_y = None

    def __repr__(self):
        return self.displayXML()

    def is_ready(self):
        """Tell us whether the application is complete.

        Because of the postcode default, we can't really check the
        postcode - make sure it is filled in when you do the address.
        Mirrors a chained ``and``: returns the first falsy required
        field, otherwise the last one (date_received).
        """
        for field in (self.council_reference, self.address,
                      self.description, self.info_url, self.comment_url):
            if not field:
                return field
        return self.date_received

    def displayXML(self):
        """Render this application as an <application> XML element.

        As a side effect, fills in self.postcode from the address when
        no postcode has been set yet.
        """
        if not self.postcode:
            self.postcode = getPostcodeFromText(self.address)
        tagged_values = (
            (u"council_reference", self.council_reference),
            (u"address", self.address),
            (u"postcode", self.postcode),
            (u"description", self.description),
            (u"info_url", self.info_url),
            (u"comment_url", self.comment_url),
            (u"date_received", self.date_received.strftime(date_format)),
        )
        contents = [u"<%s><![CDATA[%s]]></%s>" % (tag, value, tag)
                    for tag, value in tagged_values]
        if self.osgb_x:
            contents.append(u"<osgb_x>%s</osgb_x>" % self.osgb_x)
        if self.osgb_y:
            contents.append(u"<osgb_y>%s</osgb_y>" % self.osgb_y)
        return u"<application>\n%s\n</application>" % (u'\n'.join(contents))
| from PlanningUtils import PlanningApplication, \ | |||||
| PlanningAuthorityResults, \ | |||||
| getPostcodeFromText | |||||
| # Date format to enter into search boxes | # Date format to enter into search boxes | ||||
| date_format = "%d/%m/%Y" | date_format = "%d/%m/%Y" | ||||
| @@ -159,7 +38,7 @@ class PlanningExplorerParser: | |||||
| # authority, then they can be overridden in a subclass. | # authority, then they can be overridden in a subclass. | ||||
| info_url_path = "MVM/Online/Generic/" | info_url_path = "MVM/Online/Generic/" | ||||
| search_url_path = "MVM/Online/PL/GeneralSearch.aspx" | search_url_path = "MVM/Online/PL/GeneralSearch.aspx" | ||||
| # This is the most common place for comments urls to live | # This is the most common place for comments urls to live | ||||
| # The %s will be filled in with an application code | # The %s will be filled in with an application code | ||||
| comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s" | comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s" | ||||
| @@ -226,7 +105,7 @@ class PlanningExplorerParser: | |||||
| override this method returning a dictionary of header key to | override this method returning a dictionary of header key to | ||||
| header value.""" | header value.""" | ||||
| headers = {} | headers = {} | ||||
| if self.use_firefox_user_agent: | if self.use_firefox_user_agent: | ||||
| headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10" | headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10" | ||||
| @@ -259,7 +138,7 @@ class PlanningExplorerParser: | |||||
| ("csbtnSearch", "Search"), | ("csbtnSearch", "Search"), | ||||
| ("cboNumRecs", "99999"), | ("cboNumRecs", "99999"), | ||||
| )) | )) | ||||
| return post_data | return post_data | ||||
| @@ -271,7 +150,7 @@ class PlanningExplorerParser: | |||||
| address = address_td.div.string | address = address_td.div.string | ||||
| else: | else: | ||||
| address = address_td.string | address = address_td.string | ||||
| return address | return address | ||||
| @@ -283,10 +162,10 @@ class PlanningExplorerParser: | |||||
| one that parses the info page.""" | one that parses the info page.""" | ||||
| return getPostcodeFromText(self._current_application.address) | return getPostcodeFromText(self._current_application.address) | ||||
| def _getDescription(self, tds, info_soup): | def _getDescription(self, tds, info_soup): | ||||
| description_td = tds[self.description_td_no] | description_td = tds[self.description_td_no] | ||||
| if description_td.div is not None: | if description_td.div is not None: | ||||
| # Mostly this is in a div | # Mostly this is in a div | ||||
| # Use the empty string if the description is missing | # Use the empty string if the description is missing | ||||
| @@ -311,7 +190,7 @@ class PlanningExplorerParser: | |||||
| self.search_url = urlparse.urljoin(base_url, self.search_url_path) | self.search_url = urlparse.urljoin(base_url, self.search_url_path) | ||||
| self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path) | self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path) | ||||
| self.debug = debug | self.debug = debug | ||||
| self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) | self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) | ||||
| @@ -323,6 +202,7 @@ class PlanningExplorerParser: | |||||
| get_request = urllib2.Request(self.search_url) | get_request = urllib2.Request(self.search_url) | ||||
| get_response = urllib2.urlopen(get_request) | get_response = urllib2.urlopen(get_request) | ||||
| cookie_jar.extract_cookies(get_response, get_request) | cookie_jar.extract_cookies(get_response, get_request) | ||||
| html = get_response.read() | html = get_response.read() | ||||
| @@ -338,7 +218,7 @@ class PlanningExplorerParser: | |||||
| # The post data needs to be different for different councils | # The post data needs to be different for different councils | ||||
| # so we have a method on each council's scraper to make it. | # so we have a method on each council's scraper to make it. | ||||
| post_data = self._getPostData(asp_args, search_date) | post_data = self._getPostData(asp_args, search_date) | ||||
| headers = self._getHeaders() | headers = self._getHeaders() | ||||
| request = urllib2.Request(self.search_url, post_data, headers) | request = urllib2.Request(self.search_url, post_data, headers) | ||||
| @@ -371,7 +251,7 @@ class PlanningExplorerParser: | |||||
| self._current_application = PlanningApplication() | self._current_application = PlanningApplication() | ||||
| # There is no need to search for the date_received, it's what | # There is no need to search for the date_received, it's what | ||||
| # we searched for | |||||
| # we searched for | |||||
| self._current_application.date_received = search_date | self._current_application.date_received = search_date | ||||
| tds = tr.findAll("td") | tds = tr.findAll("td") | ||||
| @@ -386,7 +266,7 @@ class PlanningExplorerParser: | |||||
| if self.fetch_info_page: | if self.fetch_info_page: | ||||
| # We need to quote the spaces in the info url | # We need to quote the spaces in the info url | ||||
| info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?=")) | info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?=")) | ||||
| info_soup = BeautifulSoup(urllib2.urlopen(info_request)) | info_soup = BeautifulSoup(urllib2.urlopen(info_request)) | ||||
| else: | else: | ||||
| info_soup = None | info_soup = None | ||||
| @@ -493,7 +373,7 @@ class CreweParser(PlanningExplorerParser): | |||||
| info_url_path = "Northgate/PlanningExplorer/Generic/" | info_url_path = "Northgate/PlanningExplorer/Generic/" | ||||
| search_url_path = "northgate/planningexplorer/generalsearch.aspx" | search_url_path = "northgate/planningexplorer/generalsearch.aspx" | ||||
| results_table_attrs = {"class": "display_table"} | results_table_attrs = {"class": "display_table"} | ||||
| def _getPostData(self, asp_args, search_date): | def _getPostData(self, asp_args, search_date): | ||||
| @@ -554,13 +434,13 @@ class HackneyParser(PlanningExplorerParser): | |||||
| real_url_tuple = urlparse.urlsplit(response.geturl()) | real_url_tuple = urlparse.urlsplit(response.geturl()) | ||||
| query_string = real_url_tuple[3] | query_string = real_url_tuple[3] | ||||
| # Get the query as a list of key, value pairs | # Get the query as a list of key, value pairs | ||||
| parsed_query_list = list(cgi.parse_qsl(query_string)) | parsed_query_list = list(cgi.parse_qsl(query_string)) | ||||
| # Go through the query string replacing any PS parameters | # Go through the query string replacing any PS parameters | ||||
| # with PS=99999 | # with PS=99999 | ||||
| for i in range(len(parsed_query_list)): | for i in range(len(parsed_query_list)): | ||||
| key, value = parsed_query_list[i] | key, value = parsed_query_list[i] | ||||
| @@ -569,10 +449,10 @@ class HackneyParser(PlanningExplorerParser): | |||||
| parsed_query_list[i] = (key, value) | parsed_query_list[i] = (key, value) | ||||
| new_query_string = urllib.urlencode(parsed_query_list) | new_query_string = urllib.urlencode(parsed_query_list) | ||||
| new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:] | new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:] | ||||
| new_url = urlparse.urlunsplit(new_url_tuple) | |||||
| new_url = urlparse.urlunsplit(new_url_tuple) | |||||
| new_request = urllib2.Request(new_url, None, self._getHeaders()) | new_request = urllib2.Request(new_url, None, self._getHeaders()) | ||||
| new_response = urllib2.urlopen(new_request) | new_response = urllib2.urlopen(new_request) | ||||
| @@ -607,13 +487,13 @@ class HackneyParser(PlanningExplorerParser): | |||||
| class KennetParser(BroadlandLike, PlanningExplorerParser): | class KennetParser(BroadlandLike, PlanningExplorerParser): | ||||
| comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s" | comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s" | ||||
| class LincolnParser(PlanningExplorerParser): | class LincolnParser(PlanningExplorerParser): | ||||
| use_firefox_user_agent = True | use_firefox_user_agent = True | ||||
| use_referer = True | use_referer = True | ||||
| results_table_attrs = {"class": "display_table"} | results_table_attrs = {"class": "display_table"} | ||||
| search_url_path = "northgate/planningexplorer/generalsearch.aspx" | search_url_path = "northgate/planningexplorer/generalsearch.aspx" | ||||
| info_url_path = "Northgate/PlanningExplorer/Generic/" | info_url_path = "Northgate/PlanningExplorer/Generic/" | ||||
| @@ -751,7 +631,7 @@ class SouthShropshireParser(PlanningExplorerParser): | |||||
| ("cboNumRecs", "99999"), | ("cboNumRecs", "99999"), | ||||
| ("csbtnSearch", "Search"), | ("csbtnSearch", "Search"), | ||||
| )) | )) | ||||
| return post_data | return post_data | ||||
| class SouthTynesideParser(BroadlandLike, PlanningExplorerParser): | class SouthTynesideParser(BroadlandLike, PlanningExplorerParser): | ||||
| @@ -759,6 +639,7 @@ class SouthTynesideParser(BroadlandLike, PlanningExplorerParser): | |||||
| pass | pass | ||||
| class StockportParser(PlanningExplorerParser): | class StockportParser(PlanningExplorerParser): | ||||
| comments_email_address = "admin.dc@stockport.gov.uk" | comments_email_address = "admin.dc@stockport.gov.uk" | ||||
| info_url_path = "MVM/Online/PL/" | info_url_path = "MVM/Online/PL/" | ||||
| @@ -868,11 +749,11 @@ class MendipParser(BroadlandLike, PlanningExplorerParser): | |||||
| if __name__ == '__main__': | if __name__ == '__main__': | ||||
| # NOTE - 04/11/2007 is a sunday | # NOTE - 04/11/2007 is a sunday | ||||
| # I'm using it to test that the scrapers behave on days with no apps. | # I'm using it to test that the scrapers behave on days with no apps. | ||||
| # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | ||||
| # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | ||||
| # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | ||||
| # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | |||||
| parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | |||||
| # parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/") | # parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/") | ||||
| # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | ||||
| # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | ||||
| @@ -895,7 +776,8 @@ if __name__ == '__main__': | |||||
| # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | ||||
| # parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/") | # parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/") | ||||
| parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/") | parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/") | ||||
| print parser.getResults(27, 4, 2010) | |||||
| print parser.getResults(12, 6, 2009) | |||||
| # To Do | # To Do | ||||