import urllib, urllib2 import HTMLParser import urlparse import datetime, time from PlanningUtils import PlanningAuthorityResults, \ getPostcodeFromText, \ PlanningApplication # The search results list will give us reference, location, description, # and info url of each app. # The info page gives us the received date, # and comment_url class ApplicationSearchServletParser(HTMLParser.HTMLParser): """Parser for ApplicationSearchServlet sites. """ # These indicate the column of the main table containing this # piece of information. # They should be overridden in subclasses #self._rows_to_ignore_at_start = None _reference_col_no = None _location_col_no = None _description_col_no = None def __init__(self, authority_name, authority_short_name, base_url, debug=False): HTMLParser.HTMLParser.__init__(self) self.authority_name = authority_name self.authority_short_name = authority_short_name self.base_url = base_url self.debug = debug self.search_url = urlparse.urljoin(self.base_url, "portal/servlets/ApplicationSearchServlet") self._comment_url = urlparse.urljoin(self.base_url, "portal/servlets/PlanningComments?REFNO=%(council_reference)s") self._requested_date = None # 0 - no # 1 - maybe # 2 - yes # 3 - finished self._in_results_table = 0 self._tr_count = 0 self._td_count = 0 self._data_list = [] # this will hold the application we are currently working on. self._current_application = None # The object which stores our set of planning application results self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) def _checkAttrsForResultsTable(self, attrs): raise SystemError def handle_starttag(self, tag, attrs): if self.debug: print tag, attrs if tag == "table" and self._in_results_table == 0: self._in_results_table = 1 self._checkAttrsForResultsTable(attrs) elif tag == "tr" and self._in_results_table == 2: self._tr_count += 1 self._td_count = 0 self._data_list = [] self._current_application = PlanningApplication() elif tag == "td" and self._in_results_table == 2: self._td_count += 1 elif tag == "a" and self._in_results_table == 2 and self._td_count == self._reference_col_no: # The href attribute contains the link to the info page for (key, value) in attrs: if key == "href": self._current_application.info_url = urlparse.urljoin(self.search_url, value) def handle_endtag(self, tag): if self.debug: print "ending: " , tag if tag == "table" and self._in_results_table == 2: self._in_results_table = 3 elif tag == "tr" and self._in_results_table == 2: if self._current_application.council_reference is not None: # get the received date #info_response = urllib2.urlopen(self._current_application.info_url) #info_page_parser = InfoPageParser() #info_page_parser.feed(info_response.read()) self._current_application.date_received = self._requested_date#info_page_parser.date_received self._results.addApplication(self._current_application) elif tag == "td" and self._in_results_table == 2: if self._td_count == self._location_col_no: data = ' '.join(self._data_list).strip() self._current_application.address = data postcode = getPostcodeFromText(data) if postcode is not None: self._current_application.postcode = postcode self._data_list = [] elif self._td_count == self._description_col_no: data = ' '.join(self._data_list).strip() self._current_application.description = data self._data_list = [] elif tag == 'a' and self._in_results_table == 2 and self._td_count == self._reference_col_no: data = ''.join(self._data_list).strip() self._current_application.council_reference = data self._current_application.comment_url = self._comment_url %{"council_reference": data} self._data_list = [] def handle_data(self, data): if self.debug: print data if self._in_results_table == 2: if self._td_count == self._reference_col_no or \ self._td_count == self._location_col_no or \ self._td_count == self._description_col_no: self._data_list.append(data.strip()) def getResultsByDayMonthYear(self, day, month, year): """This will return an ApplicationResults object containg the applications for the date passed in.""" # Were going to need a datetime object for the requested date self._requested_date = datetime.date(year, month, day) required_format = "%d-%m-%Y" search_data = urllib.urlencode({"ReceivedDateFrom":self._requested_date.strftime(required_format), "ReceivedDateTo":self._requested_date.strftime(required_format)}) search_request = urllib2.Request(self.search_url, search_data) search_response = urllib2.urlopen(search_request) search_contents = search_response.read() self.feed(search_contents) return self._results def getResults(self, day, month, year): return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() class CoventrySearchParser(ApplicationSearchServletParser): # results table spotter # width="100%" border="0" _reference_col_no = 1 _location_col_no = 5 _description_col_no = 8 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False for key, value in attrs: if key == 'width' and value == '100%': got_width = True elif key == 'border' and value == '0': got_border = True if got_width and got_border: self._in_results_table = 2 else: self._in_results_table = 0 class AllerdaleSearchParser(ApplicationSearchServletParser): # results table spotter #class="nis_table" summary="Table of planning applications that matched your query, showing reference number, received date, and address" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 4 def _checkAttrsForResultsTable(self, attrs): got_class = False got_summary = False for key, value in attrs: if key == 'class' and value == 'nis_table': got_class = True elif key == 'summary' and value == 'Table of planning applications that matched your query, showing reference number, received date, and address': got_summary = True if got_class and got_summary: self._in_results_table = 2 else: self._in_results_table = 0 class AlnwickSearchParser(ApplicationSearchServletParser): # results table spotter # width="100%" class="niscontent" _reference_col_no = 1 _location_col_no = 2 _description_col_no = 7 def _checkAttrsForResultsTable(self, attrs): got_class = False for key, value in attrs: if key == 'class' and value == 'niscontent': got_class = True if got_class: self._in_results_table = 2 else: self._in_results_table = 0 class BarrowSearchParser(ApplicationSearchServletParser): # results table spotter # width="100%" border="0" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 6 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False for key, value in attrs: if key == 'width' and value == '100%': got_width = True elif key == 'border' and value == '0': got_border = True if got_width and got_border: self._in_results_table = 2 else: self._in_results_table = 0 class HartlepoolSearchParser(ApplicationSearchServletParser): # results table spotter # summary="Table of planning applications that matched your query, showing reference number, received date, and address" _reference_col_no = 1 _location_col_no = 2 _description_col_no = 3 def _checkAttrsForResultsTable(self, attrs): got_summary = False for key, value in attrs: if key == 'summary' and value == "Table of planning applications that matched your query, showing reference number, received date, and address": got_summary = True if got_summary: self._in_results_table = 2 else: self._in_results_table = 0 class NorthWarksSearchParser(ApplicationSearchServletParser): # results table spotter # table width="100%" border="0" cellspacing="0" cellpadding="0" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 4 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False got_cellspacing = False got_cellpadding = False for key, value in attrs: if key == 'width' and value == "100%": got_width = True elif key == 'border' and value == '0': got_border = True elif key == 'cellspacing' and value == '0': got_cellspacing = True elif key == 'cellpadding' and value == '0': got_cellpadding = True if got_width and got_border and got_cellspacing and got_cellpadding: self._in_results_table = 2 else: self._in_results_table = 0 class StHelensSearchParser(ApplicationSearchServletParser): # results table spotter # summary="Search Results List" _reference_col_no = 1 _location_col_no = 2 _description_col_no = 5 def _checkAttrsForResultsTable(self, attrs): got_summary = False for key, value in attrs: if key == 'summary' and value == "Search Results List": got_summary = True if got_summary: self._in_results_table = 2 else: self._in_results_table = 0 class EasingtonSearchParser(ApplicationSearchServletParser): # results table spotter #table width="100%" border="0" cellspacing="0" cellpadding="0" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 6 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False got_cellspacing = False got_cellpadding = False for key, value in attrs: if key == 'width' and value == "100%": got_width = True elif key == 'border' and value == '0': got_border = True elif key == 'cellspacing' and value == '0': got_cellspacing = True elif key == 'cellpadding' and value == '0': got_cellpadding = True if got_width and got_border and got_cellspacing and got_cellpadding: self._in_results_table = 2 else: self._in_results_table = 0 class HighPeakSearchParser(ApplicationSearchServletParser): # results table spotter # table class="data" width="95%" _reference_col_no = 1 _location_col_no = 2 _description_col_no = 5 def _checkAttrsForResultsTable(self, attrs): got_class = False got_width = False for key, value in attrs: if key == 'class' and value == "data": got_class = True if key == 'width' and value == "95%": got_width = True if got_class and got_width: self._in_results_table = 2 else: self._in_results_table = 0 class WearValleySearchParser(ApplicationSearchServletParser): # results table spotter # table summary="Table of planning applications that matched your query, showing reference number, received date, and address" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 4 def _checkAttrsForResultsTable(self, attrs): got_summary= False for key, value in attrs: if key == 'summary' and value == "Table of planning applications that matched your query, showing reference number, received date, and address": got_summary = True if got_summary: self._in_results_table = 2 else: self._in_results_table = 0 class WellingboroughSearchParser(ApplicationSearchServletParser): # results table spotter #table width="100%" border="0" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 6 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False for key, value in attrs: if key == 'width' and value == "100%": got_width = True elif key == 'border' and value == "0": got_border = True if got_width and got_border: self._in_results_table = 2 else: self._in_results_table = 0 class EalingSearchParser(ApplicationSearchServletParser): # results table spotter # table width="100%" cellspacing="0px" border="1px" cellpadding="2px" bordercolor="#FFFFFF" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 4 def _checkAttrsForResultsTable(self, attrs): got_width = False got_cellspacing = False got_border = False got_cellpadding = False got_bordercolor = False for key, value in attrs: if key == 'width' and value == "100%": got_width = True elif key == 'cellspacing' and value == "0px": got_cellspacing = True elif key == 'border' and value == "1px": got_border = True elif key == 'cellpadding' and value == "2px": got_cellpadding = True elif key == 'bordercolor' and value == "#FFFFFF": got_bordercolor = True if got_width and got_cellspacing and got_border and got_cellpadding and got_bordercolor: self._in_results_table = 2 else: self._in_results_table = 0 class HaringeySearchParser(ApplicationSearchServletParser): # results table spotter # summary="Application Results" _reference_col_no = 1 _location_col_no = 2 _description_col_no = 5 def _checkAttrsForResultsTable(self, attrs): got_summary= False for key, value in attrs: if key == 'summary' and value == "Application Results": got_summary = True if got_summary: self._in_results_table = 2 else: self._in_results_table = 0 class DenbighshireSearchParser(ApplicationSearchServletParser): # results table spotter #table width="100%" border="0" _reference_col_no = 1 _location_col_no = 3 _description_col_no = 5 def _checkAttrsForResultsTable(self, attrs): got_width = False got_border = False for key, value in attrs: if key == 'width' and value == "100%": got_width = True elif key == 'border' and value == "0": got_border = True if got_width and got_border: self._in_results_table = 2 else: self._in_results_table = 0 if __name__ == "__main__": #parser = CoventrySearchParser("Coventry", "Coventry", "http://planning.coventry.gov.uk") #parser = AllerdaleSearchParser("Allerdale", "Allerdale", "http://planning.allerdale.gov.uk") #parser = AlnwickSearchParser("Alnwick", "Alnwick", "http://services.castlemorpeth.gov.uk:7777") #parser = BarrowSearchParser("Barrow", "Barrow", "http://localportal.barrowbc.gov.uk") #parser = HartlepoolSearchParser("Hartlepool", "Hartlepool", "http://eforms.hartlepool.gov.uk:7777") #parser = NorthWarksSearchParser("North Warwickshire", "North Warks", "http://planning.northwarks.gov.uk") #parser = StHelensSearchParser("St Helens", "St Helens", "http://212.248.225.150:8080") #parser = EasingtonSearchParser("Easington", "Easington", "http://planning.easington.gov.uk") #parser = HighPeakSearchParser("High Peak", "High Peak", "http://planning.highpeak.gov.uk") #parser = WearValleySearchParser("Wear Valley", "Wear Valley", "http://planning.wearvalley.gov.uk") #parser = WellingboroughSearchParser("Wellingborough", "Wellingborough", "http://planning.wellingborough.gov.uk") #parser = EalingSearchParser("Ealing", "Ealing", "http://www.pam.ealing.gov.uk") parser = HaringeySearchParser("Haringey", "Haringey", "http://www.planningservices.haringey.gov.uk") #parser = DenbighshireSearchParser("Denbighshire", "Denbighshire", "http://planning.denbighshire.gov.uk") print parser.getResults(1,8,2008)