import urllib
import urllib2
import HTMLParser
import urlparse
import datetime
import time

# This needs a page number inserting
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# This needs the council reference
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"

from PlanningUtils import fixNewlines, \
     getPostcodeFromText, \
     PlanningAuthorityResults, \
     PlanningApplication


def _fetch(url):
    """Return the body of ``url``, closing the connection even on error."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Scrape the South Oxfordshire application list for a single date.

    Each list page carries applications for several days.  The requested
    date (a datetime.date object) is remembered so that the per-application
    info page is only downloaded for rows received on that date.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Set by getResultsByDayMonthYear before any parsing happens.
        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3.
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # td cells we have seen.  What is in which numbered td is
        # detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the
        # reference from the next bit of data.
        self._get_reference = False

        # Accumulates the text of the 5th td (the received date), which
        # may arrive in several pieces.
        self._data = ''

        # This will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        """Track table/row/cell position and pick up the info-page link."""
        # If we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # We are only interested in other tags if we are in table 3.
        if self._table_count != 3:
            return

        # If we are starting a tr, create a new PlanningApplication object
        # for the application currently being processed.
        if tag == 'tr':
            self._current_application = PlanningApplication()

        # If we see a td, increment the count.
        if tag == 'td':
            self._td_count += 1

        # If we are in the first td, and we see a link, then it is to the
        # info page for this application.  The None check protects against
        # a link appearing before any tr has been opened.
        if tag == 'a' and self._td_count == 1 \
               and self._current_application is not None:
            for key, value in attrs:
                if key == 'href':
                    self._current_application.info_url = urlparse.urljoin(search_url, value)

                    # We now know that the next bit of data is the reference.
                    self._get_reference = True

                    # href is the only attribute we are interested in.
                    break

    def handle_endtag(self, tag):
        """Finish a row (fetching its info page) or a date cell."""
        # There is no need to do anything unless we are in table 3.
        if self._table_count != 3:
            return

        application = self._current_application

        # The end of a tr indicates that the current application is
        # finished.  Now we can fetch the info page to get the address,
        # postcode, and description.
        # If we don't have a reference, then we are in the header row,
        # which we don't want.
        # There is no point in doing this if the date is not the
        # requested one.
        if tag == 'tr' \
               and application is not None \
               and application.council_reference is not None \
               and application.date_received == self._requested_date:
            info_page_parser = SouthOxfordshireInfoURLParser()
            info_page_parser.feed(_fetch(application.info_url))

            application.address = info_page_parser.address
            application.postcode = getPostcodeFromText(info_page_parser.address)
            application.description = info_page_parser.description

            # Add the current application to the results set.
            self._results.addApplication(application)

        # At the end of the 5th td, self._data should contain the
        # received date of the application.
        if tag == 'td' and self._td_count == 5:
            if application is not None:
                # Strip first: strptime rejects leading or trailing
                # whitespace around the date text.
                # %B is the locale's full month name.
                date_text = self._data.strip()
                app_year, app_month, app_day = time.strptime(date_text, "%d %B %Y")[:3]
                application.date_received = datetime.date(app_year, app_month, app_day)

            # Start counting cells afresh for the next row.
            self._data = ''
            self._td_count = 0

    def handle_data(self, data):
        """Capture the council reference (td 1) and the date text (td 5)."""
        # There is no need to do anything if we aren't in table 3.
        if self._table_count != 3:
            return

        # If we are in the first td, and the get_reference flag is set,
        # then the next data is the reference.
        if self._td_count == 1 and self._get_reference:
            self._current_application.council_reference = data

            # The comment url can now be made, as it depends only on the
            # reference.
            # On this site, the link to the comment page is only displayed
            # once the planning authority has decided who is handling this
            # application and has opened consultations.  The link below
            # works straight away, and also works for apps for which the
            # consultation period is over.
            # I have no idea if anything is actually done with these
            # comments if it is followed too early...
            self._current_application.comment_url = comment_url % {
                'reference': self._current_application.council_reference}

            # Set the get_reference flag back to False.
            self._get_reference = False

        # If we are in the 5th td, then we need to collect all the data
        # together before we can use it.  This is actually processed in
        # handle_endtag.
        if self._td_count == 5:
            self._data += data

    def handle_entityref(self, ref):
        """Turn a non-breaking space inside the date cell into a space."""
        # We might have some entity refs to clear up.
        # There is no need to bother with this if we aren't in the date
        # cell of the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the results object holding the applications received
        on the date passed in (day, month and year are ints)."""
        today = datetime.date.today()
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # To get the correct page, we need page (days div 7) + 1.
        # Floor division is spelt out so the result stays an int whatever
        # division semantics are in force.
        # NOTE(review): a date in the future gives a page number below 1,
        # exactly as before; presumably callers never ask for one.
        page_number = delta.days // 7 + 1

        self.feed(_fetch(search_url % page_number))

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as displayXML() output.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """This parser is to get the description and address out of the
    info page for a South Oxfordshire application."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # These two states will be set to:
        # 0 - if we haven't yet got that bit
        # 1 - if we are currently working on it
        # 2 - if we have finished
        self._address_state = 0
        self._description_state = 0

        # We will need to know whether or not we are in a td.
        self._in_td = False

        # This is used for collecting together data which comes in
        # several bits.
        self._data = ''

    def _still_collecting(self):
        """Return True while the address or the description is missing."""
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        """Start collecting text afresh at the beginning of each td."""
        # If we see the start of a td and we are still interested in some
        # data then set the td flag to true, and blank the data.
        if tag == 'td' and self._still_collecting():
            self._in_td = True
            self._data = ''

    def handle_endtag(self, tag):
        """At the end of a td, store its text or note which label it was."""
        if tag != 'td' or not self._still_collecting():
            return

        # If we are working on the description, set description from
        # _data and note that we need to work on it no more.
        if self._description_state == 1:
            self.description = self._data
            self._description_state = 2

        # If we are working on the address, set address from _data and
        # note that we need to work on it no more.
        elif self._address_state == 1:
            self.address = self._data
            self._address_state = 2

        # If we see data which says 'Description', then set the
        # description state to working: the next td holds the value.
        elif self._data.strip() == 'Description':
            self._description_state = 1

        # If we see data which says 'Location', then set the address
        # state to working: the next td holds the value.
        elif self._data.strip() == 'Location':
            self._address_state = 1

        # Note that we are leaving the td.
        self._in_td = False

    def handle_data(self, data):
        """Append text found inside a td while something is still wanted."""
        # If we are in a td, and we are still interested in the data for
        # something, append the current bit to self._data.
        if self._in_td and self._still_collecting():
            self._data += data


# TODO
# Find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.?
# Can we check that a scraped email address really is
# an email address?