#!/usr/bin/python
"""CGI scraper for South Oxfordshire District Council planning applications.

Reads ``day``, ``month`` and ``year`` from the CGI form, scrapes the council's
application list for that date and prints the results as XML.

Reconstructed from the deletion diff of CGI/SouthOxfordshire.py.
This is Python 2 code (urllib2 / HTMLParser / urlparse).
"""

import cgi
import cgitb; cgitb.enable(display=0, logdir="/tmp")

import urllib, urllib2
import HTMLParser
import urlparse
import datetime, time

# This needs a page number inserting
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# This needs the council reference
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"


from PlanningUtils import fixNewlines, \
     getPostcodeFromText, \
     PlanningAuthorityResults, \
     PlanningApplication


def _fetch(url):
    """Return the body of *url*, closing the connection even on error."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Parse the application list page and collect one day's applications.

    The requested date (a datetime.date) is remembered so that the info page
    is only downloaded for applications received on that day, not for the
    other days listed on the same page.
    """

    # All the interesting stuff is in the third table on the page.
    _RESULTS_TABLE = 3

    # Within a row of the results table, the numbered td cells hold:
    #   1 reference (and the link to the info page)
    #   3 place and description
    #   5 date received
    # 2 and 4 are just padding.
    _REFERENCE_TD = 1
    _DATE_TD = 5

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Set by getResultsByDayMonthYear; compared in handle_endtag.
        self._requested_date = None

        # Count of the table start tags seen so far.
        self._table_count = 0

        # Count of the td start tags seen in the current row of the
        # results table. It is reset once the date cell has been read.
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the
        # reference from the next bit of data.
        self._get_reference = False

        # Accumulates the text of the date cell, which may arrive in pieces.
        self._data = ''

        # This will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        """Track table/row/cell position and pick up the info page link."""
        # If we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # We are only interested in other tags if we are in the results table.
        if self._table_count != self._RESULTS_TABLE:
            return

        # A new row means a new PlanningApplication object for the
        # application about to be processed.
        if tag == 'tr':
            self._current_application = PlanningApplication()

        # If we see a td, increment the count.
        if tag == 'td':
            self._td_count += 1

        # A link in the first cell points to the info page for this
        # application. Ignore links seen before any row has started, since
        # there is no application to attach them to.
        if tag == 'a' and \
           self._td_count == self._REFERENCE_TD and \
           self._current_application is not None:
            for key, value in attrs:
                if key == 'href':
                    self._current_application.info_url = urlparse.urljoin(search_url, value)

                    # We now know that the next bit of data is the reference.
                    self._get_reference = True

                    # href is the only attribute we are interested in.
                    break

    def handle_endtag(self, tag):
        """Finish a date cell or a whole row of the results table."""
        # There is no need to do anything unless we are in the results table
        # and a row has been started.
        if self._table_count != self._RESULTS_TABLE:
            return

        application = self._current_application
        if application is None:
            return

        # The end of a row indicates that the current application is
        # finished. Now we can fetch the info page to get the address,
        # postcode, and description.
        # If we don't have a reference, then we are in the header row,
        # which we don't want.
        # There is no point in doing this if the date is not the requested one.
        if tag == 'tr' and \
           application.council_reference is not None and \
           application.date_received == self._requested_date:

            info_page_parser = SouthOxfordshireInfoURLParser()
            info_page_parser.feed(_fetch(application.info_url))

            application.address = info_page_parser.address
            application.postcode = getPostcodeFromText(info_page_parser.address)
            application.description = info_page_parser.description

            # Add the current application to the results set.
            self._results.addApplication(application)

        # At the end of the date cell, self._data should contain
        # the received date of the application.
        if tag == 'td' and self._td_count == self._DATE_TD:
            # Strip first: surrounding whitespace would make strptime fail.
            app_year, app_month, app_day = time.strptime(self._data.strip(), "%d %B %Y")[:3]
            application.date_received = datetime.date(app_year, app_month, app_day)

            # The date cell is the last one we care about in a row, so get
            # ready for the next row.
            self._data = ''
            self._td_count = 0

    def handle_data(self, data):
        """Capture the council reference and accumulate the date text."""
        # There is no need to do anything if we aren't in the results table.
        if self._table_count != self._RESULTS_TABLE:
            return

        # If we are in the first cell, and the get_reference flag is set,
        # then this data is the reference.
        if self._td_count == self._REFERENCE_TD and self._get_reference:
            self._current_application.council_reference = data

            # The comment url can now be made, as it depends only on the
            # reference. On this site, the link to the comment page is only
            # displayed once the planning authority has decided who is
            # handling this application and has opened consultations. The
            # link below works straight away, and also works for apps for
            # which the consultation period is over. I have no idea if
            # anything is actually done with these comments if it is
            # followed too early...
            self._current_application.comment_url = comment_url % {
                'reference': self._current_application.council_reference}

            # Set the get_reference flag back to False.
            self._get_reference = False

        # In the date cell we need to collect all the data together before
        # we can use it. It is actually processed in handle_endtag.
        if self._td_count == self._DATE_TD:
            self._data += data

    def handle_entityref(self, ref):
        """Turn a non-breaking space inside the date cell into a space."""
        # There is no need to bother with this unless we are in the date
        # cell of the results table.
        if self._table_count == self._RESULTS_TABLE and self._td_count == self._DATE_TD:
            if ref == 'nbsp':
                self._data += ' '

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the results object holding the applications for the date.

        day, month and year are integers. The list page for the requested
        date is downloaded and fed through this parser.
        """
        today = datetime.date.today()

        # handle_endtag filters on self._requested_date, so the date must be
        # stored under that name or no application can ever match.
        self._requested_date = datetime.date(year, month, day)
        # Also keep the attribute name this method used to set, in case
        # anything outside this file reads it.
        self.requested_date = self._requested_date

        delta = today - self._requested_date

        # The page number is the count of whole weeks between the requested
        # date and today, plus one (presumably one page per week of
        # applications -- confirm against the site).
        page_number = delta.days // 7 + 1

        self.feed(_fetch(search_url % page_number))

        return self._results

    def getResults(self, day, month, year):
        """Return the XML for the given date; arguments may be strings."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """This parser is to get the description and address out of the info page
    for a South Oxfordshire application.

    After feeding a page, the results are in the ``address`` and
    ``description`` attributes (None if the matching label was not found).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # These two states will be set to:
        # 0 - if we haven't yet got that bit
        # 1 - if we are currently working on it
        # 2 - if we have finished
        self._address_state = 0
        self._description_state = 0

        # We will need to know whether or not we are in a td cell.
        self._in_td = False

        # This is used for collecting together data which comes in several bits.
        self._data = ''

    def _still_looking(self):
        """Return True while the address or the description is outstanding."""
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        """Start collecting the text of a new td cell."""
        # If we see the start of a td and we are still interested in some
        # data then set the td flag to true, and blank the data.
        if tag == 'td' and self._still_looking():
            self._in_td = True
            self._data = ''

    def handle_endtag(self, tag):
        """Use the text of the td cell that has just ended."""
        if tag == 'td' and self._still_looking():
            # If we are working on the description, set description from
            # _data and note that we need to work on it no more.
            if self._description_state == 1:
                self.description = self._data
                self._description_state = 2

            # If we are working on the address, set address from _data
            # and note that we need to work on it no more.
            elif self._address_state == 1:
                self.address = self._data
                self._address_state = 2

            # A cell which says 'Description' means the next cell holds
            # the description.
            elif self._data.strip() == 'Description':
                self._description_state = 1

            # A cell which says 'Location' means the next cell holds
            # the address.
            elif self._data.strip() == 'Location':
                self._address_state = 1

            # Note that we are leaving the td.
            self._in_td = False

    def handle_data(self, data):
        """Append text found inside a td cell to the collected data."""
        # If we are in a td, and we are still interested in the data for
        # something, append the current bit to self._data.
        if self._in_td and self._still_looking():
            self._data += data


# TODO

# find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.

# Can we check that scraped email address really is
# an email address?

if __name__ == "__main__":
    form = cgi.FieldStorage()
    day = form.getfirst('day')
    month = form.getfirst('month')
    year = form.getfirst('year')

    parser = SouthOxfordshireParser()

    # Run the scrape; without this there is nothing to print below.
    xml = parser.getResults(day, month, year)

    # Single-argument print(...) behaves the same as the print statement
    # under Python 2; print("") gives the blank line that ends the headers.
    print("Content-Type: text/xml")     # XML is following
    print("")
    print(xml)                          # print the xml