diff --git a/trunk/CGI/CGITemplate b/trunk/CGI/CGITemplate
new file mode 100644
index 0000000..e72f31e
--- /dev/null
+++ b/trunk/CGI/CGITemplate
@@ -0,0 +1,29 @@
+# This is the parser for %(authority_name)s.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+# The r conversion makes the generator emit quoted, escaped Python string literals.
+authority_name = %(authority_name)r
+authority_short_name = %(authority_short_name)r
+base_url = %(base_url)r
+
+import PublicAccess
+
+parser = PublicAccess.PublicAccessParser(authority_name,
+                                         authority_short_name,
+                                         base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/trunk/CGI/PlanningUtils.py b/trunk/CGI/PlanningUtils.py
new file mode 100644
index 0000000..3430576
--- /dev/null
+++ b/trunk/CGI/PlanningUtils.py
@@ -0,0 +1,101 @@
+__auth__ = None  # NOTE(review): not referenced anywhere in this module; purpose unclear
+
+import re
+
+date_format = "%d/%m/%Y"  # strftime pattern PlanningApplication.displayXML applies to date_received
+
+
+def xmlQuote(text):
+    """Return text with the XML special characters &, < and > escaped."""
+    # Local import: the standard-library helper replaces the hand-rolled replace().
+    from xml.sax.saxutils import escape
+    return escape(text)
+
+def fixNewlines(text):
+    """Return text with Windows (CRLF) line endings turned into plain LF."""
+    return "\n".join(text.split("\r\n"))
+
+# So what can a postcode look like then?
+# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
+#AN NAA M1 1AA
+#ANN NAA M60 1NW
+#AAN NAA CR2 6XH
+#AANN NAA DN55 1PT
+#ANA NAA W1A 1HP
+#AANA NAA EC1A 1BB
+
+postcode_regex = re.compile(r"[A-Z]{1,2}\d[\dA-Z]? ?\d[A-Z]{2}")
+
+def getPostcodeFromText(text):
+    """Return the first postcode-shaped substring of text.
+
+    Gives None when nothing in text matches postcode_regex.
+    """
+    found = postcode_regex.search(text)
+    # A failed search gives None; only a real match object has group().
+    return found.group() if found is not None else None
+
+
+class PlanningAuthorityResults:
+    """This class represents a set of results of a planning search.
+
+    This should probably be separated out so that it can be used for
+    authorities other than Cherwell.
+    """
+
+    def __init__(self, authority_name, authority_short_name):
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+
+        # this will be a list of PlanningApplication objects
+        self.planning_applications = []
+
+
+    def addApplication(self, application):
+        self.planning_applications.append(application)
+
+    def __repr__(self):
+        return self.displayXML()
+
+    def displayXML(self):
+        """Return the contents of this object as XML in the planningalerts format,
+        i.e. in the same format as this one: http://www.planningalerts.com/lambeth.xml
+        NOTE(review): element names are inferred from the attribute names; confirm.
+        """
+        # The authority names are escaped too, so a name such as "X & Y" stays well-formed.
+        applications_bit = "".join([x.displayXML() for x in self.planning_applications])
+
+        return "<planning>\n" +\
+               "<authority_name>%s</authority_name>\n" %xmlQuote(self.authority_name) +\
+               "<authority_short_name>%s</authority_short_name>\n" %xmlQuote(self.authority_short_name) +\
+               "<applications>\n" + applications_bit +\
+               "</applications>\n" +\
+               "</planning>\n"
+
+
+
+class PlanningApplication:
+    """A single planning application; every field except postcode starts as None."""
+    def __init__(self, no_postcode_default='No postcode'):
+        self.council_reference = None
+        self.address = None
+        self.postcode = no_postcode_default
+        self.description = None
+        self.info_url = None
+        self.comment_url = None
+        self.date_received = None  # expecting this as a datetime.date object
+
+    def __repr__(self):
+        return self.displayXML()
+
+    def displayXML(self):
+        """Return one <application> element. NOTE(review): tag names inferred from the attributes; confirm."""
+        return "<application>\n" +\
+               "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
+               "<address>%s</address>\n" %xmlQuote(self.address) +\
+               "<postcode>%s</postcode>\n" %self.postcode +\
+               "<description>%s</description>\n" %xmlQuote(self.description) +\
+               "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
+               "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
+               "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
+               "</application>\n"
diff --git a/trunk/CGI/PublicAccess.py b/trunk/CGI/PublicAccess.py
new file mode 100644
index 0000000..db635be
--- /dev/null
+++ b/trunk/CGI/PublicAccess.py
@@ -0,0 +1,341 @@
+#!/usr/bin/python
+
+import urllib, urllib2
+import HTMLParser
+import urlparse
+import datetime, time
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()  # NOTE(review): created at import time but not referenced by any code visible in this patch
+
+
+from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication
+
+
+# URL fragments for the PublicAccess pages; presumably joined onto a site's base_url.
+# NOTE(review): none of these three names is referenced by the code visible in this patch.
+search_form_url_end = "tdc/DcApplication/application_searchform.aspx"
+search_results_url_end = "tdc/DcApplication/application_searchresults.aspx"
+comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx"
+
+class PublicAccessParser(HTMLParser.HTMLParser):
+    """This is the class which parses the PublicAccess search results page.
+
+    NOTE(review): the handlers below refer to the module-level names
+    search_url and comment_url, which this module does not define; they
+    have to be supplied before this parser can run against a real site.
+    """
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        HTMLParser.HTMLParser.__init__(self)
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        # Stored only; nothing in this class reads it yet.
+        self.debug = debug
+
+        # this will change to True when we enter the table of results
+        self._in_results_table = False
+
+        # this will be set to True when we have passed the header row
+        # in the results table
+        self._past_header_row = False
+
+        # NOTE(review): part of this initialiser was truncated in the patch
+        # text. The two attributes below are read by the handlers, so they
+        # are initialised here; anything else that was lost is not restored.
+
+        # Set by getResultsByDayMonthYear before any HTML is fed in.
+        self._requested_date = None
+
+        # A count of the number of tables we have seen.
+        # The handlers only look inside table 3.
+        self._table_count = 0
+
+        # While inside table 3, a count of the number of <td>s we have
+        # seen. What is in which numbered <td> is detailed below.
+        # 1 reference
+        # 3 place and description
+        # 5 date received
+        # 2 and 4 are just padding
+        self._td_count = 0
+
+        # This is just a flag to say that we are now ready to get the reference
+        # from the next bit of data
+        self._get_reference = False
+
+        # Text collected so far for the date cell (the 5th <td>).
+        self._data = ''
+
+        # this will hold the application we are currently working on.
+        self._current_application = None
+
+        # The object which stores our set of planning application results
+        self._results = PlanningAuthorityResults(authority_name, authority_short_name)
+
+    def _fetch(self, url):
+        # Download url and return its body, always closing the connection.
+        response = urllib2.urlopen(url)
+        try:
+            return response.read()
+        finally:
+            response.close()
+
+    def handle_starttag(self, tag, attrs):
+        # if we see a table tag, increment the table count.
+        if tag == 'table':
+            self._table_count += 1
+
+        # we are only interested in other tags if we are in table 3.
+        if self._table_count == 3:
+
+            # If we are starting a <tr>, create a new PlanningApplication object
+            # for the application currently being processed
+            if tag == 'tr':
+                self._current_application = PlanningApplication()
+
+            # if we see a td, increment the <td> count.
+            if tag == 'td':
+                self._td_count += 1
+
+            # if we are in the first <td>, and we see a link,
+            # then it is to the info page for this application.
+            if tag == 'a' and self._td_count == 1:
+                for key, value in attrs:
+                    if key == 'href':
+                        # NOTE(review): search_url is not defined in this module.
+                        self._current_application.info_url = urlparse.urljoin(search_url, value)
+
+                        # We now know that the next bit of data is the reference
+                        self._get_reference = True
+
+                        # href is the only attribute we are interested in.
+                        break
+
+    def handle_endtag(self, tag):
+        # There is no need to do anything unless we are in table 3.
+        if self._table_count == 3:
+
+            # The end </tr> indicates that the current application is finished.
+            # Now we can fetch the info_page to get the address, postcode,
+            # and description.
+            # If we don't have a reference, then we are in the header row,
+            # which we don't want.
+            # There is no point in doing this if the date is not the requested one.
+
+            if tag == 'tr' and \
+                   self._current_application.council_reference is not None and \
+                   self._current_application.date_received == self._requested_date:
+
+                # The module-level import does not bring this helper in.
+                from PlanningUtils import getPostcodeFromText
+
+                info_page_parser = SouthOxfordshireInfoURLParser()
+                info_page_parser.feed(self._fetch(self._current_application.info_url))
+
+                address = info_page_parser.address
+                self._current_application.address = address
+                self._current_application.description = info_page_parser.description
+
+                # Keep the 'No postcode' default unless a postcode was really found.
+                if address is not None:
+                    postcode = getPostcodeFromText(address)
+                    if postcode is not None:
+                        self._current_application.postcode = postcode
+
+                # Add the current application to the results set
+                self._results.addApplication(self._current_application)
+
+            # At the end of the 5th <td>, self._data should contain
+            # the received date of the application.
+            if tag == 'td' and self._td_count == 5:
+                # strptime rejects surrounding whitespace, so trim the cell text first.
+                app_year, app_month, app_day = tuple(time.strptime(self._data.strip(), "%d %B %Y")[:3])
+                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
+
+                self._data = ''
+                self._td_count = 0
+
+    def handle_data(self, data):
+        # There is no need to do anything if we aren't in table 3.
+        if self._table_count == 3:
+            # If we are in the first <td>, and the get_reference flag is set,
+            # then the next data is the reference.
+            if self._td_count == 1 and self._get_reference:
+                self._current_application.council_reference = data
+
+                # The comment url can now be made, as it depends only on the reference.
+                # On this site, the link to the comment page is only displayed once
+                # the planning authority has decided who is handling this application
+                # and has opened consultations. The link below works straight away,
+                # and also works for apps for which the consultation period is over.
+                # I have no idea if anything is actually done with these comments if
+                # it is followed too early...
+                # NOTE(review): comment_url is not defined in this module.
+                self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}
+
+                # Set the get_reference flag back to False.
+                self._get_reference = False
+
+            # If we are in the 5th <td>, then we need to collect all the data together
+            # before we can use it. This is actually processed in handle_endtag.
+            if self._td_count == 5:
+                self._data += data
+
+    def handle_entityref( self, ref ):
+        # We might have some entity_refs to clear up.
+        # there is no need to bother with this if we aren't in the results table.
+        if self._table_count == 3 and self._td_count == 5:
+            if ref == 'nbsp':
+                self._data += ' '
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """This will return a PlanningAuthorityResults object containing the
+        applications for the date passed in (day, month and year are ints)."""
+
+        today = datetime.date.today()
+        # The handlers compare against _requested_date, so store it under that name.
+        self._requested_date = datetime.date(year, month, day)
+        delta = today - self._requested_date
+
+        # The page number is the count of whole weeks since the
+        # requested date, plus one.
+        page_number = delta.days // 7 + 1
+
+        # NOTE(review): search_url is not defined in this module.
+        self.feed(self._fetch(search_url %page_number))
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        """Return the results for the given date as an XML string.
+
+        day, month and year may be strings (as they arrive from a CGI form);
+        int() raises TypeError or ValueError if one is missing or malformed.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
+    """Pull the address and the description out of the info page
+    for a South Oxfordshire application.
+
+    Each value is taken from the <td> that follows the <td> whose text is
+    'Location' or 'Description'; the results are left in self.address and
+    self.description.
+    """
+
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Progress per field:
+        # 0 - label not seen yet
+        # 1 - label seen, the next <td> holds the value
+        # 2 - value captured
+        self._description_state = 0
+        self._address_state = 0
+
+        self.description = None
+        self.address = None
+
+        # True between a <td> start tag and its end tag.
+        self._in_td = False
+
+        # Text gathered for the current <td>; it can arrive in several bits.
+        self._data = ''
+
+    def _wants_more(self):
+        # True until both the address and the description have been captured.
+        return self._address_state < 2 or self._description_state < 2
+
+    def handle_starttag(self, tag, attrs):
+        # A new <td> starts a fresh text buffer, as long as something is still missing.
+        if tag != 'td' or not self._wants_more():
+            return
+        self._in_td = True
+        self._data = ''
+
+    def handle_endtag(self, tag):
+        if tag != 'td' or not self._wants_more():
+            return
+
+        label = self._data.strip()
+        if self._description_state == 1:
+            # This cell follows the 'Description' label, so it is the value.
+            self.description = self._data
+            self._description_state = 2
+        elif self._address_state == 1:
+            # This cell follows the 'Location' label, so it is the value.
+            self.address = self._data
+            self._address_state = 2
+        elif label == 'Description':
+            self._description_state = 1
+        elif label == 'Location':
+            self._address_state = 1
+
+        # Note that we are leaving the <td>
+        self._in_td = False
+
+    def handle_data(self, data):
+        # Only text inside a <td> matters, and only while a field is outstanding.
+        if self._in_td and self._wants_more():
+            self._data += data
+
+
+# TODO
+
+# find out what time of day this is run - does it matter that
+# we aren't being careful with daylight saving time etc.
+
+# Can we check that scraped email address really is
+# an email address?
+
+if __name__ == "__main__":
+    # NOTE(review): this entry point names SouthOxfordshireParser, which this
+    # module does not define; it looks carried over from
+    # SouthOxfordshireParser.py. Both missing names are imported here so the
+    # block can run - confirm that this is the intended parser.
+    import cgi
+    from SouthOxfordshireParser import SouthOxfordshireParser
+
+    form = cgi.FieldStorage()
+    day = form.getfirst('day')
+    month = form.getfirst('month')
+    year = form.getfirst('year')
+
+    parser = SouthOxfordshireParser()
+
+    # The XML to print has to come from the parser; it was never assigned before.
+    xml = parser.getResults(day, month, year)
+
+    print "Content-Type: text/xml"     # XML is following
+    print
+    print xml                          # print the xml
diff --git a/trunk/CGI/SouthOxfordshireParser.py b/trunk/CGI/SouthOxfordshireParser.py
new file mode 100644
index 0000000..0097ee5
--- /dev/null
+++ b/trunk/CGI/SouthOxfordshireParser.py
@@ -0,0 +1,248 @@
+
+import urllib, urllib2
+
+import HTMLParser
+import urlparse
+import datetime, time
+
+# Application list page; %d is filled with the page number computed in
+# SouthOxfordshireParser.getResultsByDayMonthYear.
+search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"
+
+# Comment form; %(reference)s is filled with the application's council reference.
+comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"
+
+# Passed to PlanningAuthorityResults when the parser is constructed.
+authority_name = "South Oxfordshire District Council"
+authority_short_name = "South Oxfordshire"
+
+
+from PlanningUtils import fixNewlines, \
+ getPostcodeFromText, \
+ PlanningAuthorityResults, \
+ PlanningApplication
+
+class SouthOxfordshireParser(HTMLParser.HTMLParser):
+    """Scrape the South Oxfordshire application list for a single day.
+
+    The requested date (a datetime.date, kept in self._requested_date) is
+    used so that we can avoid doing downloads of the info pages for the
+    other days in this week's file.
+    """
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Set by getResultsByDayMonthYear before any HTML is fed in.
+        self._requested_date = None
+
+        # We'll keep a count of the number of tables we have seen.
+        # All the interesting stuff is in table 3
+        self._table_count = 0
+
+        # While inside table 3, we'll keep a count of the number of
+        # <td>s we have seen. What is in which numbered <td> is detailed below.
+        # 1 reference
+        # 3 place and description
+        # 5 date received
+        # 2 and 4 are just padding
+        self._td_count = 0
+
+        # This is just a flag to say that we are now ready to get the reference
+        # from the next bit of data
+        self._get_reference = False
+
+        # Text collected so far for the date cell (the 5th <td>).
+        self._data = ''
+
+        # this will hold the application we are currently working on.
+        self._current_application = None
+
+        # The object which stores our set of planning application results
+        self._results = PlanningAuthorityResults(authority_name, authority_short_name)
+
+    def _fetch(self, url):
+        # Download url and return its body, always closing the connection.
+        response = urllib2.urlopen(url)
+        try:
+            return response.read()
+        finally:
+            response.close()
+
+    def handle_starttag(self, tag, attrs):
+        # if we see a table tag, increment the table count.
+        if tag == 'table':
+            self._table_count += 1
+
+        # we are only interested in other tags if we are in table 3.
+        if self._table_count == 3:
+
+            # If we are starting a <tr>, create a new PlanningApplication object
+            # for the application currently being processed
+            if tag == 'tr':
+                self._current_application = PlanningApplication()
+
+            # if we see a td, increment the <td> count.
+            if tag == 'td':
+                self._td_count += 1
+
+            # if we are in the first <td>, and we see a link,
+            # then it is to the info page for this application.
+            if tag == 'a' and self._td_count == 1:
+                for key, value in attrs:
+                    if key == 'href':
+                        self._current_application.info_url = urlparse.urljoin(search_url, value)
+
+                        # We now know that the next bit of data is the reference
+                        self._get_reference = True
+
+                        # href is the only attribute we are interested in.
+                        break
+
+    def handle_endtag(self, tag):
+        # There is no need to do anything unless we are in table 3.
+        if self._table_count == 3:
+
+            # The end </tr> indicates that the current application is finished.
+            # Now we can fetch the info_page to get the address, postcode,
+            # and description.
+            # If we don't have a reference, then we are in the header row,
+            # which we don't want.
+            # There is no point in doing this if the date is not the requested one.
+
+            if tag == 'tr' and \
+                   self._current_application.council_reference is not None and \
+                   self._current_application.date_received == self._requested_date:
+
+                info_page_parser = SouthOxfordshireInfoURLParser()
+                info_page_parser.feed(self._fetch(self._current_application.info_url))
+
+                address = info_page_parser.address
+                self._current_application.address = address
+                self._current_application.description = info_page_parser.description
+
+                # Keep the 'No postcode' default unless a postcode was really found.
+                if address is not None:
+                    postcode = getPostcodeFromText(address)
+                    if postcode is not None:
+                        self._current_application.postcode = postcode
+
+                # Add the current application to the results set
+                self._results.addApplication(self._current_application)
+
+            # At the end of the 5th <td>, self._data should contain
+            # the received date of the application.
+            if tag == 'td' and self._td_count == 5:
+                # strptime rejects surrounding whitespace, so trim the cell text first.
+                app_year, app_month, app_day = tuple(time.strptime(self._data.strip(), "%d %B %Y")[:3])
+                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
+                self._data = ''
+                self._td_count = 0
+
+    def handle_data(self, data):
+        # There is no need to do anything if we aren't in table 3.
+        if self._table_count == 3:
+            # If we are in the first <td>, and the get_reference flag is set,
+            # then the next data is the reference.
+            if self._td_count == 1 and self._get_reference:
+                self._current_application.council_reference = data
+
+                # The comment url can now be made, as it depends only on the reference.
+                # On this site, the link to the comment page is only displayed once
+                # the planning authority has decided who is handling this application
+                # and has opened consultations. The link below works straight away,
+                # and also works for apps for which the consultation period is over.
+                # I have no idea if anything is actually done with these comments if
+                # it is followed too early...
+                self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}
+
+                # Set the get_reference flag back to False.
+                self._get_reference = False
+
+            # If we are in the 5th <td>, then we need to collect all the data together
+            # before we can use it. This is actually processed in handle_endtag.
+            if self._td_count == 5:
+                self._data += data
+
+    def handle_entityref( self, ref ):
+        # We might have some entity_refs to clear up.
+        # there is no need to bother with this if we aren't in the results table.
+        if self._table_count == 3 and self._td_count == 5:
+            if ref == 'nbsp':
+                self._data += ' '
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """This will return a PlanningAuthorityResults object containing the
+        applications for the date passed in (day, month and year are ints)."""
+
+        today = datetime.date.today()
+        self._requested_date = datetime.date(year, month, day)
+        delta = today - self._requested_date
+
+        # The page number is the count of whole weeks since the
+        # requested date, plus one.
+        page_number = delta.days // 7 + 1
+
+        self.feed(self._fetch(search_url %page_number))
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        """Return the results for the given date as an XML string.
+
+        day, month and year may be strings (as they arrive from a CGI form);
+        int() raises TypeError or ValueError if one is missing or malformed.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
+    """Pull the address and the description out of the info page
+    for a South Oxfordshire application.
+
+    Each value is taken from the <td> that follows the <td> whose text is
+    'Location' or 'Description'; the results are left in self.address and
+    self.description.
+    """
+
+    def __init__(self):
+        HTMLParser.HTMLParser.__init__(self)
+
+        # Progress per field:
+        # 0 - label not seen yet
+        # 1 - label seen, the next <td> holds the value
+        # 2 - value captured
+        self._description_state = 0
+        self._address_state = 0
+
+        self.description = None
+        self.address = None
+
+        # True between a <td> start tag and its end tag.
+        self._in_td = False
+
+        # Text gathered for the current <td>; it can arrive in several bits.
+        self._data = ''
+
+    def _wants_more(self):
+        # True until both the address and the description have been captured.
+        return self._address_state < 2 or self._description_state < 2
+
+    def handle_starttag(self, tag, attrs):
+        # A new <td> starts a fresh text buffer, as long as something is still missing.
+        if tag != 'td' or not self._wants_more():
+            return
+        self._in_td = True
+        self._data = ''
+
+    def handle_endtag(self, tag):
+        if tag != 'td' or not self._wants_more():
+            return
+
+        label = self._data.strip()
+        if self._description_state == 1:
+            # This cell follows the 'Description' label, so it is the value.
+            self.description = self._data
+            self._description_state = 2
+        elif self._address_state == 1:
+            # This cell follows the 'Location' label, so it is the value.
+            self.address = self._data
+            self._address_state = 2
+        elif label == 'Description':
+            self._description_state = 1
+        elif label == 'Location':
+            self._address_state = 1
+
+        # Note that we are leaving the <td>
+        self._in_td = False
+
+    def handle_data(self, data):
+        # Only text inside a <td> matters, and only while a field is outstanding.
+        if self._in_td and self._wants_more():
+            self._data += data
+
+
+# TODO
+
+# find out what time of day this is run - does it matter that
+# we aren't being careful with daylight saving time etc.
+
+# Can we check that scraped email address really is
+# an email address?
diff --git a/trunk/CGI/generateCGIScripts.py b/trunk/CGI/generateCGIScripts.py
new file mode 100755
index 0000000..41b2ab1
--- /dev/null
+++ b/trunk/CGI/generateCGIScripts.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+
+list_of_sites_filename = "PublicAccessSites.csv"
+template_filename = "CGITemplate"
+python_location = "/usr/bin/python"
+
+import csv
+from os import chmod
+
+# Read the template once; the handle is closed as soon as its text is in memory.
+template_file = open(template_filename)
+try:
+    template = "#!" + python_location + "\n\n" + template_file.read()
+finally:
+    template_file.close()
+
+# Each CSV row describes one site and produces one executable <short name>.cgi script.
+list_of_sites_file = open(list_of_sites_filename)
+try:
+    csv_reader = csv.DictReader(list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True)
+    for site_dict in csv_reader:
+        filename = "%s.cgi" %site_dict["authority_short_name"]
+        contents = template %site_dict
+
+        this_file = open(filename, "w")
+        print "Writing %s" %filename
+        try:
+            this_file.write(contents)
+        finally:
+            this_file.close()
+
+        chmod(filename, 0755)
+finally:
+    list_of_sites_file.close()
+
+# need to look at:
+# "Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/"