diff --git a/trunk/CGI/Oswestry.cgi b/trunk/CGI/Oswestry.cgi
new file mode 100755
index 0000000..dc1629b
--- /dev/null
+++ b/trunk/CGI/Oswestry.cgi
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+
+# This is the parser for Oswestry Borough Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Oswestry Borough Council"
+authority_short_name = "Oswestry"
+base_url = "http://193.114.205.78/PublicAccess/tdc/"
+
+import PublicAccess
+
+parser = PublicAccess.PublicAccessParser(authority_name,
+ authority_short_name,
+ base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml" # XML is following
+print
+print xml # print the xml
diff --git a/trunk/CGI/PlanningUtils.py b/trunk/CGI/PlanningUtils.py
deleted file mode 100644
index 3430576..0000000
--- a/trunk/CGI/PlanningUtils.py
+++ /dev/null
@@ -1,101 +0,0 @@
-__auth__ = None
-
-import re
-
-date_format = "%d/%m/%Y"
-
-
-def xmlQuote(text):
- # Change &s to &amp;s
- # I suspect there is probably some standard python
- # function I should be using for this...
- return text.replace('&', '&amp;')
-
-def fixNewlines(text):
- # This can be used to sort out windows newlines
- return text.replace("\r\n","\n")
-
-# So what can a postcode look like then?
-# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
-#AN NAA M1 1AA
-#ANN NAA M60 1NW
-#AAN NAA CR2 6XH
-#AANN NAA DN55 1PT
-#ANA NAA W1A 1HP
-#AANA NAA EC1A 1BB
-
-postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")
-
-def getPostcodeFromText(text):
- """This function takes a piece of text and returns the first
- bit of it that looks like a postcode."""
-
- postcode_match = postcode_regex.search(text)
-
- if postcode_match is not None:
- return postcode_match.group()
-
-
-class PlanningAuthorityResults:
- """This class represents a set of results of a planning search.
-
- This should probably be separated out so that it can be used for
- authorities other than Cherwell.
- """
-
- def __init__(self, authority_name, authority_short_name):
- self.authority_name = authority_name
- self.authority_short_name = authority_short_name
-
- # this will be a list of PlanningApplication objects
- self.planning_applications = []
-
-
- def addApplication(self, application):
- self.planning_applications.append(application)
-
- def __repr__(self):
- return self.displayXML()
-
- def displayXML(self):
- """This should display the contents of this object in the planningalerts format.
- i.e. in the same format as this one:
- http://www.planningalerts.com/lambeth.xml
- """
-
- applications_bit = "".join([x.displayXML() for x in self.planning_applications])
-
- return "<planning>\n" +\
- "<authority_name>%s</authority_name>\n" %self.authority_name +\
- "<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
- "<applications>\n" + applications_bit +\
- "</applications>\n" +\
- "</planning>\n"
-
-
-
-class PlanningApplication:
- def __init__(self, no_postcode_default='No postcode'):
- self.council_reference = None
- self.address = None
- self.postcode = no_postcode_default
- self.description = None
- self.info_url = None
- self.comment_url = None
-
- # expecting this as a datetime.date object
- self.date_received = None
-
- def __repr__(self):
- return self.displayXML()
-
- def displayXML(self):
- return "<application>\n" +\
- "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
- "<address>%s</address>\n" %xmlQuote(self.address) +\
- "<postcode>%s</postcode>\n" %self.postcode +\
- "<description>%s</description>\n" %xmlQuote(self.description) +\
- "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
- "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
- "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
- "</application>\n"
diff --git a/trunk/CGI/PublicAccess.py b/trunk/CGI/PublicAccess.py
deleted file mode 100644
index b7873ac..0000000
--- a/trunk/CGI/PublicAccess.py
+++ /dev/null
@@ -1,358 +0,0 @@
-#!/usr/bin/python
-
-import urllib, urllib2
-import HTMLParser
-import urlparse
-import datetime, time
-
-import cookielib
-
-cookie_jar = cookielib.CookieJar()
-
-
-from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication
-
-
-search_form_url_end = "DcApplication/application_searchform.aspx"
-search_results_url_end = "DcApplication/application_searchresults.aspx"
-comments_url_end = "DcApplication/application_comments_entryform.aspx"
-
-class PublicAccessParser(HTMLParser.HTMLParser):
- """This is the class which parses the PublicAccess search results page.
- """
-
- def __init__(self,
- authority_name,
- authority_short_name,
- base_url,
- debug=False):
-
- HTMLParser.HTMLParser.__init__(self)
-
- self.authority_name = authority_name
- self.authority_short_name = authority_short_name
- self.base_url = base_url
-
- self.debug = debug
-
- # this will change to True when we enter the table of results
- self._in_results_table = False
-
- # this will be set to True when we have passed the header row
- # in the results table
- self._past_header_row = False
-
- # this will be true when we are in a <td>; the content of each <td>
- # is detailed below.
- # 1 reference
- # 3 place and description
- # 5 date received
- # 2 and 4 are just padding
- self._td_count = 0
-
- # This is just a flag to say that we are now ready to get the reference
- # from the next bit of data
- self._get_reference = False
-
- self._data = ''
-
- # this will hold the application we are currently working on.
- self._current_application = None
-
- # The object which stores our set of planning application results
- self._results = PlanningAuthorityResults(authority_name, authority_short_name)
-
- def handle_starttag(self, tag, attrs):
- # if we see a table tag, increment the table count.
- if tag == 'table':
- self._table_count += 1
-
- # we are only interested in other tags if we are in table 3.
- if self._table_count == 3:
-
- # If we are starting a <tr>, create a new PlanningApplication object
- # for the application currently being processed
- if tag == 'tr':
- self._current_application = PlanningApplication()
-
- # if we see a td, increment the <td> count.
- if tag == 'td':
- self._td_count += 1
-
- # if we are in the first <td>, and we see a link,
- # then it is to the info page for this application.
- if tag == 'a' and self._td_count == 1:
- for key, value in attrs:
- if key == 'href':
- url_end = value
- self._current_application.info_url = urlparse.urljoin(search_url,url_end)
-
- # We now know that the next bit of data is the reference
- self._get_reference = True
-
- # href is the only attribute we are interested in.
- break
-
- def handle_endtag(self, tag):
- # There is no need to do anything unless we are in table 3.
- if self._table_count == 3:
-
- # The end </tr> indicates that the current application is finished.
- # Now we can fetch the info_page to get the address, postcode,
- # and description.
- # If we don't have a reference, then we are in the header row,
- # which we don't want.
- # There is no point in doing this if the date is not the requested one.
-
- if tag == 'tr' and \
- self._current_application.council_reference is not None and \
- self._current_application.date_received == self._requested_date:
-
- info_page_parser = SouthOxfordshireInfoURLParser()
- info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())
-
- self._current_application.address = info_page_parser.address
- self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
- self._current_application.description = info_page_parser.description
-
- # Add the current application to the results set
- self._results.addApplication(self._current_application)
-
- # At the end of the 5th <td>, self._data should contain
- # the received date of the application.
- if tag == 'td' and self._td_count == 5:
- app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
- self._current_application.date_received = datetime.date(app_year, app_month, app_day)
- self._data = ''
- self._td_count = 0
-
- def handle_data(self, data):
- # There is no need to do anything if we aren't in table 3.
- if self._table_count == 3:
- # If we are in the first <td>, and the get_reference flag is set,
- # then the next data is the reference.
- if self._td_count == 1 and self._get_reference:
- self._current_application.council_reference = data
-
- # The comment url can now be made, as it depends only on the reference.
- # On this site, the link to the comment page is only displayed once
- # the planning authority has decided who is handling this application
- # and has opened consultations. The link below works straight away,
- # and also works for apps for which the consultation period is over.
- # I have no idea if anything is actually done with these comments if
- # it is followed too early...
- self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}
-
- # Set the get_reference flag back to False.
- self._get_reference = False
-
- # If we are in the 5th <td>, then we need to collect all the data together
- # before we can use it. This is actually processed in handle_endtag.
- if self._td_count == 5:
- self._data += data
-
- def handle_entityref( self, ref ):
- # We might have some entity_refs to clear up.
- # there is no need to bother with this if we aren't in the results table.
- if self._table_count == 3 and self._td_count == 5:
- if ref == 'nbsp':
- self._data += ' '
-
-
- def getResultsByDayMonthYear(self, day, month, year):
- """This will return an ApplicationResults object containg the
- applications for the date passed in."""
-
- today = datetime.date.today()
- self._requested_date = datetime.date(year, month, day)
- delta = today - self._requested_date
-
- # to get the correct page, we need
- # page ((days mod 7) + 1)
- page_number = delta.days/7 + 1
-
- response = urllib2.urlopen(search_url %page_number)
-
- contents = response.read()
-
- self.feed(contents)
-
- return self._results
-
-
- def getResults(self, day, month, year):
- return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
-
-class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
- """This parser is to get the description and address out of the info page
- for a South Oxfordshire application."""
-
- def __init__(self):
- HTMLParser.HTMLParser.__init__(self)
-
- self.address = None
- self.description = None
-
- # These two states will be set to:
- # 0 - if we haven't yet got that bit
- # 1 - if we are currently working on it
- # 2 - if we have finished
- self._address_state = 0
- self._description_state = 0
-
- # We will need to know whether or not we are in a <td>
- self._in_td = False
-
- # This is used for collecting together data which comes in several bits.
- self._data = ''
-
- def handle_starttag(self, tag, attrs):
- # If we see the start of a <td> and we are still interested in some data
- # then set the td flag to true, and blank the data
- if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
- self._in_td = True
- self._data = ''
-
- def handle_endtag(self, tag):
- if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
- # If we are working on the description,
- # set description from _data and note that we need to work on it no more.
- if self._description_state == 1:
- self.description = self._data
- self._description_state = 2
-
-
- # If we are working on the address,
- # set address from _data and note that we need to work on it no more.
- elif self._address_state == 1:
- self.address = self._data
- self._address_state = 2
-
- # If we see data which says 'Description',
- # then set the description state to working.
- elif self._data.strip() == 'Description':
- self._description_state = 1
-
- # If we see data which says 'Location',
- # then set the address state to working.
- elif self._data.strip() == 'Location':
- self._address_state = 1
-
- # Note that we are leaving the <td>
- self._in_td = False
-
- def handle_data(self, data):
- # if we are in a td, and we are still interested in the data for something,
- # append the current bit to self._data
- if self._in_td and (self._address_state < 2 or self._description_state < 2):
- self._data += data
-
-
-# TODO
-
-# find out what time of day this is run - does it matter that
-# we aren't being careful with daylight saving time etc.
-
-# Can we check that scraped email address really is
-# an email address?