add the south oxfordshire scraper, and code to generate
some publicaccess scrapers

duncan.parkes committed 17 years ago · branch: master · commit d2373f8793

8 changed files with 835 additions and 0 deletions
  1. python_scrapers/CGITemplate (+29, -0)
  2. python_scrapers/PlanningUtils.py (+101, -0)
  3. python_scrapers/PublicAccess.py (+341, -0)
  4. python_scrapers/PublicAccessSites.csv (+29, -0)
  5. python_scrapers/SouthOxfordshire.cgi (+20, -0)
  6. python_scrapers/SouthOxfordshireParser.py (+248, -0)
  7. python_scrapers/createCGI.sh (+9, -0)
  8. python_scrapers/generateCGIScripts.py (+58, -0)

python_scrapers/CGITemplate (+29, -0)

@@ -0,0 +1,29 @@
# This is the parser for %(authority_name)s.
# It is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "%(authority_name)s"
authority_short_name = "%(authority_short_name)s"
base_url = "%(base_url)s"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name,
                                         authority_short_name,
                                         base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml
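
# The authority_name, authority_short_name and base_url values above are
# filled in from a row of PublicAccessSites.csv by generateCGIScripts.py.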

python_scrapers/PlanningUtils.py (+101, -0)

@@ -0,0 +1,101 @@
__auth__ = None

import re

date_format = "%d/%m/%Y"


def xmlQuote(text):
    # Change &s to &amp;s
    # I suspect there is probably some standard python
    # function I should be using for this...
    return text.replace('&', '&amp;')

def fixNewlines(text):
    # This can be used to sort out windows newlines
    return text.replace("\r\n","\n")

# So what can a postcode look like then?
# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
#AN NAA M1 1AA
#ANN NAA M60 1NW
#AAN NAA CR2 6XH
#AANN NAA DN55 1PT
#ANA NAA W1A 1HP
#AANA NAA EC1A 1BB

postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

def getPostcodeFromText(text):
    """This function takes a piece of text and returns the first
    bit of it that looks like a postcode."""

    postcode_match = postcode_regex.search(text)

    if postcode_match is not None:
        return postcode_match.group()
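
# For example, we would expect:
#   getPostcodeFromText("12 High Street, York YO1 7LZ") -> "YO1 7LZ"
#   getPostcodeFromText("no postcode here") -> None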

class PlanningAuthorityResults:
    """This class represents a set of results of a planning search.

    This should probably be separated out so that it can be used for
    authorities other than Cherwell.
    """

    def __init__(self, authority_name, authority_short_name):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        # this will be a list of PlanningApplication objects
        self.planning_applications = []


    def addApplication(self, application):
        self.planning_applications.append(application)

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """This should display the contents of this object in the planningalerts format.
        i.e. in the same format as this one:
        http://www.planningalerts.com/lambeth.xml
        """

        applications_bit = "".join([x.displayXML() for x in self.planning_applications])

        return "<planning>\n" +\
               "<authority_name>%s</authority_name>\n" %self.authority_name +\
               "<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
               "<applications>\n" + applications_bit +\
               "</applications>\n" +\
               "</planning>\n"



class PlanningApplication:
    def __init__(self, no_postcode_default='No postcode'):
        self.council_reference = None
        self.address = None
        self.postcode = no_postcode_default
        self.description = None
        self.info_url = None
        self.comment_url = None

        # expecting this as a datetime.date object
        self.date_received = None

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        return "<application>\n" +\
               "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
               "<address>%s</address>\n" %xmlQuote(self.address) +\
               "<postcode>%s</postcode>\n" %self.postcode +\
               "<description>%s</description>\n" %xmlQuote(self.description) +\
               "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
               "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
               "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
               "</application>\n"

python_scrapers/PublicAccess.py (+341, -0)

@@ -0,0 +1,341 @@
#!/usr/bin/python

import urllib, urllib2
import HTMLParser
import urlparse
import datetime, time

import cookielib

cookie_jar = cookielib.CookieJar()


from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication


search_form_url_end = "tdc/DcApplication/application_searchform.aspx"
search_results_url_end = "tdc/DcApplication/application_searchresults.aspx"
comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx"

class PublicAccessParser(HTMLParser.HTMLParser):
    """This is the class which parses the PublicAccess search results page.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # this will change to True when we enter the table of results
        self._in_results_table = False

        # this will be set to True when we have passed the header row
        # in the results table
        self._past_header_row = False

        # this will be true when we are in a <td> in the results table
        self._in_td = False

        # For each row, this will say how many tds we have seen so far
        self._td_count = 0

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            self.handle_start_table(attrs)
        # we are only interested in tr tags if we are in the results table
        elif self._in_results_table and tag == "tr":
            self.handle_start_tr(attrs)
        # we are only interested in td tags if we are in the results table
        elif self._in_results_table and tag == "td":
            self.handle_start_td(attrs)
        # we are only interested in <a> tags if we are in the 6th td in
        # the results table.
        elif self._in_td and self._td_count == 6 and tag == "a":
            self.handle_start_a(attrs)
        # If the tag is not one of these then we aren't interested

    def handle_endtag(self, tag):
        # we only need to consider end tags if we are in the results table
        if self._in_results_table:
            if tag == "table":
                self.handle_end_table()
            if tag == "tr":
                self.handle_end_tr()
            if tag == "td":
                self.handle_end_td()

    def handle_start_table(self, attrs):
        for attr,value in attrs:
            if attr == "class":
                if value == "cResultsForm":
                    self._in_results_table = True
                break

    def handle_end_table(self):
        # If we see an end table tag, then note that we have left the
        # results table. This method is only called if we are in that table.
        self._in_results_table = False

    def handle_start_tr(self, attrs):
        # The first tr we meet in the results table is just headers
        # We will set a flag at the end of that tr to avoid creating
        # a blank PlanningApplication
        if self._past_header_row:
            # Create a candidate result object
            self._current_application = PlanningApplication()
            self._td_count = 0

    def handle_end_tr(self):
        # If we are in the results table, and not finishing the header row
        # append the current result to the results list.
        if self._past_header_row:
            self._results.addApplication(self._current_application)
        else:
            # The first row of the results table is headers
            # We want to do nothing until after it
            self._past_header_row = True

    def handle_start_td(self, attrs):
        # increase the td count by one
        self._td_count += 1
        # note that we are now in a td
        self._in_td = True

    def handle_end_td(self):
        # note that we are now not in a td
        self._in_td = False

    def handle_start_a(self, attrs):
        # this method is only getting called if we are in the
        # 6th td of a non-header row of the results table.

        # go through the attributes of the <a> looking for one
        # named 'href'
        for attr,value in attrs:
            if attr == "href":
                # the value of this tag is a relative url.
                # parse it so we can get the query string from it
                parsed_info_url = urlparse.urlparse(value)
                # element 4 of the parse tuple is the query string
                query_string = parsed_info_url[4]

                # join this relative url to the base URL, and store this as
                # the info URL of the current planning application
                self._current_application.info_url = urlparse.urljoin(self.base_url, value)

                # Join this query string to the comments URL, and store this as
                # the comments URL of the current planning application
                comments_url = urlparse.urljoin(self.base_url, comments_url_end)
                self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)

                # while we're here, let's follow some links to find the postcode...
                # the postcode is in an input tag in the property page. This page
                # can be found by following the info url.
                # The newlines in the info page need fixing.
                info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read())
                info_file_parser = PublicAccessInfoPageParser()
                info_file_parser.feed(info_file_contents)

                property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url)
                # the newlines in this page need fixing
                property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())
                property_file_parser = PublicAccessPropertyPageParser()
                property_file_parser.feed(property_file_contents)

                # Set the postcode on the current planning application from the
                # one found on the property page
                if property_file_parser.postcode is not None:
                    self._current_application.postcode = property_file_parser.postcode

                # There is no need for us to look at any more attributes.
                break

    def handle_data(self, data):
        if self._in_td:
            # The first td contains the reference
            if self._td_count == 1:
                self._current_application.council_reference = data
            # The second td contains the date the application was received
            elif self._td_count == 2:
                year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
                received_date = datetime.date(year, month, day)

                self._current_application.date_received = received_date
            # The third td contains the address
            elif self._td_count == 3:
                #data = data.replace("^M","\n")
                self._current_application.address = data
            # The fourth td contains the description
            elif self._td_count == 4:
                self._current_application.description = data
            # 5 is status - we don't need it.
            # 6 is a button - this is where we will get our postcode,
            # comment_url, and info_url from (when handling the <a> tag).


    def getResultsByDayMonthYear(self, day, month, year):
        # First download the search form (in order to get a session cookie)
        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)

        # We are only doing this first search in order to get a cookie
        # The paging on the site doesn't work with cookies turned off.

        search_data1 = urllib.urlencode({"searchType":"ADV",
                                         "caseNo":"",
                                         "PPReference":"",
                                         "AltReference":"",
                                         "srchtype":"",
                                         "srchstatus":"",
                                         "srchdecision":"",
                                         "srchapstatus":"",
                                         "srchappealdecision":"",
                                         "srchwardcode":"",
                                         "srchparishcode":"",
                                         "srchagentdetails":"",
                                         "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year},
                                         "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} })

        if self.debug:
            print search_data1


        search_url = urlparse.urljoin(self.base_url, search_results_url_end)
        request1 = urllib2.Request(search_url, search_data1)
        cookie_jar.add_cookie_header(request1)
        response1 = urllib2.urlopen(request1)

        # This search is the one we will actually use.
        # a maximum of 100 results are returned on this site,
        # hence setting "pagesize" to 100. I doubt there will ever
        # be more than 100 in one day in PublicAccess...
        # "currentpage" = 1 gets us to the first page of results
        # (there will only be one anyway, as we are asking for 100 results...)

        #http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3

        search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))

        if self.debug:
            print search_data2

        # This time we want to do a get request, so add the search data into the url
        request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)

        request2 = urllib2.Request(request2_url)

        # add the cookie we stored from our first search
        cookie_jar.add_cookie_header(request2)

        response2 = urllib2.urlopen(request2)

        contents = fixNewlines(response2.read())

        if self.debug:
            print contents

        self.feed(contents)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
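
# For example, using the City of York row from PublicAccessSites.csv,
# something like this should fetch the applications received on 22/02/2007:
#   parser = PublicAccessParser("City of York Council", "York",
#                               "http://planning.york.gov.uk/PublicAccess/")
#   print parser.getResults(22, 2, 2007)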




class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
    """A parser to get the URL for the property details page out of the
    info page (this url is needed in order to get the postcode of the
    application).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.property_page_url = None

    def handle_starttag(self, tag, attrs):
        """The URL of the property details page is contained in an <a> tag whose
        'id' attribute has the value 'A_btnPropertyDetails'. There is some garbage on
        either side of it which we will have to clear up before storing it...

        We go through the <a> tags looking for one with an attribute with
        key 'id' and value 'A_btnPropertyDetails'. When we find it we go through
        its attributes looking for one with key 'href' - the value of this attribute
        contains the URL we want, after a bit of tidying up.

        Once we have got the URL, there is no need for us to look at any more <a> tags.
        """
        if tag == "a" and self.property_page_url is None:
            if attrs.count(("id","A_btnPropertyDetails")) > 0:
                for attr,value in attrs:
                    if attr == "href":
                        the_link = value

                        # this has some garbage on either side of it...
                        # let's strip that off

                        # the garbage on the left is separated by whitespace.
                        # the garbage on the right is separated by a "'".

                        self.property_page_url = the_link.split()[1].split("'")[0]
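
# An illustration with a made-up href value: if the href were
#   "openPage( tdc/DcApplication/propertypage.aspx?caseno=XYZ123');"
# then .split()[1] leaves "tdc/DcApplication/propertypage.aspx?caseno=XYZ123');"
# and .split("'")[0] trims that down to the bare relative URL.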




class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
    """A parser to get the postcode out of the property details page."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.postcode = None

    def handle_starttag(self, tag, attrs):
        """The postcode is contained in an <input> tag.
        This tag has an attribute 'name' with value 'postcode'.
        It also has an attribute 'value' whose value is the postcode of this application.

        We go through the input tags looking for one with an attribute with
        key 'name' and value 'postcode'. When we find one,
        we look through its attributes for one with key 'value' - we store the value of this
        attribute as self.postcode.

        Once we have the postcode, there is no need to look at any more input tags.
        """
        if tag == "input" and self.postcode is None:
            if attrs.count(("name","postcode")) > 0:
                for attr,value in attrs:
                    if attr == "value":
                        self.postcode = value
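
# For instance (made-up markup), feeding this parser a page containing
#   <input name="postcode" value="YO1 7LZ">
# should leave parser.postcode set to "YO1 7LZ".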


python_scrapers/PublicAccessSites.csv (+29, -0)

@@ -0,0 +1,29 @@
"authority_name", "authority_short_name", "base_url"
"City of York Council", "York", "http://planning.york.gov.uk/PublicAccess/"
"Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/"
"Angus Council", "Angus", "http://planning.angus.gov.uk/PublicAccess/"
"Huntingdonshire District Council", "Huntingdonshire", "http://planning.huntsdc.gov.uk/publicaccess/"
"South Staffordshire Council", "South Staffs", "https://services.sstaffs.gov.uk/PublicAccess/"
"Bexley Council", "Bexley", "http://publicaccess.bexley.gov.uk/publicaccess/"
"Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/"
"Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/publicaccess/"
"Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/"
"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/"
"Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/"
"Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/"
"Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/"
"Southampton City Council", "Southampton", "http://publicaccess.southampton.gov.uk/publicaccess/"
"Bath and North East Somerset", "Bath", "http://planning.bathnes.gov.uk/publicaccess/"
"Buckinghamshire County Council", "Buckinghamshire", "http://www.bucksplanning.gov.uk/PublicAccess/"
"Spelthorne Borough Council", "Spelthorne", "http://phoenix.spelthorne.gov.uk/PublicAccess/"
"Stevenage Borough Council", "Stevenage", "http://publicaccess.stevenage.gov.uk/publicaccess/"
"Tonbridge and Malling Borough Council", "Tonbridge", "http://publicaccess.tmbc.gov.uk/publicaccess/"
"Hart District Council", "Hart", "http://publicaccess.hart.gov.uk/publicaccess/"
"Luton Borough Council", "Luton", "http://www.eplan.luton.gov.uk/PublicAccess/"
"Rushmoor Borough Council", "Rushmoor", "http://pa-dc.rushmoor.gov.uk/publicaccess/"
"Blaby District Council", "Blaby", "http://www.blaby.gov.uk/PublicAccess/"
"East Devon District Council", "East Devon", "http://planning.eastdevon.gov.uk/PublicAccess/"
"Mid Devon District Council", "Mid Devon", "http://planning.middevon.gov.uk/publicaccess/"
"Sevenoaks District Council", "Sevenoaks", "http://publicaccess.sevenoaks.gov.uk/publicaccess/"
"Woking Borough Council", "Woking", "http://caps.woking.gov.uk/publicaccess/"
"Basildon District Council", "Basildon", "http://planning.basildon.gov.uk/publicaccess/"

python_scrapers/SouthOxfordshire.cgi (+20, -0)

@@ -0,0 +1,20 @@
#!/usr/bin/python

import cgi
import cgitb; cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

from SouthOxfordshireParser import SouthOxfordshireParser

parser = SouthOxfordshireParser()

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

python_scrapers/SouthOxfordshireParser.py (+248, -0)

@@ -0,0 +1,248 @@

import urllib, urllib2

import HTMLParser
import urlparse
import datetime, time

# This needs a page number inserting
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# This needs the council reference
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"


from PlanningUtils import fixNewlines, \
     getPostcodeFromText, \
     PlanningAuthorityResults, \
     PlanningApplication

class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """In this case we'll take the date, so that we can avoid doing downloads for
    the other days in this week's file. This date should be a datetime.date object.
    """
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # <td>s we have seen. What is in which numbered <td> is detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the reference
        # from the next bit of data
        self._get_reference = False

        self._data = ''

        # this will hold the application we are currently working on.
        self._current_application = None
        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        # if we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # we are only interested in other tags if we are in table 3.
        if self._table_count == 3:
            # If we are starting a <tr>, create a new PlanningApplication object
            # for the application currently being processed
            if tag == 'tr':
                self._current_application = PlanningApplication()

            # if we see a td, increment the <td> count.
            if tag == 'td':
                self._td_count += 1

            # if we are in the first <td>, and we see a link,
            # then it is to the info page for this application.
            if tag == 'a' and self._td_count == 1:
                for key, value in attrs:
                    if key == 'href':
                        url_end = value
                        self._current_application.info_url = urlparse.urljoin(search_url,url_end)

                        # We now know that the next bit of data is the reference
                        self._get_reference = True
                        # href is the only attribute we are interested in.
                        break

    def handle_endtag(self, tag):
        # There is no need to do anything unless we are in table 3.
        if self._table_count == 3:

            # The end </tr> indicates that the current application is finished.
            # Now we can fetch the info_page to get the address, postcode,
            # and description.
            # If we don't have a reference, then we are in the header row,
            # which we don't want.
            # There is no point in doing this if the date is not the requested one.

            if tag == 'tr' and \
               self._current_application.council_reference is not None and \
               self._current_application.date_received == self._requested_date:
                info_page_parser = SouthOxfordshireInfoURLParser()
                info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

                self._current_application.address = info_page_parser.address
                self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
                self._current_application.description = info_page_parser.description

                # Add the current application to the results set
                self._results.addApplication(self._current_application)

            # At the end of the 5th <td>, self._data should contain
            # the received date of the application.
            if tag == 'td' and self._td_count == 5:
                app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
                self._data = ''
                self._td_count = 0

    def handle_data(self, data):
        # There is no need to do anything if we aren't in table 3.
        if self._table_count == 3:
            # If we are in the first <td>, and the get_reference flag is set,
            # then the next data is the reference.
            if self._td_count == 1 and self._get_reference:
                self._current_application.council_reference = data

                # The comment url can now be made, as it depends only on the reference.
                # On this site, the link to the comment page is only displayed once
                # the planning authority has decided who is handling this application
                # and has opened consultations. The link below works straight away,
                # and also works for apps for which the consultation period is over.
                # I have no idea if anything is actually done with these comments if
                # it is followed too early...
                self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}

                # Set the get_reference flag back to False.
                self._get_reference = False

            # If we are in the 5th <td>, then we need to collect all the data together
            # before we can use it. This is actually processed in handle_endtag.
            if self._td_count == 5:
                self._data += data

    def handle_entityref(self, ref):
        # We might have some entity_refs to clear up.
        # there is no need to bother with this if we aren't in the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '


    def getResultsByDayMonthYear(self, day, month, year):
        """This will return a PlanningAuthorityResults object containing the
        applications for the date passed in."""

        today = datetime.date.today()
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # to get the correct page, we need
        # page ((days div 7) + 1)
        page_number = delta.days/7 + 1
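        # For example, if today were 22/02/2007 and we asked for 10/02/2007,
        # delta.days would be 12, and 12/7 is 1 (integer division), giving page 2.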

        response = urllib2.urlopen(search_url %page_number)

        contents = response.read()

        self.feed(contents)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """This parser is to get the description and address out of the info page
    for a South Oxfordshire application."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # These two states will be set to:
        # 0 - if we haven't yet got that bit
        # 1 - if we are currently working on it
        # 2 - if we have finished
        self._address_state = 0
        self._description_state = 0

        # We will need to know whether or not we are in a <td>
        self._in_td = False

        # This is used for collecting together data which comes in several bits.
        self._data = ''

    def handle_starttag(self, tag, attrs):
        # If we see the start of a <td> and we are still interested in some data
        # then set the td flag to true, and blank the data
        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
            self._in_td = True
            self._data = ''

    def handle_endtag(self, tag):
        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
            # If we are working on the description,
            # set description from _data and note that we need to work on it no more.
            if self._description_state == 1:
                self.description = self._data
                self._description_state = 2

            # If we are working on the address,
            # set address from _data and note that we need to work on it no more.
            elif self._address_state == 1:
                self.address = self._data
                self._address_state = 2

            # If we see data which says 'Description',
            # then set the description state to working.
            elif self._data.strip() == 'Description':
                self._description_state = 1

            # If we see data which says 'Location',
            # then set the address state to working.
            elif self._data.strip() == 'Location':
                self._address_state = 1

            # Note that we are leaving the <td>
            self._in_td = False

    def handle_data(self, data):
        # if we are in a td, and we are still interested in the data for something,
        # append the current bit to self._data
        if self._in_td and (self._address_state < 2 or self._description_state < 2):
            self._data += data
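
# e.g. (made-up markup) feeding this parser something like
#   <td>Location</td><td>1 High Street, Wallingford OX10 0XX</td>
# should leave parser.address set to the text of the second cell.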


# TODO

# find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.

# Can we check that scraped email address really is
# an email address?

python_scrapers/createCGI.sh (+9, -0)

@@ -0,0 +1,9 @@
#!/bin/bash

echo Removing contents of CGI directory
svn rm --force ../CGI/*

echo Running generateCGIScripts
python generateCGIScripts.py

svn add ../CGI/*

python_scrapers/generateCGIScripts.py (+58, -0)

@@ -0,0 +1,58 @@
#!/usr/bin/python

list_of_sites_filename = "PublicAccessSites.csv"
template_filename = "CGITemplate"
python_location = "/usr/bin/python"

cgi_dir = "../CGI/"

# this should be a config file
other_files = ["PublicAccess.py", "PlanningUtils.py", "SouthOxfordshireParser.py", "SouthOxfordshire.cgi"]

import csv
from os import chmod
from shutil import copyfile

list_of_sites_file = open(list_of_sites_filename)
csv_reader = csv.DictReader(list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True)

# svn rm the cgi directory

# create the cgi directory


# create cgi files and write them in the cgi directory
template_contents = open(template_filename).read()

template = "#!" + python_location +"\n\n" + template_contents

for site_dict in csv_reader:
    filename = cgi_dir + "%s.cgi" %site_dict["authority_short_name"]
    contents = template %site_dict

    this_file = open(filename, "w")
    print "Writing %s" %filename
    this_file.write(contents)
    this_file.close()

    chmod(filename, 0755)
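# e.g. the first data row of PublicAccessSites.csv produces ../CGI/York.cgi,
# with authority_name, authority_short_name, and base_url filled in.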

# copy across other files that are needed
# these should probably come from a config file
for filename in other_files:
    copyfile(filename, cgi_dir+filename)

# write a README to warn people not to svn add stuff to CGI directory
readme_message = """
WARNING - this directory is only for generated files
and files which are automatically copied in.
Anything manually added here will be svn deleted.

"""
readme_file = open(cgi_dir+ "README", "w")
readme_file.write(readme_message)
readme_file.close()

# svn add the cgi directory and its contents
