Bläddra i källkod

add the south oxfordshire scraper, and code to generate

some publicaccess scrapers
import/raw
duncan.parkes 18 år sedan
förälder
incheckning
9a4eaf6976
8 ändrade filer med 835 tillägg och 0 borttagningar
  1. +29
    -0
      trunk/python_scrapers/CGITemplate
  2. +101
    -0
      trunk/python_scrapers/PlanningUtils.py
  3. +341
    -0
      trunk/python_scrapers/PublicAccess.py
  4. +29
    -0
      trunk/python_scrapers/PublicAccessSites.csv
  5. +20
    -0
      trunk/python_scrapers/SouthOxfordshire.cgi
  6. +248
    -0
      trunk/python_scrapers/SouthOxfordshireParser.py
  7. +9
    -0
      trunk/python_scrapers/createCGI.sh
  8. +58
    -0
      trunk/python_scrapers/generateCGIScripts.py

+ 29
- 0
trunk/python_scrapers/CGITemplate Visa fil

@@ -0,0 +1,29 @@
# CGI entry point for the %(authority_name)s scraper.
# Generated from the file CGITemplate; the authority details below are
# filled in by the generator, so change the template rather than this file.

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


# The date to search for arrives as three separate query parameters.
request = cgi.FieldStorage()
day = request.getfirst('day')
month = request.getfirst('month')
year = request.getfirst('year')


# Details of the authority this script was generated for.
authority_name = "%(authority_name)s"
authority_short_name = "%(authority_short_name)s"
base_url = "%(base_url)s"

import PublicAccess

parser = PublicAccess.PublicAccessParser(
    authority_name,
    authority_short_name,
    base_url)

xml = parser.getResults(day, month, year)

# A CGI response is the headers, a blank line, then the body.
print("Content-Type: text/xml")
print("")
print(xml)

+ 101
- 0
trunk/python_scrapers/PlanningUtils.py Visa fil

@@ -0,0 +1,101 @@
# NOTE(review): nothing in this module reads __auth__ and its purpose is not
# evident from here - TODO confirm whether it is still needed.
__auth__ = None

import re

# strftime format (day/month/year) used for <date_received> in the XML
# produced by PlanningApplication.displayXML().
date_format = "%d/%m/%Y"


def xmlQuote(text):
    """Escape text so that it can be placed inside an XML element.

    '&', '<' and '>' are replaced by the entities '&amp;', '&lt;' and
    '&gt;'.  Text which is already escaped will be escaped again, so only
    pass raw text.

    text -- the string to escape.

    Returns the escaped string.
    """
    # The standard library already does this job; imported locally so the
    # function does not depend on the module's import block.
    from xml.sax.saxutils import escape

    return escape(text)

def fixNewlines(text):
    """Return text with every Windows line ending (CR LF) turned into LF.

    A carriage return which is not followed by a line feed is left alone.
    """
    unix_lines = text.split("\r\n")
    return "\n".join(unix_lines)

# So what can a postcode look like then?
# This list of formats comes from
# http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
# (A is a letter, N is a digit):
#
#   format       example
#   AN NAA       M1 1AA
#   ANN NAA      M60 1NW
#   AAN NAA      CR2 6XH
#   AANN NAA     DN55 1PT
#   ANA NAA      W1A 1HP
#   AANA NAA     EC1A 1BB
#
# The space between the two halves is optional and only upper case letters
# are recognised.  A raw string is used so that \d reaches the regex engine
# untouched.

postcode_regex = re.compile(r"[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")


def getPostcodeFromText(text):
    """Return the first part of text which looks like a postcode.

    text -- the string to search.

    Returns the matching substring, or None if nothing in text looks
    like a postcode.
    """
    postcode_match = postcode_regex.search(text)

    if postcode_match is None:
        return None

    return postcode_match.group()

class PlanningAuthorityResults:
    """The set of planning applications found by one search of an authority.

    Applications are collected with addApplication() and the whole set is
    turned into planningalerts XML by displayXML().
    """

    def __init__(self, authority_name, authority_short_name):
        """Create an empty set of results.

        authority_name -- full name of the planning authority.
        authority_short_name -- short name of the planning authority.
        """
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name

        # this will be a list of PlanningApplication objects
        self.planning_applications = []

    def addApplication(self, application):
        """Add an application to the results.

        application -- an object with a displayXML() method returning the
        XML for that application.
        """
        self.planning_applications.append(application)

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Return the contents of this object in the planningalerts format.
        i.e. in the same format as this one:
        http://www.planningalerts.com/lambeth.xml

        The authority names are escaped, so a name containing '&', '<' or
        '>' still gives well formed XML.  The applications are expected to
        escape their own contents.
        """
        # Imported locally so that this class does not depend on the
        # module's import block.
        from xml.sax.saxutils import escape

        applications_bit = "".join([x.displayXML() for x in self.planning_applications])

        return "".join([
            "<planning>\n",
            "<authority_name>%s</authority_name>\n" % escape(self.authority_name),
            "<authority_short_name>%s</authority_short_name>\n" % escape(self.authority_short_name),
            "<applications>\n",
            applications_bit,
            "</applications>\n",
            "</planning>\n",
        ])



class PlanningApplication:
    """One planning application, as scraped from an authority's site.

    A scraper creates an empty application, fills in the attributes as it
    finds them, and displayXML() then gives the <application> element.
    """

    def __init__(self, no_postcode_default='No postcode'):
        """Create an application with nothing filled in yet.

        no_postcode_default -- the text to use as the postcode until a
        real one is set.
        """
        # expecting this as a datetime.date object
        self.date_received = None

        self.postcode = no_postcode_default

        # The rest are strings, set by the scraper.
        self.council_reference = None
        self.address = None
        self.description = None
        self.info_url = None
        self.comment_url = None

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Return this application as an <application> XML element.

        Every field except the postcode and the date is passed through
        xmlQuote; the date is formatted with date_format.
        """
        pieces = (
            "<application>\n",
            "<council_reference>%s</council_reference>\n" % xmlQuote(self.council_reference),
            "<address>%s</address>\n" % xmlQuote(self.address),
            "<postcode>%s</postcode>\n" % self.postcode,
            "<description>%s</description>\n" % xmlQuote(self.description),
            "<info_url>%s</info_url>\n" % xmlQuote(self.info_url),
            "<comment_url>%s</comment_url>\n" % xmlQuote(self.comment_url),
            "<date_received>%s</date_received>\n" % self.date_received.strftime(date_format),
            "</application>\n",
        )
        return "".join(pieces)

+ 341
- 0
trunk/python_scrapers/PublicAccess.py Visa fil

@@ -0,0 +1,341 @@
#!/usr/bin/python

import urllib, urllib2
import HTMLParser
import urlparse
import datetime, time

import cookielib

# One cookie jar for the whole module: getResultsByDayMonthYear stores the
# cookies from the search form response in it and adds them to the search
# requests which follow.
cookie_jar = cookielib.CookieJar()


from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication


# Paths of the PublicAccess pages used below.  Each one is joined onto the
# base_url of the authority being scraped.
search_form_url_end = "tdc/DcApplication/application_searchform.aspx"
search_results_url_end = "tdc/DcApplication/application_searchresults.aspx"
comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx"

class PublicAccessParser(HTMLParser.HTMLParser):
    """This is the class which parses the PublicAccess search results page.

    The results are in the table whose class is "cResultsForm".  The first
    row of that table is a header; each later row is one application, with
    these cells:

        1 the council reference
        2 the date received (dd/mm/yyyy)
        3 the address
        4 the description
        5 the status (not used)
        6 a link to the info page for the application

    getResults() runs the search for one day and returns what it found as
    planningalerts XML.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Set up a parser for one authority.

        authority_name -- full name of the authority, used in the XML.
        authority_short_name -- short name of the authority, used in the XML.
        base_url -- the URL which the PublicAccess page paths are joined onto.
        debug -- if true, print the search data and the downloaded page.
        """
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # this will change to True when we enter the table of results
        self._in_results_table = False

        # this will be set to True when we have passed the header row
        # in the results table
        self._past_header_row = False

        # this will be true when we are in a <td> in the results table
        self._in_td = False

        # For each row, this will say how many tds we have seen so far
        self._td_count = 0

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            self.handle_start_table(attrs)
        # we are only interested in tr tags if we are in the results table
        elif self._in_results_table and tag == "tr":
            self.handle_start_tr(attrs)
        # we are only interested in td tags if we are in the results table
        elif self._in_results_table and tag == "td":
            self.handle_start_td(attrs)
        # we are only interested in <a> tags if we are in the 6th td in
        # the results table.
        elif self._in_td and self._td_count == 6 and tag == "a":
            self.handle_start_a(attrs)
        # If the tag is not one of these then we aren't interested

    def handle_endtag(self, tag):
        # we only need to consider end tags if we are in the results table
        if not self._in_results_table:
            return

        if tag == "table":
            self.handle_end_table()
        elif tag == "tr":
            self.handle_end_tr()
        elif tag == "td":
            self.handle_end_td()

    def handle_start_table(self, attrs):
        # Only the first class attribute is looked at.
        for attr, value in attrs:
            if attr == "class":
                if value == "cResultsForm":
                    self._in_results_table = True
                break

    def handle_end_table(self):
        # If we see an end table tag, then note that we have left the
        # results table. This method is only called if we are in that table.
        self._in_results_table = False

    def handle_start_tr(self, attrs):
        # The first tr we meet in the results table is just headers
        # We will set a flag at the end of that tr to avoid creating
        # a blank PlanningApplication
        if self._past_header_row:
            # Create a candidate result object
            self._current_application = PlanningApplication()
            self._td_count = 0

    def handle_end_tr(self):
        # If we are in the results table, and not finishing the header row
        # append the current result to the results list.
        if self._past_header_row:
            self._results.addApplication(self._current_application)
        else:
            # The first row of the results table is headers
            # We want to do nothing until after it
            self._past_header_row = True

    def handle_start_td(self, attrs):
        # increase the td count by one
        self._td_count += 1
        # note that we are now in a td
        self._in_td = True

    def handle_end_td(self):
        # note that we are now not in a td
        self._in_td = False

    def handle_start_a(self, attrs):
        """Fill in the URLs and postcode of the current application from
        the link in the 6th td of its row."""
        # Before the first proper row there is no application to fill in.
        if self._current_application is None:
            return

        # go through the attributes of the <a> looking for one
        # named 'href'
        for attr, value in attrs:
            if attr == "href":
                # The href is a relative url.  Joined to the base URL it
                # gives the info URL of the current planning application.
                self._current_application.info_url = urlparse.urljoin(self.base_url, value)

                # The query string (item 4 of the parsed url) identifies
                # the application, and is reused for the comments URL.
                query_string = urlparse.urlparse(value)[4]
                self._current_application.comment_url = self._make_comment_url(query_string)

                # while we're here, let's follow some links to find the postcode...
                postcode = self._fetch_postcode(self._current_application.info_url)
                if postcode is not None:
                    self._current_application.postcode = postcode

                # There is no need for us to look at any more attributes.
                break

    def _make_comment_url(self, query_string):
        """Return the URL of the comments page with query_string attached."""
        comments_url = urlparse.urljoin(self.base_url, comments_url_end)

        if not query_string:
            return comments_url

        # urljoin would treat a bare query string as a relative path and
        # put it in place of the page name, so attach it by hand.
        return comments_url + "?" + query_string

    def _fetch_postcode(self, info_url):
        """Return the postcode for the application at info_url, or None.

        The postcode is in an input tag in the property page, which is
        linked from the info page, so this downloads up to two pages.
        """
        # The newlines in the info page need fixing.
        info_file_contents = fixNewlines(urllib2.urlopen(info_url).read())
        info_file_parser = PublicAccessInfoPageParser()
        info_file_parser.feed(info_file_contents)

        # Without a link to the property page there is nothing to download.
        if info_file_parser.property_page_url is None:
            return None

        property_page_url = urlparse.urljoin(info_url, info_file_parser.property_page_url)

        # the newlines in this page need fixing
        property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())
        property_file_parser = PublicAccessPropertyPageParser()
        property_file_parser.feed(property_file_contents)

        return property_file_parser.postcode

    def handle_data(self, data):
        # Data is only of interest inside a td of a row which has an
        # application to put it in.
        if not self._in_td or self._current_application is None:
            return

        # The first td contains the reference
        if self._td_count == 1:
            self._current_application.council_reference = data
        # The second td contains the date the application was received
        elif self._td_count == 2:
            year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
            self._current_application.date_received = datetime.date(year, month, day)
        # The third td contains the address
        elif self._td_count == 3:
            self._current_application.address = data
        # The fourth td contains the description
        elif self._td_count == 4:
            self._current_application.description = data
        # 5 is status - we don't need it.
        # 6 is a button - this is where we will get our postcode,
        # comment_url, and info_url from (when handling the <a> tag).

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for the applications received on one day.

        day, month, year -- the date to search for, as integers.

        Returns the PlanningAuthorityResults object for this parser, with
        the applications found added to it.
        """
        date_string = "%02d/%02d/%d" % (day, month, year)

        # First download the search form (in order to get a session cookie)
        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)

        # We are only doing this first search in order to get a cookie
        # The paging on the site doesn't work with cookies turned off.
        search_data1 = urllib.urlencode({"searchType": "ADV",
                                         "caseNo": "",
                                         "PPReference": "",
                                         "AltReference": "",
                                         "srchtype": "",
                                         "srchstatus": "",
                                         "srchdecision": "",
                                         "srchapstatus": "",
                                         "srchappealdecision": "",
                                         "srchwardcode": "",
                                         "srchparishcode": "",
                                         "srchagentdetails": "",
                                         "srchDateReceivedStart": date_string,
                                         "srchDateReceivedEnd": date_string})

        if self.debug:
            print(search_data1)

        search_url = urlparse.urljoin(self.base_url, search_results_url_end)
        request1 = urllib2.Request(search_url, search_data1)
        cookie_jar.add_cookie_header(request1)

        # The response to this first search is not used.
        urllib2.urlopen(request1)

        # This search is the one we will actually use.
        # a maximum of 100 results are returned on this site,
        # hence setting "pagesize" to 100. I doubt there will ever
        # be more than 100 in one day in PublicAccess...
        # "currentpage" = 1 gets us to the first page of results
        # (there will only be one anyway, as we are asking for 100 results...)

        #http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3

        search_description = "Applications received between %s and %s" % (date_string, date_string)
        search_data2 = urllib.urlencode((("szSearchDescription", search_description),
                                         ("searchType", "ADV"),
                                         ("bccaseno", ""),
                                         ("currentpage", "1"),
                                         ("pagesize", "100"),
                                         ("module", "P3")))

        if self.debug:
            print(search_data2)

        # This time we want to do a get request, so add the search data into the url
        request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)

        request2 = urllib2.Request(request2_url)

        # add the cookie we stored from our first search
        cookie_jar.add_cookie_header(request2)

        response2 = urllib2.urlopen(request2)

        contents = fixNewlines(response2.read())

        if self.debug:
            print(contents)

        self.feed(contents)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications received on the given day as XML.

        day, month, year -- the date to search for; anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()




class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
    """Finds the URL of the property details page in an application's
    info page.  That page is where the postcode of the application is.

    After feeding the info page, the URL is in self.property_page_url
    (still None if the link was not found).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.property_page_url = None

    def handle_starttag(self, tag, attrs):
        """Pick the property page URL out of the right <a> tag.

        The tag we want is the <a> whose 'id' attribute has the value
        'A_btnPropertyDetails'.  Its 'href' attribute holds the URL with
        some garbage around it: whatever comes before the first whitespace,
        and everything from the next "'" onwards.  Both are stripped off
        before the URL is stored.

        Once the URL has been stored, later <a> tags are ignored.
        """
        # Not an <a>, or the URL has been found already.
        if tag != "a" or self.property_page_url is not None:
            return

        # Not the property details link.
        if ("id", "A_btnPropertyDetails") not in attrs:
            return

        for name, raw_href in attrs:
            if name != "href":
                continue

            # Drop the garbage on the left, then the garbage on the right.
            without_left_garbage = raw_href.split()[1]
            self.property_page_url = without_left_garbage.split("'")[0]




class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
    """Finds the postcode in a property details page.

    After feeding the page, the postcode is in self.postcode (still None
    if it was not found).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.postcode = None

    def handle_starttag(self, tag, attrs):
        """Pick the postcode out of the right <input> tag.

        The tag we want is the <input> whose 'name' attribute has the value
        'postcode'.  The value of its 'value' attribute is the postcode of
        the application, and is stored as self.postcode.

        Once a postcode has been stored, later <input> tags are ignored.
        """
        # Not an <input>, or the postcode has been found already.
        if tag != "input" or self.postcode is not None:
            return

        # Not the postcode field.
        if ("name", "postcode") not in attrs:
            return

        for name, content in attrs:
            if name == "value":
                self.postcode = content


+ 29
- 0
trunk/python_scrapers/PublicAccessSites.csv Visa fil

@@ -0,0 +1,29 @@
"authority_name", "authority_short_name", "base_url"
"City of York Council", "York", "http://planning.york.gov.uk/PublicAccess/"
"Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/"
"Angus Council", "Angus", "http://planning.angus.gov.uk/PublicAccess/"
"Huntingdonshire District Council", "Huntingdonshire", "http://planning.huntsdc.gov.uk/publicaccess/"
"South Staffordshire Council", "South Staffs", "https://services.sstaffs.gov.uk/PublicAccess/"
"Bexley Council", "Bexley", "http://publicaccess.bexley.gov.uk/publicaccess/"
"Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/"
"Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/publicaccess/"
"Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/"
"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/"
"Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/"
"Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/"
"Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/"
"Southampton City Council", "Southampton", "http://publicaccess.southampton.gov.uk/publicaccess/"
"Bath and North East Somerset", "Bath", "http://planning.bathnes.gov.uk/publicaccess/"
"Buckinghamshire County Council", "Buckinghamshire", "http://www.bucksplanning.gov.uk/PublicAccess/"
"Spelthorne Borough Council", "Spelthorne", "http://phoenix.spelthorne.gov.uk/PublicAccess/"
"Stevenage Borough Council", "Stevenage", "http://publicaccess.stevenage.gov.uk/publicaccess/"
"Tonbridge and Malling Borough Council", "Tonbridge", "http://publicaccess.tmbc.gov.uk/publicaccess/"
"Hart District Council", "Hart", "http://publicaccess.hart.gov.uk/publicaccess/"
"Luton Borough Council", "Luton", "http://www.eplan.luton.gov.uk/PublicAccess/"
"Rushmoor Borough Council", "Rushmoor", "http://pa-dc.rushmoor.gov.uk/publicaccess/"
"Blaby District Council", "Blaby", "http://www.blaby.gov.uk/PublicAccess/"
"East Devon District Council", "East Devon", "http://planning.eastdevon.gov.uk/PublicAccess/"
"Mid Devon District Council", "Mid Devon", "http://planning.middevon.gov.uk/publicaccess/"
"Sevenoaks District Council", "Sevenoaks", "http://publicaccess.sevenoaks.gov.uk/publicaccess/"
"Woking Borough Council", "Woking", "http://caps.woking.gov.uk/publicaccess/"
"Basildon District Council", "Basildon", "http://planning.basildon.gov.uk/publicaccess/"

+ 20
- 0
trunk/python_scrapers/SouthOxfordshire.cgi Visa fil

@@ -0,0 +1,20 @@
#!/usr/bin/python
# CGI entry point for the South Oxfordshire scraper: takes the day, month
# and year from the request and replies with that day's applications as XML.

import cgi
import cgitb

# display=0 keeps tracebacks out of the response; they go to logdir instead.
cgitb.enable(display=0, logdir="/tmp")


# The date to search for arrives as three separate query parameters.
request = cgi.FieldStorage()
day = request.getfirst('day')
month = request.getfirst('month')
year = request.getfirst('year')

from SouthOxfordshireParser import SouthOxfordshireParser

xml = SouthOxfordshireParser().getResults(day, month, year)

# A CGI response is the headers, a blank line, then the body.
print("Content-Type: text/xml")
print("")
print(xml)

+ 248
- 0
trunk/python_scrapers/SouthOxfordshireParser.py Visa fil

@@ -0,0 +1,248 @@

import urllib, urllib2

import HTMLParser
import urlparse
import datetime, time

# URL of a page of the application list.  The %d must be filled in with the
# number of the page wanted (see getResultsByDayMonthYear).
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# URL of the comment form for an application.  %(reference)s must be filled
# in with the council reference of the application.
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

# The names given to the PlanningAuthorityResults for this authority.
authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"


from PlanningUtils import fixNewlines, \
getPostcodeFromText, \
PlanningAuthorityResults, \
PlanningApplication

class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Parser for a page of South Oxfordshire's application list.

    A page lists the applications for more than one day, so we keep the
    requested date (a datetime.date) and only download the info page for
    the applications which were received on that date.

    getResults() downloads the right page for a date and returns the
    applications received on it as planningalerts XML.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Set by getResultsByDayMonthYear before the page is parsed.
        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # <td>s we have seen. What is in which numbered <td> is detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the reference
        # from the next bit of data
        self._get_reference = False

        # Collects the text of the 5th <td>, which can arrive in pieces.
        self._data = ''

        # this will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        # if we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # we are only interested in other tags if we are in table 3.
        if self._table_count != 3:
            return

        # If we are starting a <tr>, create a new PlanningApplication object
        # for the application currently being processed
        if tag == 'tr':
            self._current_application = PlanningApplication()

        # if we see a td, increment the <td> count.
        elif tag == 'td':
            self._td_count += 1

        # if we are in the first <td>, and we see a link,
        # then it is to the info page for this application.
        elif tag == 'a' and self._td_count == 1:
            for key, value in attrs:
                if key == 'href':
                    self._current_application.info_url = urlparse.urljoin(search_url, value)

                    # We now know that the next bit of data is the reference
                    self._get_reference = True

                    # href is the only attribute we are interested in.
                    break

    def handle_endtag(self, tag):
        # There is no need to do anything unless we are in table 3.
        if self._table_count != 3:
            return

        # The end <tr> indicates that the current application is finished.
        # Now we can fetch the info_page to get the address, postcode,
        # and description.
        # If we don't have a reference, then we are in the header row,
        # which we don't want.
        # There is no point in doing this if the date is not the requested one.
        if tag == 'tr' and \
               self._current_application is not None and \
               self._current_application.council_reference is not None and \
               self._current_application.date_received == self._requested_date:
            info_page_parser = SouthOxfordshireInfoURLParser()
            info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

            self._current_application.address = info_page_parser.address
            self._current_application.description = info_page_parser.description

            # Only replace the default postcode if we really found one:
            # there may be no address, or no postcode in the address.
            if info_page_parser.address is not None:
                postcode = getPostcodeFromText(info_page_parser.address)
                if postcode is not None:
                    self._current_application.postcode = postcode

            # Add the current application to the results set
            self._results.addApplication(self._current_application)

        # At the end of the 5th <td>, self._data should contain
        # the received date of the application.
        if tag == 'td' and self._td_count == 5:
            app_year, app_month, app_day = time.strptime(self._data, "%d %B %Y")[:3]
            self._current_application.date_received = datetime.date(app_year, app_month, app_day)

            # Get ready for the next row.
            self._data = ''
            self._td_count = 0

    def handle_data(self, data):
        # There is no need to do anything if we aren't in table 3.
        if self._table_count != 3:
            return

        # If we are in the first <td>, and the get_reference flag is set,
        # then the next data is the reference.
        if self._td_count == 1 and self._get_reference:
            self._current_application.council_reference = data

            # The comment url can now be made, as it depends only on the reference.
            # On this site, the link to the comment page is only displayed once
            # the planning authority has decided who is handling this application
            # and has opened consultations. The link below works straight away,
            # and also works for apps for which the consultation period is over.
            # I have no idea if anything is actually done with these comments if
            # it is followed too early...
            self._current_application.comment_url = comment_url % {'reference': self._current_application.council_reference}

            # Set the get_reference flag back to False.
            self._get_reference = False

        # If we are in the 5th <td>, then we need to collect all the data together
        # before we can use it. This is actually processed in handle_endtag.
        if self._td_count == 5:
            self._data += data

    def handle_entityref(self, ref):
        # We might have some entity_refs to clear up.
        # Only &nbsp; in the date cell matters: it stands for a space.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '

    def getResultsByDayMonthYear(self, day, month, year):
        """Find the applications received on one day.

        day, month, year -- the date wanted, as integers.

        Returns the PlanningAuthorityResults object for this parser, with
        the applications received on that date added to it.
        """
        today = datetime.date.today()
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # to get the correct page, we need
        # page ((whole weeks since the requested date) + 1)
        # NOTE(review): presumably each page covers one week, with page 1
        # the most recent - confirm against the site.
        page_number = delta.days // 7 + 1

        response = urllib2.urlopen(search_url % page_number)

        contents = response.read()

        self.feed(contents)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications received on the given day as XML.

        day, month, year -- the date wanted; anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """Gets the address and the description out of the info page for a
    South Oxfordshire application.

    Each one is the text of the <td> which follows a <td> containing
    'Location' (for the address) or 'Description'.  After feeding the
    page they are in self.address and self.description (None if they
    were not found).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # Progress on each of the two things we want:
        # 0 - its heading has not been seen yet
        # 1 - its heading has been seen, so the next <td> holds it
        # 2 - we have got it
        self._address_state = 0
        self._description_state = 0

        # Whether or not we are inside a <td> at the moment.
        self._in_td = False

        # The text of the current <td>, which can arrive in several bits.
        self._data = ''

    def _still_looking(self):
        # True until both the address and the description have been found.
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        # Only a <td> matters, and only while there is something to find.
        if tag != 'td' or not self._still_looking():
            return

        # Start collecting the text of this <td> from scratch.
        self._in_td = True
        self._data = ''

    def handle_endtag(self, tag):
        # Only a </td> matters, and only while there is something to find.
        if tag != 'td' or not self._still_looking():
            return

        if self._description_state == 1:
            # This <td> followed the 'Description' heading.
            self.description = self._data
            self._description_state = 2
        elif self._address_state == 1:
            # This <td> followed the 'Location' heading.
            self.address = self._data
            self._address_state = 2
        else:
            # Otherwise this <td> may be one of the headings.
            heading = self._data.strip()
            if heading == 'Description':
                self._description_state = 1
            elif heading == 'Location':
                self._address_state = 1

        # We have now left the <td>.
        self._in_td = False

    def handle_data(self, data):
        # Text is only collected inside a <td> while there is something to find.
        if self._in_td and self._still_looking():
            self._data += data


# TODO

# find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.

# Can we check that scraped email address really is
# an email address?

+ 9
- 0
trunk/python_scrapers/createCGI.sh Visa fil

@@ -0,0 +1,9 @@
#!/bin/bash
# Rebuilds the CGI directory: removes what svn has in it, generates the
# scripts again, then adds the new contents to svn.

echo "Removing contents of CGI directory"
svn rm --force ../CGI/*

echo "Running generateCGIScripts"
python generateCGIScripts.py

svn add ../CGI/*

+ 58
- 0
trunk/python_scrapers/generateCGIScripts.py Visa fil

@@ -0,0 +1,58 @@
#!/usr/bin/python

list_of_sites_filename = "PublicAccessSites.csv"
template_filename = "CGITemplate"
python_location = "/usr/bin/python"

cgi_dir = "../CGI/"

# this should be a config file
other_files = ["PublicAccess.py", "PlanningUtils.py", "SouthOxfordshireParser.py", "SouthOxfordshire.cgi"]

import csv
from os import chmod
from shutil import copyfile

list_of_sites_file = open(list_of_sites_filename)
csv_reader = csv.DictReader(list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True)

# svn rm the cgi directory

# create the cgi directory


# create cgi files and write them in the cgi directory
template_contents = open(template_filename).read()

template = "#!" + python_location +"\n\n" + template_contents

for site_dict in csv_reader:
filename = cgi_dir + "%s.cgi" %site_dict["authority_short_name"]
contents = template %site_dict

this_file = open(filename, "w")
print "Writing %s" %filename
this_file.write(contents)
this_file.close()

chmod(filename, 0755)

# copy across other files that are needed
# these should probably come from a config file
for filename in other_files:
copyfile(filename, cgi_dir+filename)

# write a README to warn people not to svn add stuff to CGI directory
readme_message = """
WARNING - this directory is only for generated files
and files which are automatically copied in.
Anything manually added here will be svn deleted.

"""
readme_file = open(cgi_dir+ "README", "w")
readme_file.write(readme_message)
readme_file.close()

# svn add the cgi directory and its contents

Laddar…
Avbryt
Spara