diff --git a/trunk/python_scrapers/CGITemplate b/trunk/python_scrapers/CGITemplate
index 0950f07..03ea055 100644
--- a/trunk/python_scrapers/CGITemplate
+++ b/trunk/python_scrapers/CGITemplate
@@ -28,4 +28,4 @@ xml = parser.getResults(day, month, year)
print "Content-Type: text/xml" # XML is following
print
-print xml # print the xml
+print xml.encode("utf-8") # print the xml
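
The encode matters because getResults() now returns a unicode string: under CGI, sys.stdout has no declared encoding, so printing unicode falls back to ASCII and raises UnicodeEncodeError as soon as an application contains a non-ASCII character. A minimal sketch of the failure mode, using an invented address and assuming Python 2 as in the rest of these scrapers:

    xml = u"<address>Hjalta, Lerwick \u2013 ZE1 0AA</address>"   # invented value

    # "print xml" would try to encode the en dash as ASCII and fail;
    # encoding explicitly writes well-defined UTF-8 bytes instead.
    print xml.encode("utf-8")
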
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index f35d274..e0e0eca 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -23,3 +23,4 @@
"Ocella.py", "420"
"IsleOfWight.py", "420"
"Barnsley.py", "420"
+"Shetland.py", "420"
diff --git a/trunk/python_scrapers/PlanningUtils.py b/trunk/python_scrapers/PlanningUtils.py
index 273a652..f97affd 100644
--- a/trunk/python_scrapers/PlanningUtils.py
+++ b/trunk/python_scrapers/PlanningUtils.py
@@ -65,12 +65,13 @@ class PlanningAuthorityResults:
applications_bit = "".join([x.displayXML() for x in self.planning_applications])
- return "\n" +\
- "%s\n" %self.authority_name +\
- "%s\n" %self.authority_short_name +\
- "\n" + applications_bit +\
- "\n" +\
- "\n"
+ return u"""\n""" + \
+ u"\n" +\
+ u"%s\n" %self.authority_name +\
+ u"%s\n" %self.authority_short_name +\
+ u"\n" + applications_bit +\
+ u"\n" +\
+ u"\n"
@@ -104,12 +105,14 @@ class PlanningApplication:
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
- return "\n" +\
- "%s\n" %xmlQuote(self.council_reference) +\
- "%s\n" %xmlQuote(self.address) +\
- "%s\n" %self.postcode +\
- "%s\n" %xmlQuote(self.description) +\
- "%s\n" %xmlQuote(self.info_url) +\
- "%s\n" %xmlQuote(self.comment_url) +\
- "%s\n" %self.date_received.strftime(date_format) +\
- "\n"
+ return u"\n" +\
+ u"%s\n" %xmlQuote(self.council_reference) +\
+ u"%s\n" %xmlQuote(self.address) +\
+ u"%s\n" %self.postcode +\
+ u"%s\n" %xmlQuote(self.description) +\
+ u"%s\n" %xmlQuote(self.info_url) +\
+ u"%s\n" %xmlQuote(self.comment_url) +\
+ u"%s\n" %self.date_received.strftime(date_format) +\
+ u"\n"
+
+
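
Switching the XML fragments to unicode literals keeps the whole document as unicode until the single encode in CGITemplate. In Python 2, concatenating a plain byte string with unicode triggers an implicit ASCII decode of the byte string, which fails as soon as a byte-string fragment already holds encoded non-ASCII text. A rough illustration with an invented scraped value (not code from this repository):

    address = u"Skolka Skerry, Vidlin \u2013 Shetland"     # invented scraped value

    ok = u"<address>%s</address>\n" % address               # stays unicode throughout
    print ok.encode("utf-8")                                 # one explicit encode at the end

    fragment = "Lerwick caf\xc3\xa9, "                       # UTF-8 bytes in a plain str
    try:
        combined = fragment + address                        # implicit ASCII decode of `fragment`
    except UnicodeDecodeError:
        print "mixing encoded byte strings with unicode fails"
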
diff --git a/trunk/python_scrapers/Shetland.py b/trunk/python_scrapers/Shetland.py
index c42877e..45a7ae4 100644
--- a/trunk/python_scrapers/Shetland.py
+++ b/trunk/python_scrapers/Shetland.py
@@ -1,9 +1,13 @@
+"""
+The Shetland Islands site shows applications from the last 14 days.
+These are paginated into groups of ten.
+"""
+
import urllib2
-import urllib
import urlparse
import datetime, time
-import cgi
+import re
from BeautifulSoup import BeautifulSoup
@@ -13,88 +17,110 @@ from PlanningUtils import PlanningApplication, \
date_format = "%d/%m/%Y"
+page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")
+
class ShetlandParser:
def __init__(self, *args):
self.authority_name = "Shetland Islands Council"
self.authority_short_name = "Shetland Islands"
- self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"
+ self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
- def getResultsByDayMonthYear(self):
- # Note that we don't take the day, month and year parameters here.
+ def getResultsByDayMonthYear(self, day, month, year):
+ search_date = datetime.datetime(year, month, day)
+
+ offset = 0
# First get the search page
- request = urllib2.Request(self.base_url)
- response = urllib2.urlopen(request)
+ response = urllib2.urlopen(self.base_url %(offset))
+
+ contents = response.read()
- soup = BeautifulSoup(response.read())
+ # First let's find out how many records there are (they are displayed ten per page).
+ match = page_count_regex.search(contents)
+ app_count = int(match.groups()[0])
- # The apps are in the 5th table on the page (not a very good way to get it...)
- results_table = soup.findAll("table")[5]
+ while offset < app_count:
+ if offset != 0:
+ contents = urllib2.urlopen(self.base_url %(offset)).read()
- # Now we need to find the trs which contain the apps.
- # The first TR is just headers.
- # After that they alternate between containing an app and just some display graphics
- # until the third from last. After that, they contain more rubbish.
+ soup = BeautifulSoup(contents)
+
+ # The apps are in the 5th table on the page (not a very good way to get it...)
+ results_table = soup.findAll("table")[5]
- trs = results_table.findAll("tr")[1:-2]
+ # Now we need to find the trs which contain the apps.
+ # The first TR is just headers.
+ # After that they alternate between containing an app and just some display graphics
+ # until the third from last. After that, they contain more rubbish.
- for i in range(len(trs)):
- # We are only interested in the trs in even positions in the list.
- if i % 2 == 0:
- tr = trs[i]
+ trs = results_table.findAll("tr")[1:-2]
- application = PlanningApplication()
+ for i in range(len(trs)):
+ # We are only interested in the trs in even positions in the list.
+ if i % 2 == 0:
+ tr = trs[i]
- application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
+ application = PlanningApplication()
- application.council_reference = tr.a.string
+ comment_url_element = tr.find(text="comment on this planning application").parent
+ application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
- comment_url_element = tr.find(text="comment on this planning application").parent
- application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
+ # If the date of this application is earlier than the date
+ # we are searching for then don't download it.
+ # We could optimize this a bit more by not doing the later pages.
- application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+ if application.date_received < search_date:
+ break
- info_response = urllib2.urlopen(application.info_url)
+ application.council_reference = tr.a.string
- info_soup = BeautifulSoup(info_response.read())
-
- info_table = info_soup.findAll("table")[2]
+ application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
- application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
- application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+ application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
- # Now to get the address. This will be split across several tds.
+ info_response = urllib2.urlopen(application.info_url)
- address_start_td = info_table.find("td", rowspan="4")
+ info_soup = BeautifulSoup(info_response.read())
- # We need the first bit of the address from this tr
- address_bits = [address_start_td.findNext("td").string.strip()]
+ info_table = info_soup.findAll("table")[2]
- # We will need the first td from the next three trs after this
- for address_tr in address_start_td.findAllNext("tr")[:3]:
- address_line = address_tr.td.string.strip()
-
- if address_line:
- address_bits.append(address_line)
-
- address_bits.append(application.postcode)
+ application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
+ application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+
+ # Now to get the address. This will be split across several tds.
+
+ address_start_td = info_table.find("td", rowspan="4")
- application.address = ', '.join(address_bits)
+ # We need the first bit of the address from this tr
+ address_bits = [address_start_td.findNext("td").string.strip()]
- self._results.addApplication(application)
-
+ # We will need the first td from the next three trs after this
+ for address_tr in address_start_td.findAllNext("tr")[:3]:
+ address_line = address_tr.td.string.strip()
+
+ if address_line:
+ address_bits.append(address_line)
+
+ address_bits.append(application.postcode)
+
+ application.address = ', '.join(address_bits)
+
+ self._results.addApplication(application)
+
+ offset += 10
return self._results
def getResults(self, day, month, year):
- return self.getResultsByDayMonthYear().displayXML()
+ return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
parser = ShetlandParser()
- print parser.getResults(21,5,2008)
-# TODO: Sort out pagination
+ # Note: to test this, you will need to pick a current date.
+ print parser.getResults(9,6,2008)
+
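
The rewritten scraper reads the total hit count from the "Records 1 to 10 of N Records Found" banner on the first results page, then walks the offset query parameter in steps of ten, skipping rows received before the requested date. A stripped-down sketch of that pagination loop, using canned page text in place of live requests:

    import re

    page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")

    first_page = "Records 1 to 10 of 37 Records Found"   # stand-in for response.read()
    app_count = int(page_count_regex.search(first_page).groups()[0])

    offset = 0
    visited = []
    while offset < app_count:
        # A real run would fetch base_url % offset here and parse its ten rows.
        visited.append(offset)
        offset += 10

    print visited   # [0, 10, 20, 30]
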
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index d6cd45d..af74a3d 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -221,3 +221,4 @@
"Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
"Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser"
"Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
+"Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser"