From 03084bd2dbbe13d6ac628890df5c805d52791200 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 12 Jun 2008 17:15:14 +0000
Subject: [PATCH] Add scraper for Shetland Islands. Encode all output as UTF-8.

---
 python_scrapers/CGITemplate          |   2 +-
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/PlanningUtils.py     |  33 +++----
 python_scrapers/Shetland.py          | 124 ++++++++++++++++-----------
 python_scrapers/SitesToGenerate.csv  |   1 +
 5 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/python_scrapers/CGITemplate b/python_scrapers/CGITemplate
index 0950f07..03ea055 100644
--- a/python_scrapers/CGITemplate
+++ b/python_scrapers/CGITemplate
@@ -28,4 +28,4 @@ xml = parser.getResults(day, month, year)
 print "Content-Type: text/xml" # XML is following
 print
-print xml # print the xml
+print xml.encode("utf-8") # print the xml
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index f35d274..e0e0eca 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -23,3 +23,4 @@
 "Ocella.py", "420"
 "IsleOfWight.py", "420"
 "Barnsley.py", "420"
+"Shetland.py", "420"
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index 273a652..f97affd 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -65,12 +65,13 @@ class PlanningAuthorityResults:
 
         applications_bit = "".join([x.displayXML() for x in self.planning_applications])
 
-        return "<planning>\n" +\
-               "<authority_name>%s</authority_name>\n" %self.authority_name +\
-               "<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
-               "<applications>\n" + applications_bit +\
-               "</applications>\n" +\
-               "</planning>\n"
+        return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
+               u"<planning>\n" +\
+               u"<authority_name>%s</authority_name>\n" %self.authority_name +\
+               u"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
+               u"<applications>\n" + applications_bit +\
+               u"</applications>\n" +\
+               u"</planning>\n"
 
 
 
@@ -104,12 +105,14 @@ class PlanningApplication:
 
     def displayXML(self):
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
-        return "<application>\n" +\
-               "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
-               "<address>%s</address>\n" %xmlQuote(self.address) +\
-               "<postcode>%s</postcode>\n" %self.postcode +\
-               "<description>%s</description>\n" %xmlQuote(self.description) +\
-               "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
-               "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
-               "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
-               "</application>\n"
+        return u"<application>\n" +\
+               u"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
+               u"<address>%s</address>\n" %xmlQuote(self.address) +\
+               u"<postcode>%s</postcode>\n" %self.postcode +\
+               u"<description>%s</description>\n" %xmlQuote(self.description) +\
+               u"<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
+               u"<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
+               u"<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
\n" + + diff --git a/python_scrapers/Shetland.py b/python_scrapers/Shetland.py index c42877e..45a7ae4 100644 --- a/python_scrapers/Shetland.py +++ b/python_scrapers/Shetland.py @@ -1,9 +1,13 @@ +""" +The Shetland Isles site shows applications from the last 14 days. +These are paginated into groups of ten. +""" + import urllib2 -import urllib import urlparse import datetime, time -import cgi +import re from BeautifulSoup import BeautifulSoup @@ -13,88 +17,110 @@ from PlanningUtils import PlanningApplication, \ date_format = "%d/%m/%Y" +page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found") + class ShetlandParser: def __init__(self, *args): self.authority_name = "Shetland Islands Council" self.authority_short_name = "Shetland Islands" - self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0" + self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d" self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) - def getResultsByDayMonthYear(self): - # Note that we don't take the day, month and year parameters here. + def getResultsByDayMonthYear(self, day, month, year): + search_date = datetime.datetime(year, month, day) + + offset = 0 # First get the search page - request = urllib2.Request(self.base_url) - response = urllib2.urlopen(request) + response = urllib2.urlopen(self.base_url %(offset)) + + contents = response.read() - soup = BeautifulSoup(response.read()) + # First let's find out how many records there are (they are displayed ten per page). + match = page_count_regex.search(contents) + app_count = int(match.groups()[0]) - # The apps are in the 5th table on the page (not a very good way to get it...) - results_table = soup.findAll("table")[5] + while offset < app_count: + if offset != 0: + contents = urllib2.urlopen(self.base_url %(offset)).read() - # Now we need to find the trs which contain the apps. - # The first TR is just headers. - # After that they alternate between containing an app and just some display graphics - # until the third from last. After that, they contain more rubbish. + soup = BeautifulSoup(contents) + + # The apps are in the 5th table on the page (not a very good way to get it...) + results_table = soup.findAll("table")[5] - trs = results_table.findAll("tr")[1:-2] + # Now we need to find the trs which contain the apps. + # The first TR is just headers. + # After that they alternate between containing an app and just some display graphics + # until the third from last. After that, they contain more rubbish. - for i in range(len(trs)): - # We are only interested in the trs in even positions in the list. - if i % 2 == 0: - tr = trs[i] + trs = results_table.findAll("tr")[1:-2] - application = PlanningApplication() + for i in range(len(trs)): + # We are only interested in the trs in even positions in the list. 
+                if i % 2 == 0:
+                    tr = trs[i]
 
-                application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
+                    application = PlanningApplication()
 
-                application.council_reference = tr.a.string
+                    comment_url_element = tr.find(text="comment on this planning application").parent
+                    application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
 
-                comment_url_element = tr.find(text="comment on this planning application").parent
-                application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
+                    # If the date of this application is earlier than the date
+                    # we are searching for then don't download it.
+                    # We could optimize this a bit more by not doing the later pages.
 
-                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+                    if application.date_received < search_date:
+                        break
 
-                info_response = urllib2.urlopen(application.info_url)
+                    application.council_reference = tr.a.string
 
-                info_soup = BeautifulSoup(info_response.read())
-
-                info_table = info_soup.findAll("table")[2]
+                    application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
 
-                application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
-                application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+                    application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
 
-                # Now to get the address. This will be split across several tds.
+                    info_response = urllib2.urlopen(application.info_url)
 
-                address_start_td = info_table.find("td", rowspan="4")
+                    info_soup = BeautifulSoup(info_response.read())
 
-                # We need the first bit of the address from this tr
-                address_bits = [address_start_td.findNext("td").string.strip()]
+                    info_table = info_soup.findAll("table")[2]
 
-                # We will need the first td from the next three trs after this
-                for address_tr in address_start_td.findAllNext("tr")[:3]:
-                    address_line = address_tr.td.string.strip()
-
-                    if address_line:
-                        address_bits.append(address_line)
-
-                address_bits.append(application.postcode)
+                    application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
+                    application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+
+                    # Now to get the address. This will be split across several tds.
+
+                    address_start_td = info_table.find("td", rowspan="4")
 
-                application.address = ', '.join(address_bits)
+                    # We need the first bit of the address from this tr
+                    address_bits = [address_start_td.findNext("td").string.strip()]
 
-                self._results.addApplication(application)
-
+                    # We will need the first td from the next three trs after this
+                    for address_tr in address_start_td.findAllNext("tr")[:3]:
+                        address_line = address_tr.td.string.strip()
+
+                        if address_line:
+                            address_bits.append(address_line)
+
+                    address_bits.append(application.postcode)
+
+                    application.address = ', '.join(address_bits)
+
+                    self._results.addApplication(application)
+
+            offset += 10
 
         return self._results
 
     def getResults(self, day, month, year):
-        return self.getResultsByDayMonthYear().displayXML()
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
 
 if __name__ == '__main__':
     parser = ShetlandParser()
-    print parser.getResults(21,5,2008)
-# TODO: Sort out pagination
+    # Note: to test this, you will need to pick a current date.
+    print parser.getResults(9,6,2008)
+
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index d6cd45d..af74a3d 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -221,3 +221,4 @@
 "Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
 "Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser"
 "Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
+"Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser"
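
Editor's note: the core of this patch is the offset-based pagination added to Shetland.py. The sketch below is not part of the patch; it restates that pattern standalone, under the assumption that base_url contains a %d offset placeholder as Shetland.py's self.base_url does. page_count_regex is copied from the diff above; fetch_all_pages is a hypothetical helper name.

    # Hypothetical illustration only -- not part of the patch.
    # Mirrors the pagination in Shetland.py: the first page reveals the
    # total record count, then further pages are fetched at offsets 10, 20, ...
    import re
    import urllib2

    page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")

    def fetch_all_pages(base_url):
        """base_url must contain a %d placeholder for the offset."""
        offset = 0
        contents = urllib2.urlopen(base_url % offset).read()
        # The first page tells us how many records there are in total.
        app_count = int(page_count_regex.search(contents).groups()[0])
        pages = [contents]
        offset += 10
        while offset < app_count:
            pages.append(urllib2.urlopen(base_url % offset).read())
            offset += 10
        return pages

Note that in the patch itself, the early exit when an application is older than the search date only breaks out of the inner for loop; as the comment in the diff concedes, later pages are still fetched.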
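The CGITemplate and PlanningUtils changes belong together: PlanningUtils now builds the XML as a unicode object (the u"" literals), so the CGI script must encode it before printing. A minimal sketch of why, assuming a non-ASCII character in the output (the example string is invented, not from the patch):

    # Hypothetical illustration only -- not part of the patch.
    # In Python 2, print-ing a unicode string falls back to ASCII when
    # stdout has no declared encoding (as under a web server), so any
    # non-ASCII character raises UnicodeEncodeError.
    xml = u"<description>caf\xe9 extension</description>"
    print xml.encode("utf-8")  # safe: bytes are emitted explicitly as UTF-8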