From c5690efa2da3cf57740ca0421af8cedaad2954f9 Mon Sep 17 00:00:00 2001
From: "duncan.parkes" <duncan.parkes@cf68f80a-222c-0410-aabe-5f79d0504a29>
Date: Fri, 13 Jun 2008 14:38:12 +0000
Subject: [PATCH] Add scraper for Flintshire. I've included the OSGB x,y
 coordinates in the hope that we can start using these where we don't have
 postcodes.

---
 python_scrapers/Flintshire.py        | 93 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/PlanningUtils.py     | 30 ++++++---
 python_scrapers/SitesToGenerate.csv  |  1 +
 4 files changed, 115 insertions(+), 10 deletions(-)
 create mode 100644 python_scrapers/Flintshire.py

diff --git a/python_scrapers/Flintshire.py b/python_scrapers/Flintshire.py
new file mode 100644
index 0000000..845e2f6
--- /dev/null
+++ b/python_scrapers/Flintshire.py
@@ -0,0 +1,93 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class FlintshireParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Flintshire County Council"
+        self.authority_short_name = "Flintshire"
+
+        # I've removed some extra variables from this, it seems to be happy without them, and now doesn't need to paginate...
+        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
+        response = urllib2.urlopen(self.base_url %{"end_date": search_date.strftime(date_format),
+                                                   "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
+        soup = BeautifulSoup(response.read())
+
+        # Each app is stored in its own table
+        result_tables = soup.findAll("table", border="1")
+
+        # For the moment, we'll have to ignore the first result (see TODO list).
+        for table in result_tables[1:]:
+            application = PlanningApplication()
+
+            # It's not clear to me why this next one isn't the string of the next sibling. This works though!
+            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]
+
+            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+
+            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])
+
+            # Let's go to the info_page and get the OSGB and the date_received
+            info_request = urllib2.Request(application.info_url)
+
+            # We need to add the language header in order to get UK style dates
+            info_request.add_header("Accept-Language", "en-gb,en")
+            info_response = urllib2.urlopen(info_request)
+            info_soup = BeautifulSoup(info_response.read())
+            
+            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
+            x_element = grid_reference_td.font
+            
+            application.osgb_x = x_element.string.strip()
+            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()
+            
+            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
+
+            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))
+
+            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()
+
+
+            # There is a link to comment from the info page, though I can't click it.
+            application.comment_url = application.info_url
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = FlintshireParser()
+    print parser.getResults(22,5,2008)
+
+# TODO
+
+# 1) Email the council about broken first result.
+# The first result is always
+# slightly broken (two </td>s for one of the <td>s), which upsets
+# BeautifulSoup.
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 6059344..79e74c2 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -27,3 +27,4 @@
 "Kensington.py", "420"
 "Fife.py", "420"
 "ForestOfDean.py", "420"
+"Flintshire.py", "420"
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index f97affd..569d892 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -87,6 +87,11 @@ class PlanningApplication:
         # expecting this as a datetime.date object
 	self.date_received = None
 
+        # If we can get them, we may as well include OSGB.
+        # These will be the entirely numeric version.
+        self.osgb_x = None
+        self.osgb_y = None
+
     def __repr__(self):
 	return self.displayXML()
 
@@ -105,14 +110,19 @@ class PlanningApplication:
         
     def displayXML(self):
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
-	return  u"<application>\n" +\
-	u"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
-        u"<address>%s</address>\n" %xmlQuote(self.address) +\
-        u"<postcode>%s</postcode>\n" %self.postcode +\
-	u"<description>%s</description>\n" %xmlQuote(self.description) +\
-	u"<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
-	u"<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
-        u"<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
-        u"</application>\n"
 
-        
+	contents = [
+            u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
+            u"<address>%s</address>" %xmlQuote(self.address),
+            u"<postcode>%s</postcode>" %self.postcode,
+            u"<description>%s</description>" %xmlQuote(self.description),
+            u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
+            u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
+            u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
+            ]
+        if self.osgb_x:
+            contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
+        if self.osgb_y:
+            contents.append(u"<osgb_y>%s</osgb_y>" %(self.osgb_y))
+
+        return u"<application>\n%s\n</application>" %('\n'.join(contents))
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 0f75ef1..53364fb 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -227,3 +227,4 @@
 "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Fife Council", "Fife", "", "Fife", "FifeParser"
 "Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
+"Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"