From d92f3bb6fd13b8c3cab65e4246e1ed2697e8c117 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Mon, 8 Sep 2008 01:35:40 +0000
Subject: [PATCH] Add scraper for Herefordshire. Alter PlanningUtils to CDATA
 everything, scrapping the xmlQuote function.

---
 python_scrapers/Herefordshire.py     | 85 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/PlanningExplorer.py  |  6 +-
 python_scrapers/PlanningUtils.py     | 21 +++----
 python_scrapers/SitesToGenerate.csv  |  1 +
 5 files changed, 97 insertions(+), 17 deletions(-)
 create mode 100644 python_scrapers/Herefordshire.py

diff --git a/python_scrapers/Herefordshire.py b/python_scrapers/Herefordshire.py
new file mode 100644
index 0000000..19cfbfa
--- /dev/null
+++ b/python_scrapers/Herefordshire.py
@@ -0,0 +1,85 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class HerefordshireParser:
+    comments_email_address = "Developmentcontrol@barnsley.gov.uk" # NB: looks copied from the Barnsley scraper
+
+    def __init__(self, *args):
+
+        self.authority_name = "Herefordshire County Council"
+        self.authority_short_name = "Herefordshire"
+        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
+        # As we are going to the info page, we may as well pick up the comment url from there.
+#        self.comment_url = "http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s" # This needs the reference inserting
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        post_data = urllib.urlencode(
+            (("show", "0"),
+             ("Go", "GO"),
+             )
+            )
+
+        # Now get the search page (post_data is never actually sent; this is a plain GET)
+        response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
+
+        soup = BeautifulSoup(response.read())
+
+        if not soup.find(text=re.compile("Sorry, no matches found")):
+            # There were apps for this date
+
+            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]
+
+            for tr in trs:
+                application = PlanningApplication()
+                application.date_received = search_day
+
+                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+                application.council_reference = tr.a.string
+#                application.comment_url = self.comment_url %(application.council_reference)
+
+                tds = tr.findAll("td")
+
+                application.address = tds[1].string
+                application.postcode = getPostcodeFromText(application.address)
+
+                # This just gets us an initial segment of the description.
+                # We are going to have to download the info page...
+                #application.description = tds[2].string.strip()
+
+                info_response = urllib.urlopen(application.info_url)
+
+                info_soup = BeautifulSoup(info_response.read())
+
+                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
+                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])
+
+                self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = HerefordshireParser()
+    print parser.getResults(31,8,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index dafa174..1bf9904 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -55,3 +55,4 @@
 "Halton.py", "420"
 "Hampshire.py", "420"
 "Hastings.py", "420"
+"Herefordshire.py", "420"
diff --git a/python_scrapers/PlanningExplorer.py b/python_scrapers/PlanningExplorer.py
index 2da3961..5c3e2ec 100644
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -631,7 +631,7 @@ if __name__ == '__main__':
 
     # NOTE - 04/11/2007 is a sunday
     # I'm using it to test that the scrapers behave on days with no apps.
-    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
+#    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
 #    parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
 #    parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
 #    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
 #    parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
 #    parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
 #    parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
-#    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
 #    parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
 #    parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
 #    parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
 #    parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
 #    parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
 #    parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
-    print parser.getResults(3, 7, 2008)
+    print parser.getResults(4, 9, 2008)
 
 
 # To Do
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index 569d892..6d854e0 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -4,13 +4,6 @@ import re
 
 date_format = "%d/%m/%Y"
 
-
-def xmlQuote(text):
-    # Change &s to &amp;s
-    # I suspect there is probably some standard python
-    # function I should be using for this...
-    return text.replace('&', '&amp;')
-
 def fixNewlines(text):
     # This can be used to sort out windows newlines
     return text.replace("\r\n","\n")
@@ -112,13 +105,13 @@ class PlanningApplication:
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
 
         contents = [
-            u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
-            u"<address>%s</address>" %xmlQuote(self.address),
-            u"<postcode>%s</postcode>" %self.postcode,
-            u"<description>%s</description>" %xmlQuote(self.description),
-            u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
-            u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
-            u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
+            u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
+            u"<address><![CDATA[%s]]></address>" %(self.address),
+            u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
+            u"<description><![CDATA[%s]]></description>" %(self.description),
+            u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
+            u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
+            u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
             ]
         if self.osgb_x:
             contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 5de335b..7e36475 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -259,3 +259,4 @@
 "Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
 "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
 "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
+"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"