From df32136695bb34b1b098ade8ceb1289b3f849796 Mon Sep 17 00:00:00 2001
From: "duncan.parkes" <duncan.parkes@cf68f80a-222c-0410-aabe-5f79d0504a29>
Date: Sun, 30 Nov 2008 14:55:02 +0000
Subject: [PATCH] Add Thomas' scraper for Solihull.

---
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/SitesToGenerate.csv  |  1 +
 python_scrapers/Solihull.py          | 81 ++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)
 create mode 100644 python_scrapers/Solihull.py

diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index c0a1657..d8e2a79 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -68,3 +68,4 @@
 "Broxtowe.py", "420"
 "Mendip.py", "420"
 "Weymouth.py", "420"
+"Solihull.py", "420"
\ No newline at end of file
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 57739be..a473f3b 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -274,3 +274,4 @@
 "Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
 "Mendip District Council", "Mendip", "", "Mendip", "MendipParser"
 "Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser"
+"Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser"
diff --git a/python_scrapers/Solihull.py b/python_scrapers/Solihull.py
new file mode 100644
index 0000000..84a8a47
--- /dev/null
+++ b/python_scrapers/Solihull.py
@@ -0,0 +1,81 @@
+"""
+This is the screenscraper for planning apps for 
+Solihull Metropolitan Borough Council.
+
+Solihull publishes its applications as HTML pages, one page per week, with each week starting on a Monday.
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"  # Day/month/year format of the SD date put into the weekly-list URL.
+
+class SolihullParser:
+    """Scraper for Solihull's weekly planning application list."""
+
+    def __init__(self, *args):
+        self.authority_name = "Solihull Metropolitan Borough Council"
+        self.authority_short_name = "Solihull"
+        self.base_url = "http://www.solihull.gov.uk/planning/dc/weeklist.asp?SD=%s&ward=ALL"
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Add applications received on the given date to self._results and return it."""
+        search_day = datetime.date(year, month, day)
+
+        # The list is published per week, so fetch the page for the Monday on or before the date.
+        monday_before = search_day - datetime.timedelta(search_day.weekday())
+        response = urllib2.urlopen(self.base_url % monday_before.strftime(date_format))
+        try:
+            soup = BeautifulSoup(response.read())
+        finally:
+            response.close()
+
+        # Each matching table yields one application.
+        for table in soup.findAll("table", width="98%", cellpadding="2"):
+            trs = table.findAll("tr")
+
+            # Skip applications not received on the search date (whole date, not just the day number).
+            received_text = trs[3].findAll("td")[3].string.strip()
+            date_received = datetime.date(*time.strptime(received_text, date_format)[0:3])
+            if date_received != search_day:
+                continue
+
+            application = PlanningApplication()
+            application.date_received = date_received
+            application.council_reference = trs[0].strong.string.strip()
+            application.info_url = urlparse.urljoin(self.base_url, trs[0].a['href'])
+            application.address = trs[1].findAll("td")[1].string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+            application.description = trs[2].findAll("td")[1].string.strip()
+
+            try:
+                relative_comment_url = trs[5].findAll("td")[1].a['href']
+            except (AttributeError, IndexError, KeyError, TypeError):
+                # No comment link could be found in this table; fall back to the placeholder text.
+                application.comment_url = "No Comment URL."
+            else:
+                application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url)
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the date; day, month and year are coerced with int()."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    # Manual test run with a fixed date (27/10/2008), copied from the Barnsley example.
+    # TODO: decide whether a real run should take the date from the arguments instead.
+    print SolihullParser().getResults(27, 10, 2008)