From 53d0b25f7828de2e8685069ae479926a6ae6eb7e Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Tue, 14 Oct 2008 09:46:56 +0000
Subject: [PATCH] Add scraper for Gosport. Factor out CookieAddingHTTPRedirectHandler.

---
 python_scrapers/Gosport.py           | 100 +++++++++++++++++++++++++++
 python_scrapers/HTTPHandlers.py      |  18 +++++
 python_scrapers/Ocella.py            |  13 +---
 python_scrapers/OtherFilesToCopy.csv |   2 +
 python_scrapers/SitesToGenerate.csv  |   1 +
 5 files changed, 124 insertions(+), 10 deletions(-)
 create mode 100644 python_scrapers/Gosport.py
 create mode 100644 python_scrapers/HTTPHandlers.py

diff --git a/python_scrapers/Gosport.py b/python_scrapers/Gosport.py
new file mode 100644
index 0000000..14d48a0
--- /dev/null
+++ b/python_scrapers/Gosport.py
@@ -0,0 +1,100 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import re
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()
+
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+
+from HTTPHandlers import CookieAddingHTTPRedirectHandler
+
+cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))
+
+
+search_date_format = "%m/%d/%Y" # That's right, the search date is US style.
+info_page_date_format = "%d/%m/%Y" # ...and the info page is UK style.
+
+class GosportParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Gosport Borough Council"
+        self.authority_short_name = "Gosport"
+
+        self.base_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationSearch2.aspx"
+        self.info_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationDetails.aspx?ID=%s"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        # GET the search page first, so the site can set its session cookie.
+        get_request = urllib2.Request(self.base_url)
+        get_response = urllib2.urlopen(get_request)
+        cookie_jar.extract_cookies(get_response, get_request)
+
+        get_soup = BeautifulSoup(get_response.read())
+
+        post_data = (
+            ("__VIEWSTATE", get_soup.find("input", {"name": "__VIEWSTATE"})["value"]),
+            ("pgid", get_soup.find("input", {"name": "pgid"})["value"]),
+            ("action", "Search"),
+#            ("ApplicationSearch21%3AtbDevAddress", ""),
+#            ("ApplicationSearch21%3AtbApplicantName", ""),
+#            ("ApplicationSearch21%3AtbAgentName", ""),
+            ("ApplicationSearch21:tbDateSubmitted", search_date.strftime(search_date_format)),
+            ("ApplicationSearch21:btnDateSubmitted", "Search"),
+#            ("ApplicationSearch21%3AtbDateDetermined", ""),
+            )
+
+        post_request = urllib2.Request(self.base_url, urllib.urlencode(post_data))
+        cookie_jar.add_cookie_header(post_request)
+        post_response = cookie_handling_opener.open(post_request)
+
+        post_soup = BeautifulSoup(post_response.read())
+
+        # Discard the first <tr>, which contains headers.
+        trs = post_soup.find("table", id="SearchResults1_dgSearchResults").findAll("tr")[1:]
+
+        for tr in trs:
+            application = PlanningApplication()
+
+            tds = tr.findAll("td")
+
+            application.council_reference = tds[0].string.strip()
+            application.address = tds[1].string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+            application.description = tds[2].string.strip()
+
+            application.date_received = datetime.datetime(*(time.strptime(tds[3].string.strip(), info_page_date_format)[0:6]))
+            application.info_url = self.info_url % (application.council_reference)
+
+            # The comment url must be accessed by a POST, so we'll just use the info url for that as well.
+
+            application.comment_url = application.info_url
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = GosportParser()
+    print parser.getResults(1, 10, 2008)

diff --git a/python_scrapers/HTTPHandlers.py b/python_scrapers/HTTPHandlers.py
new file mode 100644
index 0000000..0015c5e
--- /dev/null
+++ b/python_scrapers/HTTPHandlers.py
@@ -0,0 +1,18 @@
+
+from urllib2 import HTTPRedirectHandler
+
+class CookieAddingHTTPRedirectHandler(HTTPRedirectHandler):
+    """The standard Python HTTPRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
+
+    def __init__(self, cookie_jar):
+        self.cookie_jar = cookie_jar
+
+        # This really ought to call the superclass's __init__ method, but there doesn't seem to be one.
+
+
+    def redirect_request(self, *args):
+        new_request = HTTPRedirectHandler.redirect_request(self, *args)
+        # We need to add a cookie from the cookie_jar.
+        self.cookie_jar.add_cookie_header(new_request)
+
+        return new_request

diff --git a/python_scrapers/Ocella.py b/python_scrapers/Ocella.py
index 924a349..61ffa99 100644
--- a/python_scrapers/Ocella.py
+++ b/python_scrapers/Ocella.py
@@ -18,16 +18,9 @@ search_date_format = "%d-%m-%Y" # Format used for the accepted date when searching
 
 possible_date_formats = [search_date_format, "%d/%m/%Y"]
 
-class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
-    """The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
-    def redirect_request(self, req, fp, code, msg, headers, newurl):
-        new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
-        # We need to add a cookie from the cookie_jar
-        cookie_jar.add_cookie_header(new_request)
+from HTTPHandlers import CookieAddingHTTPRedirectHandler
 
-        return new_request
-
-cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())
+cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))
 
 
 class OcellaParser:
@@ -206,7 +199,7 @@ if __name__ == '__main__':
 #    parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
 #    parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
-    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
+#    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
 
     parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")

diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index f5819a5..eae8963 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -1,6 +1,7 @@
 "filename", "permissions"
 "PublicAccess.py", "420"
 "PlanningUtils.py", "420"
+"HTTPHandlers.py", "420" "SouthOxfordshireParser.py", "420" "SouthOxfordshire.cgi", "493" "ApplicationSearchServletParser.py", "420" @@ -58,3 +59,4 @@ "Herefordshire.py", "420" "Exmoor.py", "420" "Eastbourne.py", "420" +"Gosport.py", "420" diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 95bac87..0661ca7 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -264,3 +264,4 @@ "Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser" "Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser" "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"