From 1510528a8aaccb21c36eb1cf79cdabdbdc7bad68 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 5 Sep 2008 12:52:07 +0000
Subject: [PATCH] Adding scraper for Halton.

Also adding the pycurl scraper for Westminster, just in case it is useful
to remind us how to do stuff later.
---
 python_scrapers/Halton.py             | 134 ++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv  |   1 +
 python_scrapers/SitesToGenerate.csv   |   1 +
 python_scrapers/Westminster_pycurl.py | 170 ++++++++++++++++++++++++++
 4 files changed, 306 insertions(+)
 create mode 100644 python_scrapers/Halton.py
 create mode 100644 python_scrapers/Westminster_pycurl.py

diff --git a/python_scrapers/Halton.py b/python_scrapers/Halton.py
new file mode 100644
index 0000000..c10874c
--- /dev/null
+++ b/python_scrapers/Halton.py
@@ -0,0 +1,134 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()
+
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+#date_format = "%d-%m-%Y"
+date_format = "%d/%m/%Y"
+received_date_format = "%d %B %Y"
+
+import re
+
+# We're going to use this for a re.split
+# A whitespace char, "of" or "at" (case-insensitive), and then a whitespace char.
+address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)
+
+class HaltonParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Halton Borough Council"
+        self.authority_short_name = "Halton"
+        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+#CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        # It seems dates are interpreted as midnight on the given day, so we search from that day to the day after.
+        post_data = urllib.urlencode(
+            [
+#                ("CaseNo", ""),
+#                ("AppName", ""),
+                ("DateApValFrom", search_day.strftime(date_format)),
+                ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
+#                ("AdrsNo", ""),
+#                ("StName", ""),
+#                ("StTown", ""),
+                ("DropWeekDate", "0"),#search_day.strftime(date_format)),
+                ("DropAppealStatus", "0"),
+#                ("DateAppealValFrom", ""),
+#                ("DateAppealValTo", ""),
+                ("PageSize", "10"),
+                ("Action", "Search"),
+            ]
+            )
+
+        request = urllib2.Request(self.base_url, post_data)
+
+        while request:
+            # Now get the search page
+            # We need to deal with cookies, since pagination depends on them.
+            cookie_jar.add_cookie_header(request)
+            response = urllib2.urlopen(request)
+
+            cookie_jar.extract_cookies(response, request)
+
+            soup = BeautifulSoup(response.read())
+
+            # This should find us each Case on the current page.
+            caseno_strings = soup.findAll(text="Case No:")
+
+            for caseno_string in caseno_strings:
+                application = PlanningApplication()
+
+                application.council_reference = caseno_string.findNext("td").string
+                application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
+
+                application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
+
+                # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts.
+                # As a first go, we'll try splitting the description on the last occurrence of " of " or " at ".
+
+                try:
+                    application.address = re.split(address_finder_re, application.description)[-1].strip()
+                except IndexError:
+                    # If we can't find of or at, we'll just have the description again - it's better than nothing.
+                    application.address = application.description
+
+                # We may as well get the postcode from the description rather than the address, in case things have gone wrong.
+                application.postcode = getPostcodeFromText(application.description)
+
+                application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])
+
+                # Now what to have as info url...
+                # There is no way to link to a specific app, so we'll just have the search page.
+                application.info_url = self.base_url
+
+                self._results.addApplication(application)
+
+            # Now we need to find the post data for the next page, if there is any.
+            # Find the form with id "formNext", if there is one.
+            next_form = soup.find("form", id="formNext")
+
+            if next_form is not None:
+                action = next_form['action']
+
+                # The HTML is borked - the inputs are outside the form, they are all
+                # in a td which follows it.
+
+                inputs = next_form.findNext("td").findAll("input")
+
+                post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
+
+                request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
+            else:
+                request = None
+
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = HaltonParser()
+    print parser.getResults(4,8,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 4914f1c..8aec0ed 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -52,3 +52,4 @@
 "Hounslow.py", "420"
 "Harrow.py", "420"
 "Westminster.py", "420"
+"Halton.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 1231f5f..311c2ed 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -256,3 +256,4 @@
 "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
 "London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
 "Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
+"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
diff --git a/python_scrapers/Westminster_pycurl.py b/python_scrapers/Westminster_pycurl.py
new file mode 100644
index 0000000..d5c8b68
--- /dev/null
+++ b/python_scrapers/Westminster_pycurl.py
@@ -0,0 +1,170 @@
+"""
+This is the screenscraper for Westminster City Council.
+
+I have just noticed that there is a PublicAccess site underneath all this, but
+it only has the apps for which they are currently accepting comments, so I think
+we may as well use this url and get the lot...
+
+This is the PublicAccess url:
+http://publicaccess.westminster.gov.uk/publicaccess/
+"""
+
+import urllib
+import urlparse
+
+import pycurl
+import StringIO
+
+import datetime, time
+import cgi
+
+import sys
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d%%2F%m%%2F%Y"
+
+class WestminsterParser:
+    def __init__(self, *args):
+
+        self.authority_name = "City of Westminster"
+        self.authority_short_name = "Westminster"
+        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+#        post_data = [
+#            ("EFNO", ""),
+#            ("STName", ""),
+#            ("STNUMB", ""),
+#            ("ADRSNO", ""),
+#            ("WARD", "AllWards"),
+#            ("AGT", ""),
+#            ("ATCDE", "AllApps"),
+#            ("DECDE", "AllDecs"),
+#            ("DTErec", search_day.strftime(date_format)),
+#            ("DTErecTo", search_day.strftime(date_format)),
+#            ("DTEvalid", ""),
+#            ("DTEvalidTo", ""),
+#            ("APDECDE", "AllAppDecs"),
+#            ("submit", "Start+Search"),
+#            ]
+        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
+
+        while post_data:
+
+
+            # Now get the search page
+
+            sys.stderr.write("Fetching: %s\n" %self.base_url)
+            sys.stderr.write("post data: %s\n" %post_data)
+
+
+            # This gives us something to use as the callback
+            fakefile = StringIO.StringIO()
+
+            curlobj = pycurl.Curl()
+            curlobj.setopt(pycurl.URL, self.base_url)
+            curlobj.setopt(pycurl.POST, True)
+            curlobj.setopt(pycurl.POSTFIELDS, post_data)
+            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
+            curlobj.setopt(pycurl.MAXREDIRS, 10)
+
+            curlobj.perform()
+
+            sys.stderr.write("Got it\n")
+            soup = BeautifulSoup(fakefile.getvalue())
+
+            # We may as well free up the memory used by fakefile
+            fakefile.close()
+
+            sys.stderr.write("Created soup\n")
+
+            results_form = soup.find("form", {"name": "currentsearchresultsNext"})
+
+            # Sort out the post_data for the next page, if there is one
+            # If there is no next page then there will be no inputs in the form.
+            # In this case, post_data will be '', which is false.
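+            # (urllib.urlencode of an empty list is "", which is falsy, so the
+            # while loop stops once the last page has been handled.)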
+
+            sys.stderr.write("Found form containing results\n")
+
+            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
+
+            sys.stderr.write("Got post data\n")
+
+            # Each result has one link, and they are the only links in the form
+
+            links = results_form.findAll("a")
+
+            sys.stderr.write("Got list of links\n")
+
+            for link in links:
+
+                sys.stderr.write("Working on link: %s\n" %link['href'])
+
+                application = PlanningApplication()
+
+                application.date_received = search_day
+                application.info_url = urlparse.urljoin(self.base_url, link['href'])
+                application.council_reference = link.string.strip()
+
+                application.address = link.findNext("td").string.strip()
+                application.postcode = getPostcodeFromText(application.address)
+
+                application.description = link.findNext("tr").findAll("td")[-1].string.strip()
+
+                # To get the comment url, we're going to have to go to each info url :-(
+
+                sys.stderr.write("Fetching: %s\n" %application.info_url)
+
+
+                fakefile = StringIO.StringIO()
+
+
+                curlobj.setopt(pycurl.HTTPGET, True)
+                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+
+                # We have to convert the info url to ascii for curl
+                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
+
+                curlobj.perform()
+
+                sys.stderr.write("Got it\n")
+
+                info_soup = BeautifulSoup(fakefile.getvalue())
+
+                fakefile.close()
+
+                comment_nav_string = info_soup.find(text="Comment on this case")
+                if comment_nav_string:
+                    application.comment_url = comment_nav_string.parent['href']
+                else:
+                    application.comment_url = "No Comments"
+
+                #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
+
+                self._results.addApplication(application)
+
+                sys.stderr.write("Finished that link\n")
+
+
+        sys.stderr.write("Finished while loop, returning stuff.\n")
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = WestminsterParser()
+    print parser.getResults(1,8,2008)
+
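
A quick illustration of the address heuristic used in Halton.py above, run on a made-up
description string (the street name and town here are invented, not taken from real
council data):

    import re

    # Split on " of " or " at " (case-insensitive) and keep the last piece,
    # which is hopefully the address part of the description.
    address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)

    description = "Erection of a two storey extension at 1 Example Road, Widnes"
    parts = re.split(address_finder_re, description)

    # If neither " of " nor " at " appears, re.split returns the whole
    # description as a single element, so parts[-1] is still usable.
    print parts[-1].strip()   # prints: 1 Example Road, Widnes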