From 049d494db5402b5f844f75685c0aab03091ec6b6 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 4 Sep 2008 14:44:35 +0000
Subject: [PATCH] Go back to the urllib2 version of Westminster. This works
 on disruptiveproactivity.

---
 python_scrapers/Westminster.py | 65 +++++++++-------------------------
 1 file changed, 17 insertions(+), 48 deletions(-)

diff --git a/python_scrapers/Westminster.py b/python_scrapers/Westminster.py
index d5c8b68..052ccfb 100644
--- a/python_scrapers/Westminster.py
+++ b/python_scrapers/Westminster.py
@@ -9,12 +9,10 @@ This is the PublicAccess url:
 http://publicaccess.westminster.gov.uk/publicaccess/
 """
 
+import urllib2
 import urllib
 import urlparse
 
-import pycurl
-import StringIO
-
 import datetime, time
 import cgi
 
@@ -64,30 +62,15 @@ class WestminsterParser:
 
             # Now get the search page
 
-            sys.stderr.write("Fetching: %s\n" %self.base_url)
-            sys.stderr.write("post data: %s\n" %post_data)
+#            sys.stderr.write("Fetching: %s\n" %self.base_url)
+#            sys.stderr.write("post data: %s\n" %post_data)
 
+            response = urllib2.urlopen(self.base_url, post_data)
 
-            # This gives us something to use as the callback
-            fakefile = StringIO.StringIO()
-
-            curlobj = pycurl.Curl()
-            curlobj.setopt(pycurl.URL, self.base_url)
-            curlobj.setopt(pycurl.POST, True)
-            curlobj.setopt(pycurl.POSTFIELDS, post_data)
-            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
-            curlobj.setopt(pycurl.MAXREDIRS, 10)
-
-            curlobj.perform()
-
-            sys.stderr.write("Got it\n")
-            soup = BeautifulSoup(fakefile.getvalue())
+#            sys.stderr.write("Got it\n")
+            soup = BeautifulSoup(response.read())
 
-            # We may as well free up the memory used by fakefile
-            fakefile.close()
-
-            sys.stderr.write("Created soup\n")
+#            sys.stderr.write("Created soup\n")
 
             results_form = soup.find("form", {"name": "currentsearchresultsNext"})
 
@@ -95,21 +78,21 @@ class WestminsterParser:
             # If there is no next page then there will be no inputs in the form.
             # In this case, post_data will be '', which is false.
 
-            sys.stderr.write("Found form containing results\n")
+#            sys.stderr.write("Found form containing results\n")
 
             post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
 
-            sys.stderr.write("Got post data\n")
+#            sys.stderr.write("Got post data\n")
 
             # Each result has one link, and they are the only links in the form
             links = results_form.findAll("a")
 
-            sys.stderr.write("Got list of links\n")
+#            sys.stderr.write("Got list of links\n")
 
             for link in links:
 
-                sys.stderr.write("Working on link: %s\n" %link['href'])
+#                sys.stderr.write("Working on link: %s\n" %link['href'])
 
                 application = PlanningApplication()
 
@@ -124,25 +107,11 @@ class WestminsterParser:
 
                 # To get the comment url, we're going to have to go to each info url :-(
 
-                sys.stderr.write("Fetching: %s\n" %application.info_url)
-
-
-                fakefile = StringIO.StringIO()
-
-
-                curlobj.setopt(pycurl.HTTPGET, True)
-                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-
-                # We have to convert the info url to ascii for curl
-                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
-
-                curlobj.perform()
-
-                sys.stderr.write("Got it\n")
-
-                info_soup = BeautifulSoup(fakefile.getvalue())
+#                sys.stderr.write("Fetching: %s\n" %application.info_url)
+                info_response = urllib2.urlopen(application.info_url)
+#                sys.stderr.write("Got it\n")
 
-                fakefile.close()
+                info_soup = BeautifulSoup(info_response)
 
                 comment_nav_string = info_soup.find(text="Comment on this case")
                 if comment_nav_string:
@@ -154,10 +123,10 @@ class WestminsterParser:
 
                 self._results.addApplication(application)
 
-                sys.stderr.write("Finished that link\n")
+#                sys.stderr.write("Finished that link\n")
 
-            sys.stderr.write("Finished while loop, returning stuff.\n")
+#            sys.stderr.write("Finished while loop, returning stuff.\n")
 
         return self._results