From 049d494db5402b5f844f75685c0aab03091ec6b6 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 4 Sep 2008 14:44:35 +0000
Subject: [PATCH] Go back to the urllib2 version of Westminster. This works
 on disruptiveproactivity.

---
 python_scrapers/Westminster.py | 65 +++++++++-------------------------
 1 file changed, 17 insertions(+), 48 deletions(-)

diff --git a/python_scrapers/Westminster.py b/python_scrapers/Westminster.py
index d5c8b68..052ccfb 100644
--- a/python_scrapers/Westminster.py
+++ b/python_scrapers/Westminster.py
@@ -9,12 +9,10 @@ This is the PublicAccess url:
 http://publicaccess.westminster.gov.uk/publicaccess/
 """
 
+import urllib2
 import urllib
 import urlparse
 
-import pycurl
-import StringIO
-
 import datetime, time
 import cgi
 
@@ -64,30 +62,15 @@ class WestminsterParser:
 
             # Now get the search page
 
-            sys.stderr.write("Fetching: %s\n" %self.base_url)
-            sys.stderr.write("post data: %s\n" %post_data)
+#            sys.stderr.write("Fetching: %s\n" %self.base_url)
+#            sys.stderr.write("post data: %s\n" %post_data)
 
+            response = urllib2.urlopen(self.base_url, post_data)
 
-            # This gives us something to use as the callback
-            fakefile = StringIO.StringIO()
-
-            curlobj = pycurl.Curl()
-            curlobj.setopt(pycurl.URL, self.base_url)
-            curlobj.setopt(pycurl.POST, True)
-            curlobj.setopt(pycurl.POSTFIELDS, post_data)
-            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
-            curlobj.setopt(pycurl.MAXREDIRS, 10)
-
-            curlobj.perform()
-
-            sys.stderr.write("Got it\n")
-            soup = BeautifulSoup(fakefile.getvalue())
+#            sys.stderr.write("Got it\n")
+            soup = BeautifulSoup(response.read())
 
-            # We may as well free up the memory used by fakefile
-            fakefile.close()
-
-            sys.stderr.write("Created soup\n")
+#            sys.stderr.write("Created soup\n")
 
             results_form = soup.find("form", {"name": "currentsearchresultsNext"})
 
@@ -95,21 +78,21 @@ class WestminsterParser:
             # If there is no next page then there will be no inputs in the form.
             # In this case, post_data will be '', which is false.
 
-            sys.stderr.write("Found form containing results\n")
+#            sys.stderr.write("Found form containing results\n")
 
             post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
 
-            sys.stderr.write("Got post data\n")
+#            sys.stderr.write("Got post data\n")
 
             # Each result has one link, and they are the only links in the form
             links = results_form.findAll("a")
 
-            sys.stderr.write("Got list of links\n")
+#            sys.stderr.write("Got list of links\n")
 
             for link in links:
 
-                sys.stderr.write("Working on link: %s\n" %link['href'])
+#                sys.stderr.write("Working on link: %s\n" %link['href'])
 
                 application = PlanningApplication()
 
@@ -124,25 +107,11 @@ class WestminsterParser:
 
                 # To get the comment url, we're going to have to go to each info url :-(
 
-                sys.stderr.write("Fetching: %s\n" %application.info_url)
-
-
-                fakefile = StringIO.StringIO()
-
-
-                curlobj.setopt(pycurl.HTTPGET, True)
-                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-
-                # We have to convert the info url to ascii for curl
-                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
-
-                curlobj.perform()
-
-                sys.stderr.write("Got it\n")
-
-                info_soup = BeautifulSoup(fakefile.getvalue())
+#                sys.stderr.write("Fetching: %s\n" %application.info_url)
+                info_response = urllib2.urlopen(application.info_url)
+#                sys.stderr.write("Got it\n")
 
-                fakefile.close()
+                info_soup = BeautifulSoup(info_response)
 
                 comment_nav_string = info_soup.find(text="Comment on this case")
                 if comment_nav_string:
@@ -154,10 +123,10 @@ class WestminsterParser:
 
                 self._results.addApplication(application)
 
-                sys.stderr.write("Finished that link\n")
+#                sys.stderr.write("Finished that link\n")
 
-            sys.stderr.write("Finished while loop, returning stuff.\n")
+#            sys.stderr.write("Finished while loop, returning stuff.\n")
 
         return self._results