Go back to the urllib2 version of the Westminster scraper, replacing the pycurl-based fetching. This works on disruptiveproactivity.

18 years ago · 049d494db5
--- a/python_scrapers/Westminster.py
+++ b/python_scrapers/Westminster.py
@@ -9,12 +9,10 @@ This is the PublicAccess url:
 http://publicaccess.westminster.gov.uk/publicaccess/
 """

 import urllib2
 import urllib
 import urlparse

 import pycurl
 import StringIO

 import datetime, time
 import cgi

@@ -64,30 +62,15 @@ class WestminsterParser:

            # Now get the search page

            sys.stderr.write("Fetching: %s\n" %self.base_url)
            sys.stderr.write("post data: %s\n" %post_data) 
 #            sys.stderr.write("Fetching: %s\n" %self.base_url)
 #            sys.stderr.write("post data: %s\n" %post_data) 
            
            response = urllib2.urlopen(self.base_url, post_data)

            # This gives us something to use as the callback
            fakefile = StringIO.StringIO()

            curlobj = pycurl.Curl()
            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.POST, True)
            curlobj.setopt(pycurl.POSTFIELDS, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
            curlobj.setopt(pycurl.MAXREDIRS, 10)

            curlobj.perform()

            sys.stderr.write("Got it\n")
            soup = BeautifulSoup(fakefile.getvalue())
 #            sys.stderr.write("Got it\n")
            soup = BeautifulSoup(response.read())

            # We may as well free up the memory used by fakefile
            fakefile.close()

            sys.stderr.write("Created soup\n")
 #            sys.stderr.write("Created soup\n")

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

@@ -95,21 +78,21 @@ class WestminsterParser:
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.

            sys.stderr.write("Found form containing results\n")
 #            sys.stderr.write("Found form containing results\n")

            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

            sys.stderr.write("Got post data\n")
 #            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form

            links = results_form.findAll("a")

            sys.stderr.write("Got list of links\n")
 #            sys.stderr.write("Got list of links\n")

            for link in links:

                sys.stderr.write("Working on link: %s\n" %link['href'])
 #                sys.stderr.write("Working on link: %s\n" %link['href'])

                application = PlanningApplication()

@@ -124,25 +107,11 @@ class WestminsterParser:

                # To get the comment url, we're going to have to go to each info url :-(

                sys.stderr.write("Fetching: %s\n" %application.info_url)


                fakefile = StringIO.StringIO()


                curlobj.setopt(pycurl.HTTPGET, True)
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                # We have to convert the info url to ascii for curl
                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

                curlobj.perform()

                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(fakefile.getvalue())
 #                sys.stderr.write("Fetching: %s\n" %application.info_url)
                info_response = urllib2.urlopen(application.info_url)
 #                sys.stderr.write("Got it\n")

                fakefile.close()
                info_soup = BeautifulSoup(info_response)

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
@@ -154,10 +123,10 @@ class WestminsterParser:

                self._results.addApplication(application)

                sys.stderr.write("Finished that link\n")
 #                sys.stderr.write("Finished that link\n")


        sys.stderr.write("Finished while loop, returning stuff.\n")
 #        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results