|
|
@@ -9,12 +9,10 @@ This is the PublicAccess url: |
|
|
|
http://publicaccess.westminster.gov.uk/publicaccess/ |
|
|
|
""" |
|
|
|
|
|
|
|
import urllib2 |
|
|
|
import urllib |
|
|
|
import urlparse |
|
|
|
|
|
|
|
import pycurl |
|
|
|
import StringIO |
|
|
|
|
|
|
|
import datetime, time |
|
|
|
import cgi |
|
|
|
|
|
|
@@ -64,30 +62,15 @@ class WestminsterParser: |
|
|
|
|
|
|
|
# Now get the search page |
|
|
|
|
|
|
|
sys.stderr.write("Fetching: %s\n" %self.base_url) |
|
|
|
sys.stderr.write("post data: %s\n" %post_data) |
|
|
|
# sys.stderr.write("Fetching: %s\n" %self.base_url) |
|
|
|
# sys.stderr.write("post data: %s\n" %post_data) |
|
|
|
|
|
|
|
response = urllib2.urlopen(self.base_url, post_data) |
|
|
|
|
|
|
|
# This gives us something to use as the callback |
|
|
|
fakefile = StringIO.StringIO() |
|
|
|
|
|
|
|
curlobj = pycurl.Curl() |
|
|
|
curlobj.setopt(pycurl.URL, self.base_url) |
|
|
|
curlobj.setopt(pycurl.POST, True) |
|
|
|
curlobj.setopt(pycurl.POSTFIELDS, post_data) |
|
|
|
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write) |
|
|
|
curlobj.setopt(pycurl.FOLLOWLOCATION, True) |
|
|
|
curlobj.setopt(pycurl.MAXREDIRS, 10) |
|
|
|
|
|
|
|
curlobj.perform() |
|
|
|
|
|
|
|
sys.stderr.write("Got it\n") |
|
|
|
soup = BeautifulSoup(fakefile.getvalue()) |
|
|
|
# sys.stderr.write("Got it\n") |
|
|
|
soup = BeautifulSoup(response.read()) |
|
|
|
|
|
|
|
# We may as well free up the memory used by fakefile |
|
|
|
fakefile.close() |
|
|
|
|
|
|
|
sys.stderr.write("Created soup\n") |
|
|
|
# sys.stderr.write("Created soup\n") |
|
|
|
|
|
|
|
results_form = soup.find("form", {"name": "currentsearchresultsNext"}) |
|
|
|
|
|
|
@@ -95,21 +78,21 @@ class WestminsterParser: |
|
|
|
# If there is no next page then there will be no inputs in the form. |
|
|
|
# In this case, post_data will be '', which is false. |
|
|
|
|
|
|
|
sys.stderr.write("Found form containing results\n") |
|
|
|
# sys.stderr.write("Found form containing results\n") |
|
|
|
|
|
|
|
post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")]) |
|
|
|
|
|
|
|
sys.stderr.write("Got post data\n") |
|
|
|
# sys.stderr.write("Got post data\n") |
|
|
|
|
|
|
|
# Each result has one link, and they are the only links in the form |
|
|
|
|
|
|
|
links = results_form.findAll("a") |
|
|
|
|
|
|
|
sys.stderr.write("Got list of links\n") |
|
|
|
# sys.stderr.write("Got list of links\n") |
|
|
|
|
|
|
|
for link in links: |
|
|
|
|
|
|
|
sys.stderr.write("Working on link: %s\n" %link['href']) |
|
|
|
# sys.stderr.write("Working on link: %s\n" %link['href']) |
|
|
|
|
|
|
|
application = PlanningApplication() |
|
|
|
|
|
|
@@ -124,25 +107,11 @@ class WestminsterParser: |
|
|
|
|
|
|
|
# To get the comment url, we're going to have to go to each info url :-( |
|
|
|
|
|
|
|
sys.stderr.write("Fetching: %s\n" %application.info_url) |
|
|
|
|
|
|
|
|
|
|
|
fakefile = StringIO.StringIO() |
|
|
|
|
|
|
|
|
|
|
|
curlobj.setopt(pycurl.HTTPGET, True) |
|
|
|
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write) |
|
|
|
|
|
|
|
# We have to convert the info url to ascii for curl |
|
|
|
curlobj.setopt(pycurl.URL, application.info_url.encode("ascii")) |
|
|
|
|
|
|
|
curlobj.perform() |
|
|
|
|
|
|
|
sys.stderr.write("Got it\n") |
|
|
|
|
|
|
|
info_soup = BeautifulSoup(fakefile.getvalue()) |
|
|
|
# sys.stderr.write("Fetching: %s\n" %application.info_url) |
|
|
|
info_response = urllib2.urlopen(application.info_url) |
|
|
|
# sys.stderr.write("Got it\n") |
|
|
|
|
|
|
|
fakefile.close() |
|
|
|
info_soup = BeautifulSoup(info_response) |
|
|
|
|
|
|
|
comment_nav_string = info_soup.find(text="Comment on this case") |
|
|
|
if comment_nav_string: |
|
|
@@ -154,10 +123,10 @@ class WestminsterParser: |
|
|
|
|
|
|
|
self._results.addApplication(application) |
|
|
|
|
|
|
|
sys.stderr.write("Finished that link\n") |
|
|
|
# sys.stderr.write("Finished that link\n") |
|
|
|
|
|
|
|
|
|
|
|
sys.stderr.write("Finished while loop, returning stuff.\n") |
|
|
|
# sys.stderr.write("Finished while loop, returning stuff.\n") |
|
|
|
|
|
|
|
return self._results |
|
|
|
|
|
|
|