- """
- This is the screenscraper for Westminster City Council.
-
- I have just noticed that there is a PublicAccess underneath all this, but
- it only has the apps in for which they are accepting comments, so I think
- we may as well use this url and get the lot...
-
- This is the PublicAccess url:
- http://publicaccess.westminster.gov.uk/publicaccess/
- """
-
- import urllib
- import urlparse
-
- import pycurl
- import StringIO
-
- import datetime
-
- import sys
-
- from BeautifulSoup import BeautifulSoup
-
- from PlanningUtils import PlanningApplication, \
- PlanningAuthorityResults, \
- getPostcodeFromText
-
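- # The doubled %% survives strftime as a literal %2F (an escaped "/"), so the
- # formatted date arrives pre-encoded for the query string, e.g. "01%2F08%2F2008".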
- date_format = "%d%%2F%m%%2F%Y"
-
- class WestminsterParser:
- def __init__(self, *args):
-
- self.authority_name = "City of Westminster"
- self.authority_short_name = "Westminster"
- self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
-
- self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
-
-
- def getResultsByDayMonthYear(self, day, month, year):
- search_day = datetime.date(year, month, day)
-
- # post_data = [
-         #     ("REFNO", ""),
- # ("STName", ""),
- # ("STNUMB", ""),
- # ("ADRSNO", ""),
- # ("WARD", "AllWards"),
- # ("AGT", ""),
- # ("ATCDE", "AllApps"),
- # ("DECDE", "AllDecs"),
- # ("DTErec", search_day.strftime(date_format)),
- # ("DTErecTo", search_day.strftime(date_format)),
- # ("DTEvalid", ""),
- # ("DTEvalidTo", ""),
- # ("APDECDE", "AllAppDecs"),
- # ("submit", "Start+Search"),
- # ]
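-         # The raw string below is used rather than urllib.urlencode on the
-         # list above, presumably because the date is already percent-encoded
-         # and urlencode would escape it a second time.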
- post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
-
- while post_data:
-
-
- # Now get the search page
-
- sys.stderr.write("Fetching: %s\n" %self.base_url)
- sys.stderr.write("post data: %s\n" %post_data)
-
-
-             # A StringIO buffer to collect the response body via pycurl's
-             # write callback.
- fakefile = StringIO.StringIO()
-
- curlobj = pycurl.Curl()
- curlobj.setopt(pycurl.URL, self.base_url)
- curlobj.setopt(pycurl.POST, True)
- curlobj.setopt(pycurl.POSTFIELDS, post_data)
- curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
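-             # Follow any redirects the server sends, capped at 10.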
- curlobj.setopt(pycurl.FOLLOWLOCATION, True)
- curlobj.setopt(pycurl.MAXREDIRS, 10)
-
- curlobj.perform()
-
- sys.stderr.write("Got it\n")
- soup = BeautifulSoup(fakefile.getvalue())
-
- # We may as well free up the memory used by fakefile
- fakefile.close()
-
- sys.stderr.write("Created soup\n")
-
- results_form = soup.find("form", {"name": "currentsearchresultsNext"})
-
- # Sort out the post_data for the next page, if there is one
- # If there is no next page then there will be no inputs in the form.
- # In this case, post_data will be '', which is false.
-
- sys.stderr.write("Found form containing results\n")
-
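-             # An input without a value attribute would otherwise raise a
-             # KeyError, so default missing values to ''.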
-             post_data = urllib.urlencode([(x['name'], x.get('value', '')) for x in results_form.findAll("input")])
-
- sys.stderr.write("Got post data\n")
-
- # Each result has one link, and they are the only links in the form
-
- links = results_form.findAll("a")
-
- sys.stderr.write("Got list of links\n")
-
- for link in links:
-
- sys.stderr.write("Working on link: %s\n" %link['href'])
-
- application = PlanningApplication()
-
- application.date_received = search_day
- application.info_url = urlparse.urljoin(self.base_url, link['href'])
- application.council_reference = link.string.strip()
-
- application.address = link.findNext("td").string.strip()
- application.postcode = getPostcodeFromText(application.address)
-
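-                 # The description lives in the last cell of the next table row.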
- application.description = link.findNext("tr").findAll("td")[-1].string.strip()
-
- # To get the comment url, we're going to have to go to each info url :-(
-
- sys.stderr.write("Fetching: %s\n" %application.info_url)
-
-
- fakefile = StringIO.StringIO()
-
-
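-                 # Reuse the same curl handle, switching it from POST back to
-                 # GET for the info page.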
- curlobj.setopt(pycurl.HTTPGET, True)
- curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-
-                 # pycurl needs a plain byte string, so encode the unicode url as ascii
- curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
-
- curlobj.perform()
-
- sys.stderr.write("Got it\n")
-
- info_soup = BeautifulSoup(fakefile.getvalue())
-
- fakefile.close()
-
-                 # A comment url looks like:
-                 # http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
-                 comment_nav_string = info_soup.find(text="Comment on this case")
-                 if comment_nav_string:
-                     application.comment_url = comment_nav_string.parent['href']
-                 else:
-                     application.comment_url = "No Comments"
-
- self._results.addApplication(application)
-
- sys.stderr.write("Finished that link\n")
-
-
- sys.stderr.write("Finished while loop, returning stuff.\n")
-
- return self._results
-
- def getResults(self, day, month, year):
- return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
-
- if __name__ == '__main__':
- parser = WestminsterParser()
-     print parser.getResults(1, 8, 2008)
-