Also adding the pycurl scraper for Westminster, just in case it is useful to remind us how to do stuff later.
@@ -0,0 +1,134 @@
import urllib2
import urllib
import urlparse
import datetime, time
import cgi
import cookielib

cookie_jar = cookielib.CookieJar()
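# A module-level cookie jar: the results pages are paginated, and the server
# ties pagination to session cookies (see the while loop below).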
from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

#date_format = "%d-%m-%Y"
date_format = "%d/%m/%Y"
received_date_format = "%d %B %Y"
import re

# We're going to use this for a re.split:
# a whitespace char, "of" or "at" (case independent), and then a whitespace char.
# Note the grouping - "\s(?:of)|(?:at)\s" would match "of" only when preceded by
# whitespace and "at" only when followed by it, which is not what we want.
address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)
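# For example, on a hypothetical description such as
#   "Erection of a porch at 1 High Street, Widnes WA8 6QF"
# re.split(address_finder_re, ...) gives
#   ["Erection", "a porch", "1 High Street, Widnes WA8 6QF"]
# and the last element is our best guess at the address.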
class HaltonParser:
    def __init__(self, *args):
        self.authority_name = "Halton Borough Council"
        self.authority_short_name = "Halton"
        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    # An example query string from the search form, for reference:
    #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # It seems dates are interpreted as midnight on the day given, so to get
        # a single day's applications we search from that day to the next.
        post_data = urllib.urlencode(
            [
                #("CaseNo", ""),
                #("AppName", ""),
                ("DateApValFrom", search_day.strftime(date_format)),
                ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
                #("AdrsNo", ""),
                #("StName", ""),
                #("StTown", ""),
                ("DropWeekDate", "0"), #search_day.strftime(date_format)
                ("DropAppealStatus", "0"),
                #("DateAppealValFrom", ""),
                #("DateAppealValTo", ""),
                ("PageSize", "10"),
                ("Action", "Search"),
            ]
        )

        request = urllib2.Request(self.base_url, post_data)
        while request:
            # Now get the search page.
            # We need to deal with cookies, since pagination depends on them.
            cookie_jar.add_cookie_header(request)
            response = urllib2.urlopen(request)
            cookie_jar.extract_cookies(response, request)

            soup = BeautifulSoup(response.read())

            # This should find us each Case on the current page.
            caseno_strings = soup.findAll(text="Case No:")
            for caseno_string in caseno_strings:
                application = PlanningApplication()

                application.council_reference = caseno_string.findNext("td").string
                application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
                application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()

                # The address here is included in the description. We'll have to do
                # some heuristics to try to work out where it starts. As a first go,
                # we'll take everything after the last occurrence of " of " or " at ".
                # If neither occurs, re.split just returns the whole description,
                # and having that as the address is better than nothing.
                application.address = re.split(address_finder_re, application.description)[-1].strip()

                # We may as well get the postcode from the description rather than
                # the address, in case things have gone wrong.
                application.postcode = getPostcodeFromText(application.description)

                application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])

                # Now what to have as info url...
                # There is no way to link to a specific app, so we'll just have the search page.
                application.info_url = self.base_url

                self._results.addApplication(application)
            # Now we need to find the post data for the next page, if there is any.
            # Find the form with id "formNext", if there is one.
            next_form = soup.find("form", id="formNext")

            if next_form is not None:
                action = next_form['action']

                # The HTML is borked - the inputs are outside the form, they are all
                # in a td which follows it.
                inputs = next_form.findNext("td").findAll("input")

                post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])

                request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
            else:
                request = None

        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HaltonParser()
    print parser.getResults(4, 8, 2008)
@@ -52,3 +52,4 @@
"Hounslow.py", "420"
"Harrow.py", "420"
"Westminster.py", "420"
"Halton.py", "420"
@@ -256,3 +256,4 @@
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
"London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
@@ -0,0 +1,170 @@
"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess site underneath all this,
but it only has the apps for which they are accepting comments, so I think
we may as well use this url and get the lot...

This is the PublicAccess url:
http://publicaccess.westminster.gov.uk/publicaccess/
"""

import urllib
import urlparse
import pycurl
import StringIO
import datetime, time
import cgi
import sys

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText
| date_format = "%d%%2F%m%%2F%Y" | |||||
class WestminsterParser:
    def __init__(self, *args):
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

#        post_data = [
#            ("EFNO", ""),
#            ("STName", ""),
#            ("STNUMB", ""),
#            ("ADRSNO", ""),
#            ("WARD", "AllWards"),
#            ("AGT", ""),
#            ("ATCDE", "AllApps"),
#            ("DECDE", "AllDecs"),
#            ("DTErec", search_day.strftime(date_format)),
#            ("DTErecTo", search_day.strftime(date_format)),
#            ("DTEvalid", ""),
#            ("DTEvalidTo", ""),
#            ("APDECDE", "AllAppDecs"),
#            ("submit", "Start+Search"),
#            ]
| post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)} | |||||
        while post_data:
            # Now get the search page.
            sys.stderr.write("Fetching: %s\n" % self.base_url)
            sys.stderr.write("post data: %s\n" % post_data)

            # This gives us something to use as the write callback.
            fakefile = StringIO.StringIO()

            curlobj = pycurl.Curl()
            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.POST, True)
            curlobj.setopt(pycurl.POSTFIELDS, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
            curlobj.setopt(pycurl.MAXREDIRS, 10)

            curlobj.perform()
            sys.stderr.write("Got it\n")

            soup = BeautifulSoup(fakefile.getvalue())

            # We may as well free up the memory used by fakefile.
            fakefile.close()

            sys.stderr.write("Created soup\n")
            results_form = soup.find("form", {"name": "currentsearchresultsNext"})
            sys.stderr.write("Found form containing results\n")

            # Sort out the post_data for the next page, if there is one.
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.
            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form.
            links = results_form.findAll("a")
            sys.stderr.write("Got list of links\n")
            for link in links:
                sys.stderr.write("Working on link: %s\n" % link['href'])

                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()
                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)
                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(
                sys.stderr.write("Fetching: %s\n" % application.info_url)

                fakefile = StringIO.StringIO()
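                # Reuse the same curl handle, switching it back from POST to GET
                # for the detail page; the URL and write callback are reset below.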
                curlobj.setopt(pycurl.HTTPGET, True)
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                # We have to convert the info url to ascii for curl.
                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

                curlobj.perform()
                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(fakefile.getvalue())
                fakefile.close()

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    application.comment_url = "No Comments"

                # An example comment url, for reference:
                #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                self._results.addApplication(application)
                sys.stderr.write("Finished that link\n")

        sys.stderr.write("Finished while loop, returning stuff.\n")
        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = WestminsterParser()
    print parser.getResults(1, 8, 2008)