From 1510528a8aaccb21c36eb1cf79cdabdbdc7bad68 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 5 Sep 2008 12:52:07 +0000
Subject: [PATCH] Adding scraper for Halton.

Also adding the pycurl scraper for Westminster, just in case it is useful
to remind us how to do stuff later.
---
 python_scrapers/Halton.py             | 134 ++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv  |   1 +
 python_scrapers/SitesToGenerate.csv   |   1 +
 python_scrapers/Westminster_pycurl.py | 170 ++++++++++++++++++++++++++
 4 files changed, 306 insertions(+)
 create mode 100644 python_scrapers/Halton.py
 create mode 100644 python_scrapers/Westminster_pycurl.py

diff --git a/python_scrapers/Halton.py b/python_scrapers/Halton.py
new file mode 100644
index 0000000..c10874c
--- /dev/null
+++ b/python_scrapers/Halton.py
@@ -0,0 +1,134 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+
+import cookielib
+
+cookie_jar = cookielib.CookieJar()
+
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+#date_format = "%d-%m-%Y"
+date_format = "%d/%m/%Y"
+received_date_format = "%d %B %Y"
+
+import re
+
+# We're going to use this for a re.split
+# A whitespace char, "of" or "at" (case-insensitive), and then a whitespace char.
+address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)
+
+class HaltonParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Halton Borough Council"
+        self.authority_short_name = "Halton"
+        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+#CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        # It seems dates are interpreted as midnight on the given day, so we search from that day to the day after.
+        post_data = urllib.urlencode(
+            [
+#                ("CaseNo", ""),
+#                ("AppName", ""),
+                ("DateApValFrom", search_day.strftime(date_format)),
+                ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
+#                ("AdrsNo", ""),
+#                ("StName", ""),
+#                ("StTown", ""),
+                ("DropWeekDate", "0"),#search_day.strftime(date_format)),
+                ("DropAppealStatus", "0"),
+#                ("DateAppealValFrom", ""),
+#                ("DateAppealValTo", ""),
+                ("PageSize", "10"),
+                ("Action", "Search"),
+            ]
+            )
+
+        request = urllib2.Request(self.base_url, post_data)
+
+        while request:
+            # Now get the search page
+            # We need to deal with cookies, since pagination depends on them.
+            cookie_jar.add_cookie_header(request)
+            response = urllib2.urlopen(request)
+
+            cookie_jar.extract_cookies(response, request)
+
+            soup = BeautifulSoup(response.read())
+
+            # This should find us each Case on the current page.
+            caseno_strings = soup.findAll(text="Case No:")
+
+            for caseno_string in caseno_strings:
+                application = PlanningApplication()
+
+                application.council_reference = caseno_string.findNext("td").string
+                application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
+
+                application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
+
+                # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts.
+                # As a first go, we'll try splitting the description on the last occurrence of " of " or " at ".
+
+                try:
+                    application.address = re.split(address_finder_re, application.description)[-1].strip()
+                except IndexError:
+                    # If we can't find of or at, we'll just have the description again - it's better than nothing.
+                    application.address = application.description
+
+                # We may as well get the postcode from the description rather than the address, in case things have gone wrong.
+                application.postcode = getPostcodeFromText(application.description)
+
+                application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])
+
+                # Now what to have as info url...
+                # There is no way to link to a specific app, so we'll just have the search page.
+                application.info_url = self.base_url
+
+                self._results.addApplication(application)
+
+            # Now we need to find the post data for the next page, if there is any.
+            # Find the form with id "formNext", if there is one.
+            next_form = soup.find("form", id="formNext")
+
+            if next_form is not None:
+                action = next_form['action']
+
+                # The HTML is borked - the inputs are outside the form, they are all
+                # in a td which follows it.
+
+                inputs = next_form.findNext("td").findAll("input")
+
+                post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
+
+                request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
+            else:
+                request = None
+
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = HaltonParser()
+    print parser.getResults(4,8,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 4914f1c..8aec0ed 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -52,3 +52,4 @@
 "Hounslow.py", "420"
 "Harrow.py", "420"
 "Westminster.py", "420"
+"Halton.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 1231f5f..311c2ed 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -256,3 +256,4 @@
 "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
 "London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
 "Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
+"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
diff --git a/python_scrapers/Westminster_pycurl.py b/python_scrapers/Westminster_pycurl.py
new file mode 100644
index 0000000..d5c8b68
--- /dev/null
+++ b/python_scrapers/Westminster_pycurl.py
@@ -0,0 +1,170 @@
+"""
+This is the screenscraper for Westminster City Council.
+
+I have just noticed that there is a PublicAccess site underneath all this, but
+it only has the apps for which they are currently accepting comments, so I think
+we may as well use this url and get the lot...
+
+This is the PublicAccess url:
+http://publicaccess.westminster.gov.uk/publicaccess/
+"""
+
+import urllib
+import urlparse
+
+import pycurl
+import StringIO
+
+import datetime, time
+import cgi
+
+import sys
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d%%2F%m%%2F%Y"
+
+class WestminsterParser:
+    def __init__(self, *args):
+
+        self.authority_name = "City of Westminster"
+        self.authority_short_name = "Westminster"
+        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+#        post_data = [
+#            ("EFNO", ""),
+#            ("STName", ""),
+#            ("STNUMB", ""),
+#            ("ADRSNO", ""),
+#            ("WARD", "AllWards"),
+#            ("AGT", ""),
+#            ("ATCDE", "AllApps"),
+#            ("DECDE", "AllDecs"),
+#            ("DTErec", search_day.strftime(date_format)),
+#            ("DTErecTo", search_day.strftime(date_format)),
+#            ("DTEvalid", ""),
+#            ("DTEvalidTo", ""),
+#            ("APDECDE", "AllAppDecs"),
+#            ("submit", "Start+Search"),
+#            ]
+        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
+
+        while post_data:
+
+
+            # Now get the search page
+
+            sys.stderr.write("Fetching: %s\n" %self.base_url)
+            sys.stderr.write("post data: %s\n" %post_data)
+
+
+            # This gives us something to use as the callback
+            fakefile = StringIO.StringIO()
+
+            curlobj = pycurl.Curl()
+            curlobj.setopt(pycurl.URL, self.base_url)
+            curlobj.setopt(pycurl.POST, True)
+            curlobj.setopt(pycurl.POSTFIELDS, post_data)
+            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
+            curlobj.setopt(pycurl.MAXREDIRS, 10)
+
+            curlobj.perform()
+
+            sys.stderr.write("Got it\n")
+            soup = BeautifulSoup(fakefile.getvalue())
+
+            # We may as well free up the memory used by fakefile
+            fakefile.close()
+
+            sys.stderr.write("Created soup\n")
+
+            results_form = soup.find("form", {"name": "currentsearchresultsNext"})
+
+            # Sort out the post_data for the next page, if there is one
+            # If there is no next page then there will be no inputs in the form.
+            # In this case, post_data will be '', which is false.
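+            # (urllib.urlencode of an empty list is "", which is falsy, so the
+            # while loop stops once the last page has been handled.)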
+
+            sys.stderr.write("Found form containing results\n")
+
+            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
+
+            sys.stderr.write("Got post data\n")
+
+            # Each result has one link, and they are the only links in the form
+
+            links = results_form.findAll("a")
+
+            sys.stderr.write("Got list of links\n")
+
+            for link in links:
+
+                sys.stderr.write("Working on link: %s\n" %link['href'])
+
+                application = PlanningApplication()
+
+                application.date_received = search_day
+                application.info_url = urlparse.urljoin(self.base_url, link['href'])
+                application.council_reference = link.string.strip()
+
+                application.address = link.findNext("td").string.strip()
+                application.postcode = getPostcodeFromText(application.address)
+
+                application.description = link.findNext("tr").findAll("td")[-1].string.strip()
+
+                # To get the comment url, we're going to have to go to each info url :-(
+
+                sys.stderr.write("Fetching: %s\n" %application.info_url)
+
+
+                fakefile = StringIO.StringIO()
+
+
+                curlobj.setopt(pycurl.HTTPGET, True)
+                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+
+                # We have to convert the info url to ascii for curl
+                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
+
+                curlobj.perform()
+
+                sys.stderr.write("Got it\n")
+
+                info_soup = BeautifulSoup(fakefile.getvalue())
+
+                fakefile.close()
+
+                comment_nav_string = info_soup.find(text="Comment on this case")
+                if comment_nav_string:
+                    application.comment_url = comment_nav_string.parent['href']
+                else:
+                    application.comment_url = "No Comments"
+
+                #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
+
+                self._results.addApplication(application)
+
+                sys.stderr.write("Finished that link\n")
+
+
+        sys.stderr.write("Finished while loop, returning stuff.\n")
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = WestminsterParser()
+    print parser.getResults(1,8,2008)
+
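
A quick illustration of the address heuristic used in Halton.py above, run on a made-up
description string (the street name and town here are invented, not taken from real
council data):

    import re

    # Split on " of " or " at " (case-insensitive) and keep the last piece,
    # which is hopefully the address part of the description.
    address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)

    description = "Erection of a two storey extension at 1 Example Road, Widnes"
    parts = re.split(address_finder_re, description)

    # If neither " of " nor " at " appears, re.split returns the whole
    # description as a single element, so parts[-1] is still usable.
    print parts[-1].strip()   # prints: 1 Example Road, Widnes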