Also adding the pycurl scraper for Westminster, just in case it is useful to remind us how to do stuff later.
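For quick reference later, here is a minimal sketch of the pycurl fetch-and-parse pattern the Westminster scraper below uses. The URL and form fields are made-up placeholders, not a real council search page; the real options are the ones set in WestminsterParser.getResultsByDayMonthYear.

import urllib
import StringIO

import pycurl

from BeautifulSoup import BeautifulSoup

# Placeholder search page and form fields - swap in the real ones.
url = "http://www.example.gov.uk/planningsearch/results.cfm"
post_data = urllib.urlencode([("DateReceived", "01/08/2008"), ("submit", "Search")])

# pycurl writes the response body through a callback, so collect it in a StringIO.
fakefile = StringIO.StringIO()

curlobj = pycurl.Curl()
curlobj.setopt(pycurl.URL, url)
curlobj.setopt(pycurl.POST, True)
curlobj.setopt(pycurl.POSTFIELDS, post_data)
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
curlobj.setopt(pycurl.FOLLOWLOCATION, True)
curlobj.setopt(pycurl.MAXREDIRS, 10)
curlobj.perform()
curlobj.close()

soup = BeautifulSoup(fakefile.getvalue())
fakefile.close()

print soup.title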
@@ -0,0 +1,134 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import cookielib

cookie_jar = cookielib.CookieJar()

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

#date_format = "%d-%m-%Y"
date_format = "%d/%m/%Y"
received_date_format = "%d %B %Y"

import re

# We're going to use this for a re.split.
# A whitespace char, "of" or "at" (case independent), and then a whitespace char.
address_finder_re = re.compile("\s(?:of|at)\s", re.I)
class HaltonParser:
    def __init__(self, *args):
        self.authority_name = "Halton Borough Council"
        self.authority_short_name = "Halton"
        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)
        # It seems dates are interpreted as midnight on the given day, so to catch
        # everything from the search day we search up to the following day.
        post_data = urllib.urlencode(
            [
#                ("CaseNo", ""),
#                ("AppName", ""),
                ("DateApValFrom", search_day.strftime(date_format)),
                ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
#                ("AdrsNo", ""),
#                ("StName", ""),
#                ("StTown", ""),
                ("DropWeekDate", "0"), #search_day.strftime(date_format)),
                ("DropAppealStatus", "0"),
#                ("DateAppealValFrom", ""),
#                ("DateAppealValTo", ""),
                ("PageSize", "10"),
                ("Action", "Search"),
                ]
            )
        request = urllib2.Request(self.base_url, post_data)

        while request:
            # Now get the search page.
            # We need to deal with cookies, since pagination depends on them.
            cookie_jar.add_cookie_header(request)
            response = urllib2.urlopen(request)
            cookie_jar.extract_cookies(response, request)

            soup = BeautifulSoup(response.read())

            # This should find us each case on the current page.
            caseno_strings = soup.findAll(text="Case No:")
            for caseno_string in caseno_strings:
                application = PlanningApplication()

                application.council_reference = caseno_string.findNext("td").string
                application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
                application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
                # The address here is included in the description. We'll have to do
                # some heuristics to try to work out where it starts.
                # As a first go, we'll try splitting the description on the last
                # occurrence of " of " or " at ".
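                # For instance, a made-up description like
                # "Erection of a two storey side extension at 14 Sandy Lane, Runcorn"
                # splits on " of " and " at ", and the last piece,
                # "14 Sandy Lane, Runcorn", is taken as the address.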
                try:
                    application.address = re.split(address_finder_re, application.description)[-1].strip()
                except IndexError:
                    # If we can't find "of" or "at", we'll just have the description
                    # again; it's better than nothing.
                    application.address = application.description
                # We may as well get the postcode from the description rather than
                # the address, in case things have gone wrong.
                application.postcode = getPostcodeFromText(application.description)

                application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])

                # Now what to have as info url...
                # There is no way to link to a specific app, so we'll just have the search page.
                application.info_url = self.base_url

                self._results.addApplication(application)
            # Now we need to find the post data for the next page, if there is any.
            # Find the form with id "formNext", if there is one.
            next_form = soup.find("form", id="formNext")

            if next_form is not None:
                action = next_form['action']

                # The HTML is borked - the inputs are outside the form; they are all
                # in a td which follows it.
                inputs = next_form.findNext("td").findAll("input")
                post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])

                request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
            else:
                request = None

        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HaltonParser()
    print parser.getResults(4,8,2008)
@@ -52,3 +52,4 @@
"Hounslow.py", "420"
"Harrow.py", "420"
"Westminster.py", "420"
"Halton.py", "420"
@@ -256,3 +256,4 @@
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
"London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
@@ -0,0 +1,170 @@
"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess site underneath all this, but
it only has the apps for which they are accepting comments, so I think
we may as well use this URL and get the lot...

This is the PublicAccess URL:
http://publicaccess.westminster.gov.uk/publicaccess/
"""
import urllib
import urlparse

import pycurl
import StringIO

import datetime, time
import cgi
import sys

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"

class WestminsterParser:
    def __init__(self, *args):
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

#        post_data = [
#            ("EFNO", ""),
#            ("STName", ""),
#            ("STNUMB", ""),
#            ("ADRSNO", ""),
#            ("WARD", "AllWards"),
#            ("AGT", ""),
#            ("ATCDE", "AllApps"),
#            ("DECDE", "AllDecs"),
#            ("DTErec", search_day.strftime(date_format)),
#            ("DTErecTo", search_day.strftime(date_format)),
#            ("DTEvalid", ""),
#            ("DTEvalidTo", ""),
#            ("APDECDE", "AllAppDecs"),
#            ("submit", "Start+Search"),
#            ]

        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
        while post_data:
            # Now get the search page.
            sys.stderr.write("Fetching: %s\n" %self.base_url)
            sys.stderr.write("post data: %s\n" %post_data)

            # This gives us something to use as the write callback.
            fakefile = StringIO.StringIO()

            curlobj = pycurl.Curl()
            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.POST, True)
            curlobj.setopt(pycurl.POSTFIELDS, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
            curlobj.setopt(pycurl.MAXREDIRS, 10)

            curlobj.perform()

            sys.stderr.write("Got it\n")

            soup = BeautifulSoup(fakefile.getvalue())

            # We may as well free up the memory used by fakefile.
            fakefile.close()

            sys.stderr.write("Created soup\n")

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

            # Sort out the post_data for the next page, if there is one.
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.

            sys.stderr.write("Found form containing results\n")

            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form.
            links = results_form.findAll("a")

            sys.stderr.write("Got list of links\n")
            for link in links:
                sys.stderr.write("Working on link: %s\n" %link['href'])

                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()

                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)

                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(
                sys.stderr.write("Fetching: %s\n" %application.info_url)

                fakefile = StringIO.StringIO()

                curlobj.setopt(pycurl.HTTPGET, True)
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                # We have to convert the info url to ascii for curl.
                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

                curlobj.perform()

                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(fakefile.getvalue())
                fakefile.close()

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    application.comment_url = "No Comments"

                #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                self._results.addApplication(application)

                sys.stderr.write("Finished that link\n")

        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = WestminsterParser()
    print parser.getResults(1,8,2008)