
Add scraper for Gosport.

Factor out CookieAddingHTTPRedirectHandler.
master
duncan.parkes 15 years ago
commit 53d0b25f78
5 changed files with 124 additions and 10 deletions

  1. python_scrapers/Gosport.py            +100  -0
  2. python_scrapers/HTTPHandlers.py        +18  -0
  3. python_scrapers/Ocella.py               +3  -10
  4. python_scrapers/OtherFilesToCopy.csv    +2  -0
  5. python_scrapers/SitesToGenerate.csv     +1  -0

python_scrapers/Gosport.py  +100 -0

@@ -0,0 +1,100 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import re

import cookielib

cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText


from HTTPHandlers import CookieAddingHTTPRedirectHandler

cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))


search_date_format = "%m/%d/%Y" # That's right, the search date is US style.
info_page_date_format = "%d/%m/%Y" # and the info page is UK style.

class GosportParser:
    def __init__(self, *args):

        self.authority_name = "Gosport Borough Council"
        self.authority_short_name = "Gosport"

        self.base_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationSearch2.aspx"
        self.info_url = "http://www.gosport.gov.uk/gbcplanning/ApplicationDetails.aspx?ID=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # Fetch the search page first, so the cookie jar picks up the session cookie.
        get_request = urllib2.Request(self.base_url)
        get_response = urllib2.urlopen(get_request)
        cookie_jar.extract_cookies(get_response, get_request)
        get_soup = BeautifulSoup(get_response.read())

        post_data = (
            ("__VIEWSTATE", get_soup.find("input", {"name": "__VIEWSTATE"})["value"]),
            ("pgid", get_soup.find("input", {"name": "pgid"})["value"]),
            ("action", "Search"),
#            ("ApplicationSearch21%3AtbDevAddress", ""),
#            ("ApplicationSearch21%3AtbApplicantName", ""),
#            ("ApplicationSearch21%3AtbAgentName", ""),
            ("ApplicationSearch21:tbDateSubmitted", search_date.strftime(search_date_format)),
            ("ApplicationSearch21:btnDateSubmitted", "Search"),
#            ("ApplicationSearch21%3AtbDateDetermined", ""),
            )

        post_request = urllib2.Request(self.base_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)
        post_response = cookie_handling_opener.open(post_request)

        post_soup = BeautifulSoup(post_response.read())

        # Discard the first <tr>, which contains headers.
        trs = post_soup.find("table", id="SearchResults1_dgSearchResults").findAll("tr")[1:]

        for tr in trs:
            application = PlanningApplication()
            tds = tr.findAll("td")

            application.council_reference = tds[0].string.strip()
            application.address = tds[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[2].string.strip()

            application.date_received = datetime.datetime(*(time.strptime(tds[3].string.strip(), info_page_date_format)[0:6]))
            application.info_url = self.info_url % (application.council_reference)

            # The comment url must be accessed by a POST, so we'll just use the info url for that as well.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = GosportParser()
    print parser.getResults(1, 10, 2008)
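
A quick sketch of the two date formats in play, since mixing them up is an easy
mistake; the date values below are arbitrary examples, not part of the scraper:

import datetime, time

search_date_format = "%m/%d/%Y"     # US style: what the search form expects
info_page_date_format = "%d/%m/%Y"  # UK style: what the results pages display

d = datetime.date(2008, 10, 1)
print d.strftime(search_date_format)   # prints "10/01/2008" for tbDateSubmitted
# Parsing a UK-style results date such as "01/10/2008" back into a datetime:
print datetime.datetime(*(time.strptime("01/10/2008", info_page_date_format)[0:6]))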


python_scrapers/HTTPHandlers.py  +18 -0

@@ -0,0 +1,18 @@

from urllib2 import HTTPRedirectHandler

class CookieAddingHTTPRedirectHandler(HTTPRedirectHandler):
    """The standard Python HTTPRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""

    def __init__(self, cookie_jar):
        self.cookie_jar = cookie_jar

        # This really ought to call the superclass's __init__ method, but there doesn't seem to be one.

    def redirect_request(self, *args):
        new_request = HTTPRedirectHandler.redirect_request(self, *args)
        # We need to add a cookie from the cookie_jar to the redirected request.
        self.cookie_jar.add_cookie_header(new_request)

        return new_request
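
A minimal usage sketch for the factored-out handler, mirroring what Gosport.py
and Ocella.py do below; the URL and POST body here are placeholders, not real
endpoints:

import urllib2
import cookielib

from HTTPHandlers import CookieAddingHTTPRedirectHandler

cookie_jar = cookielib.CookieJar()
opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))

# Prime the jar with whatever session cookie the site sets on a first request.
request = urllib2.Request("http://example.com/search")  # placeholder URL
response = urllib2.urlopen(request)
cookie_jar.extract_cookies(response, request)

# Requests opened through this opener keep their cookie across a 302 redirect,
# which the stock urllib2.HTTPRedirectHandler wouldn't do.
post_request = urllib2.Request("http://example.com/search", "action=Search")
cookie_jar.add_cookie_header(post_request)
result = opener.open(post_request)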

python_scrapers/Ocella.py  +3 -10

@@ -18,16 +18,9 @@ search_date_format = "%d-%m-%Y" # Format used for the accepted date when searchi
 
 possible_date_formats = [search_date_format, "%d/%m/%Y"]
 
-class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
-    """The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
-    def redirect_request(self, req, fp, code, msg, headers, newurl):
-        new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
-        # We need to add a cookie from the cookie_jar
-        cookie_jar.add_cookie_header(new_request)
-
-        return new_request
-
-cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())
+from HTTPHandlers import CookieAddingHTTPRedirectHandler
+
+cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler(cookie_jar))
 
 
 class OcellaParser:
@@ -206,7 +199,7 @@
 #    parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
 #    parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
-    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
+#    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
     parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")



python_scrapers/OtherFilesToCopy.csv  +2 -0

@@ -1,6 +1,7 @@
 "filename", "permissions"
 "PublicAccess.py", "420"
 "PlanningUtils.py", "420"
+"HTTPHandlers.py", "420"
 "SouthOxfordshireParser.py", "420"
 "SouthOxfordshire.cgi", "493"
 "ApplicationSearchServletParser.py", "420"
@@ -58,3 +59,4 @@
 "Herefordshire.py", "420"
 "Exmoor.py", "420"
 "Eastbourne.py", "420"
+"Gosport.py", "420"

python_scrapers/SitesToGenerate.csv  +1 -0

@@ -264,3 +264,4 @@
 "Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"
 "Waltham Forest Council", "Waltham Forest", "http://planning.walthamforest.gov.uk/", "PlanningExplorer", "WalthamForestParser"
 "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
