Fix Stockport (which is now acolnet, not PlanningExplorer).

Add Blackpool and Greenwich (acolnet sites - patch from Peter Collingbourne). Fixes to Ocella parser (also from Peter), which add Bridgend, Castle Point, Great Yarmouth and Havering. Fix typo and escaping in getinvolved (patch from David Sheldon).
18年前 · 884e8f97ec
--- a/docs/templates/getinvolved.tpl
+++ b/docs/templates/getinvolved.tpl
@@ -7,13 +7,13 @@
        You can help by writing a <a href="http://en.wikipedia.org/wiki/Screen_scraping">screen scraper</a> for your local authority that we can import into planningalerts.com. There are only 2 criteria for the screen scraper:
    </p>
    <ol>
        <li>That it can output data in the following format: <a href="http://www.planningalerts.com/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
        <li>That it can accept a query sting in the format day=X&month=Y&year=Z</li>
        <li>That it can output data in the following format: <a href="/lambeth.xml">http://www.planningalerts.com/lambeth.xml</a></li>
        <li>That it can accept a query string in the format day=X&amp;month=Y&amp;year=Z</li>
    </ol>
    <p>
       Other than that it's up to you. It can be in any language. You can host them yourself or we can host it for you.
    </p>
    <p><span class="highlight">You can grab the code for this site and view some developent tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
    <p><span class="highlight">You can grab the code for this site and view some development tickets <a href="http://code.google.com/p/planningalerts/">here</a> and join our developer mailing list <a href="http://groups.google.com/group/planningalerts-dev"> here</a>.</span>
    </p>
    <h3>I work for a local authority and would like to make our data available</h3>
    <p>
--- a/python_scrapers/AcolnetParser.py
+++ b/python_scrapers/AcolnetParser.py
@@ -34,6 +34,7 @@ end_head_regex = re.compile("</head>?", re.IGNORECASE)


 class AcolnetParser:
    received_date_label = "Registration Date:"
    received_date_format = "%d/%m/%Y"

    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
@@ -55,7 +56,7 @@ class AcolnetParser:
        return app_table.a.string.strip()

    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split())
        date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))

@@ -205,6 +206,15 @@ class BridgnorthParser(AcolnetParser):
 #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
        return self._current_application.info_url.replace("NewPages", "PgeCommentForm")

 # Acolnet variant for Blackpool: overrides the received-date label, the
 # selection of result sections and the comment URL of AcolnetParser.
 class BlackpoolParser(AcolnetParser):
    # Label text that AcolnetParser._getDateReceived searches for; this site
    # uses "Application Date:" instead of the default "Registration Date:".
    received_date_label = "Application Date:"

    # Return every <table> in the parsed page whose class attribute is
    # "acolnet-results-table".
    def _getResultsSections(self, soup):
        return soup.findAll("table", {"class": "acolnet-results-table"})

    # Build the comment URL by appending the council reference to the
    # neighbour response form address, with each '/' in the reference
    # replaced by its percent-encoded form (%2F).
    def _getCommentUrl(self, app_table):
        ref = self._getCouncilReference(app_table)
        return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/','%2F')

 class CanterburyParser(AcolnetParser):
    """Here the apps are one row each in a big table."""
@@ -227,14 +237,17 @@ class CanterburyParser(AcolnetParser):
    def _getDescription(self, app_table):
        return app_table.findAll("td")[2].string.strip()        

 #Kensington and chelsea is sufficiently different, it may as well be handled separately    
 class GreenwichParser(AcolnetParser):
    received_date_label = "Registration date:"
    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"

 # Mid Bedfordshire - there is an acolnet here, but you have to have a username
 # and password to access it!
    def _getInfoUrl(self, app_table):
        return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1)

 #Kensington and chelsea is sufficiently different, it may as well be handled separately    

 class MidBedsParser(AcolnetParser):
    def _getCouncilReference(self, app_table):
 #        return app_table.findAll("a")[1].string.strip()
        return app_table.findAll("a")[1].string.strip()
    
 class OldhamParser(AcolnetParser):
@@ -361,11 +374,11 @@ if __name__ == '__main__':
    #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
 #    parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

 #    parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
 #    parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

 #    parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
 #    parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
 #    parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    print parser.getResults(day, month, year)
    
--- a/python_scrapers/Ocella.py
+++ b/python_scrapers/Ocella.py
@@ -50,6 +50,7 @@ class OcellaParser:
        # These will be used to store the column numbers of the appropriate items in the results table
        self.reference_col = None
        self.address_col = None
        self.applicant_col = None
        self.description_col = None
        self.received_date_col = None
        self.accepted_date_col = None
@@ -59,6 +60,7 @@ class OcellaParser:

        # First get the search page
        get_request = urllib2.Request(self.base_url)
        get_request.add_header('Accept', 'text/html')
        get_response = urllib2.urlopen(get_request)

        cookie_jar.extract_cookies(get_response, get_request)
@@ -75,6 +77,14 @@ class OcellaParser:
            # but it seems we don't need it...
            session_id = None

        # Unless we retrieve the correct form name, we will simply get the last week's applications
        submit_tag = get_soup.find('input', {'value': 'Search'}) or get_soup.find('input', {'value': 'Search for Applications'}) or get_soup.find('input', {'value': 'Submit'})
        try:
            submit_name = submit_tag['name']
            form_name = submit_name.split('.')[0]
        except TypeError:
            form_name = 'FRM_PLANNING_LIST'

 # # From Breckland

 # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
@@ -88,21 +98,23 @@ class OcellaParser:
 # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

        post_data = urllib.urlencode(
            [('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
            [('p_object_name', form_name + '.DEFAULT.SUBMIT_TOP.01'),
             ('p_instance', '1'),
             ('p_event_type', 'ON_CLICK'),
             ('p_user_args', ''),
             ('p_session_id', session_id),
             ('p_page_url', self.base_url),
             ('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
             ('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
             ('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
             (form_name + '.DEFAULT.AGENT.01', ''),
             (form_name + '.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
             (form_name + '.DEFAULT.PARISH.01', ''),
                ]
            )
        
        post_request = urllib2.Request(action, post_data)
        cookie_jar.add_cookie_header(post_request)

        post_request.add_header('Accept', 'text/html')
        post_request.add_header('Referer', self.base_url)

        post_response = cookie_handling_opener.open(post_request)
@@ -119,10 +131,12 @@ class OcellaParser:
        th_index = 0
        for th in ths:
            th_content = th.font.string.strip()
            if th_content == 'Reference' or th_content == 'Application Ref':
            if th_content == 'Reference' or th_content == 'Application Ref' or th_content == 'Application Number':
                self.reference_col = th_index
            elif th_content == 'Location':
                self.address_col = th_index
            elif th_content == 'Applicant Details':
                self.applicant_col = th_index
            elif th_content == 'Proposal':
                self.description_col = th_index
            elif th_content == 'Development Description':
@@ -159,8 +173,12 @@ class OcellaParser:

            self._current_application.address = tds[self.address_col].font.string.strip()
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            if self._current_application.postcode is None and self.applicant_col is not None:
                # won't always be accurate to do this but better than nothing (needed for Havering)
                self._current_application.postcode = getPostcodeFromText(tds[self.applicant_col].font.string.strip())
            self._current_application.description = tds[self.description_col].font.string.strip()
            self._current_application.info_url = tds[self.reference_col].a['href']
            # seems to be dependent on the implementation whether the URL is encoded (e.g. Great Yarmouth does this), so we cannot do anything more "standard"
            self._current_application.info_url = urlparse.urljoin(post_response.geturl(), tds[self.reference_col].a['href'].replace('&amp;','&'))

 # This is what a comment url looks like
 # It seems to be no problem to remove the sessionid (which is in any case blank...)
@@ -184,20 +202,13 @@ if __name__ == '__main__':
 #    parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")


    # Bad status line? Try changing browser id string?
 #    parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
 #    parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")

 # Post never comes back
 #    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

    # Can't find the URL similar to the others, even though it is clearly Ocella
    # We get a 406 at the moment. Try browser id string?
    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search")
    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")


    print parser.getResults(21,5,2008)
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -192,7 +192,6 @@
 "Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser"
 "Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser"
 "Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
 "Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser"
 "East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
 "North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser"
@@ -213,9 +212,9 @@
 "Ellesmere Port and Neston Borough Council", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Fareham Borough Council", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "London Borough of Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
 "Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "MidBedsParser"
 "Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
@@ -237,3 +236,10 @@
 "Lewes District Council", "Lewes", "http://planning.lewes.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
 "Olympic Delivery Authority", "Olympics", "http://planning.london2012.com/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
 "Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BlackpoolParser"
 "London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "GreenwichParser"
 "Bridgend County Borough Council", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "London Borough of Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Castle Point Borough Council", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"
 "Great Yarmouth Borough Council", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly", "Ocella", "OcellaParser"
--- a/python_scrapers/SwiftLG.py
+++ b/python_scrapers/SwiftLG.py
@@ -215,7 +215,8 @@ if __name__ == '__main__':
 #    parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
 #    parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
 #    parser = SwiftLGParser("Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display")
    parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
 #    parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
    parser = SwiftLGParser("Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display")
    print parser.getResults(26,6,2008)