Add Ocella scraper that works for seven councils. Woop!

18 лет назад · 9c238b2f15
--- a/python_scrapers/Ocella.py
+++ b/python_scrapers/Ocella.py
@@ -2,7 +2,7 @@ import urllib2
 import urllib
 import urlparse

 import datetime
 import datetime, time

 import cookielib

@@ -14,9 +14,25 @@ from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 date_format = "%d-%m-%Y"
 search_date_format = "%d-%m-%Y" # Format used for the accepted date when searching

 # Formats tried in turn when parsing the date string read from the results table.
 possible_date_formats = [search_date_format, "%d/%m/%Y"]

 class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that puts a cookie header on the follow-up request.

    The standard python HTTPRedirectHandler doesn't add a cookie to the new
    request after a 302; this subclass adds one from the module-level
    cookie_jar (which is defined outside this class).
    """
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Let the stock handler build the request for the redirect target.
        redirected = urllib2.HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)

        # Attach the cookie header from the shared jar before handing it back.
        cookie_jar.add_cookie_header(redirected)
        return redirected

 # Opener built around CookieAddingHTTPRedirectHandler; the search POST is sent
 # through it so that redirected requests get a cookie header.
 cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())


 class OcellaParser:
    received_date_format = search_date_format

    def __init__(self,
                 authority_name,
                 authority_short_name,
@@ -31,6 +47,12 @@ class OcellaParser:

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # These will be used to store the column numbers of the appropriate items in the results table
        self.reference_col = None
        self.address_col = None
        self.description_col = None
        self.received_date_col = None
        self.accepted_date_col = None

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)
@@ -47,16 +69,28 @@ class OcellaParser:
        action = get_soup.form['action']
        session_id = get_soup.find('input', {'name': 'p_session_id'})['value']

 # # From Breckland

 # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
 # p_instance=1
 # p_event_type=ON_CLICK
 # p_user_args=
 # p_session_id=53573
 # p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
 # FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
 # FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
 # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=

        post_data = urllib.urlencode(
            [#('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
             #('p_instance', '1'),
             #('p_event_type', 'ON_CLICK'),
             #('p_user_args', ''),
            [('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
             ('p_instance', '1'),
             ('p_event_type', 'ON_CLICK'),
             ('p_user_args', ''),
             ('p_session_id', session_id),
             #('p_page_url', self.base_url),
             ('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', '02-06-2008'), #search_date.strftime(date_format),
             ('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', '09-06-2008'),#search_date.strftime(date_format),
             #('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
             ('p_page_url', self.base_url),
             ('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
             ('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
             ('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
                ]
            )
        
@@ -65,39 +99,101 @@ class OcellaParser:

        post_request.add_header('Referer', self.base_url)

        post_response = urllib2.urlopen(post_request)
        post_response = cookie_handling_opener.open(post_request)

        import pdb;pdb.set_trace()
        post_soup = BeautifulSoup(post_response.read())

 # # From Breckland
        results_table = post_soup.find("table", summary="Printing Table Headers")

 # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
 # p_instance=1
 # p_event_type=ON_CLICK
 # p_user_args=
 # p_session_id=53573
 # p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
 # FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
 # FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
 # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
        trs = results_table.findAll("tr")

 # # Mine
 # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
 # p_user_args=
 # FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=21-05-2008
 # FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=21-05-2008
 # p_session_id=53576
 # p_instance=1
 # p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
 # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
 # p_event_type=ON_CLICK
        # We'll use the headings in the first tr to find out what columns the address, description, etc are in.
        ths = trs[0].findAll("th")

        th_index = 0
        for th in ths:
            th_content = th.font.string.strip()
            if th_content == 'Reference' or th_content == 'Application Ref':
                self.reference_col = th_index
            elif th_content == 'Location':
                self.address_col = th_index
            elif th_content == 'Proposal':
                self.description_col = th_index
            elif th_content == 'Development Description':
                self.description_col = th_index
            elif th_content == 'Received Date' or th_content == 'Date Received':
                self.received_date_col = th_index
            elif th_content == 'Accepted Date':
                self.accepted_date_col = th_index

            th_index += 1
            
        # If there is a received date, we'll use that, otherwise, we'll have to settle for the accepted date.
        self.received_date_col = self.received_date_col or self.accepted_date_col

        # We want all the trs except the first one, which is just headers, 
        # and the last, which is empty
        trs = trs[1:-1]

        for tr in trs:
            self._current_application = PlanningApplication()

            tds = tr.findAll("td")

            self._current_application.council_reference = (tds[self.reference_col].font.a or tds[self.reference_col].a.font).string.strip()

            date_string = tds[self.received_date_col]

            for possible_format in possible_date_formats:
                
                try:
                    self._current_application.date_received = datetime.datetime(*(time.strptime(tds[self.received_date_col].font.string.strip(), possible_format)[0:6]))
                except ValueError:
                    pass

            self._current_application.address = tds[self.address_col].font.string.strip()
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            self._current_application.description = tds[self.description_col].font.string.strip()
            self._current_application.info_url = tds[self.reference_col].a['href']

 # This is what a comment url looks like
 # It seems to be no problem to remove the sessionid (which is in any case blank...)
 # I can't see a good way to avoid having to go to the info page to find the moduleid though.

 #http://wplan01.intranet.breckland.gov.uk:7778/pls/portal/PORTAL.wwa_app_module.link?p_arg_names=_moduleid&p_arg_values=8941787057&p_arg_names=_sessionid&p_arg_values=&p_arg_names=APPLICATION_REFERENCE&p_arg_values=3PL%2F2008%2F0877%2FF

            # For the moment, we'll just use the info url, as that seems to work.
            self._current_application.comment_url = self._current_application.info_url
            
            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        """Return the displayXML() output of the results for the given date.

        day, month and year are each passed through int() before being
        handed on to getResultsByDayMonthYear.
        """
        results = self.getResultsByDayMonthYear(
            int(day),
            int(month),
            int(year),
        )
        return results.displayXML()


 if __name__ == '__main__':
    # Manual check: build a parser for one council and print the output of
    # getResults for 21 May 2008. The commented-out lines are alternative
    # councils that can be swapped in by hand.
    parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Arun", "Arun", "http://www.arun.gov.uk/iplanning/portal/page?_pageid=33,4139&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")

    # Bad status line?
 #    parser = BrecklandParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
 #    parser = ArunParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

    # Can't find the URL similar to the others, even though it is clearly Ocella
    # NOTE(review): as written here, parser is assigned a second time, so only
    # this Great Yarmouth parser is the one used by the print below - confirm
    # which assignment is meant to be active.
    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://www.great-yarmouth.gov.uk/wmplan_application_search-6.htm")



    print parser.getResults(21,5,2008)

 #TODO

 # 1) Sort out proper comment url?
 # 2) Check for pagination
 # 3) Check no results case