
Adding Planet scraper (not quite finished, but this time I don't want to lose it!)

master
duncan.parkes 16 years ago
parent
commit
bf9599d38c
2 changed files with 156 additions and 1 deletion
  1. +1 -1 python_scrapers/AcolnetParser.py
  2. +155 -0 python_scrapers/Planet.py

+ 1 - 1  python_scrapers/AcolnetParser.py

@@ -359,6 +359,6 @@ if __name__ == '__main__':
     #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
     #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
     #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
-    #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
+    parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
     print parser.getResults(day, month, year)
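For reference, these scraper modules all share the same entry point: construct a parser with an authority name, a short name, and a base search url, then call getResults(day, month, year) for one day's applications. A hypothetical standalone invocation of the line enabled above, assuming AcolnetParser is importable and follows the same getResults contract shown in Planet.py below:

from AcolnetParser import AcolnetParser

parser = AcolnetParser("Stockport Metropolitan Borough Council",
                       "Stockport",
                       "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# One day's worth of applications, rendered as XML.
print parser.getResults(21, 5, 2008)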

+ 155 - 0  python_scrapers/Planet.py

@@ -0,0 +1,155 @@
import urllib2
import urllib
import urlparse

from cgi import parse_qs

import datetime

import cookielib

cookie_jar = cookielib.CookieJar()

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
     PlanningAuthorityResults, \
     getPostcodeFromText

date_format = "%d/%m/%Y"

class PlanetParser:
    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        # What is the serviceKey for this council? It's in our base url.
        query_string = urlparse.urlsplit(self.base_url)[3]

        # This query string just contains the serviceKey.
        query_dict = parse_qs(query_string)

        service_key = query_dict['serviceKey'][0]

        # First get the form.
        get_request = urllib2.Request(self.base_url)
        get_response = urllib2.urlopen(get_request)

        cookie_jar.extract_cookies(get_response, get_request)

        # We also need to get the security token from the form.
        get_soup = BeautifulSoup(get_response.read())

        security_token = get_soup.find('input', {'name': 'securityToken'})['value']

        # Now post to it.
        search_date = datetime.date(year, month, day)

        search_data = urllib.urlencode(
            {
                "serviceKey": service_key,
                "securityToken": security_token,
                "STEP": "Planet_SearchCriteria",
                #X.resultCount=
                "X.pageNumber": "0",
                "X.searchCriteria_StartDate": search_date.strftime(date_format),
                "X.searchCriteria_EndDate": search_date.strftime(date_format),
            }
        )

        post_request = urllib2.Request(self.base_url, search_data)
        cookie_jar.add_cookie_header(post_request)

        post_response = urllib2.urlopen(post_request)

        post_soup = BeautifulSoup(post_response.read())

        # Now we need to find the results. We do this by searching for the
        # text "Ref No" and then going forward from there. For some reason a
        # search for the table itself gets the table without its contents.
        ref_no_text = post_soup.find(text="Ref No")

        first_tr = ref_no_text.findNext("tr")

        other_trs = first_tr.findNextSiblings()

        trs = [first_tr] + other_trs

        for tr in trs:
            self._current_application = PlanningApplication()

            # We don't need to parse the date - it's the one we searched for.
            self._current_application.date_received = search_date

            tds = tr.findAll("td")

            self._current_application.council_reference = tds[0].a.string.strip()
            self._current_application.address = tds[1].string.strip()
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)

            self._current_application.description = tds[2].string.strip()

            # There is no good info url, so we just give the search page.
            self._current_application.info_url = self.base_url

            # Similarly for the comment url.
            self._current_application.comment_url = self.base_url

            self._results.addApplication(self._current_application)

        return self._results


# post data for worcester
# hopefully we can ignore almost all of this...

#ACTION=NEXT
#serviceKey=SysDoc-PlanetApplicationEnquiry
#serviceGeneration=
#securityToken=NTgxMjE3OTExMjA4OQ%3D%3D
#enquiry=
#STEP=Planet_SearchCriteria
#RECEIVED=
#COMMENTS=
#LAST_UPDATED=
#status=
#X.endEnquiry=
#X.resultCount=
#X.applicationNotFound=
#X.pageNumber=0
#X.searchCriteria_ApplicationReference=
#X.searchCriteria_StartDate=20%2F04%2F2008
#X.searchCriteria_EndDate=20%2F04%2F2008
#X.searchCriteria_Ward=
#X.searchCriteria_Parish=
#X.searchCriteria_Address=
#X.searchCriteria_Postcode=
#X.searchCriteria_ApplicantName=
#X.searchCriteria_AgentName=
#X.searchCriteria_UndecidedApplications=

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
    # parser = PlanetParser("Worcester City Council", "Worcester", "http://www.worcester.gov.uk:8080/planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry", debug=True)
    # parser = PlanetParser("Elmbridge Borough Council", "Elmbridge", "http://www2.elmbridge.gov.uk/Planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry")
    # parser = PlanetParser("North Lincolnshire Council", "North Lincolnshire", "http://www.planning.northlincs.gov.uk/planet/ispforms.asp?ServiceKey=SysDoc-PlanetApplicationEnquiry")
    # parser = PlanetParser("Ryedale District Council", "Ryedale", "http://www.ryedale.gov.uk/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry")
    parser = PlanetParser("Tewkesbury Borough Council", "Tewkesbury", "http://planning.tewkesbury.gov.uk/Planet/ispforms.asp?serviceKey=07WCC04163103430")
    print parser.getResults(21, 5, 2008)
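For quick testing outside the class, the GET-token-then-POST round trip the scraper relies on can be sketched as a standalone function. This is a minimal sketch under the same assumptions as the commit (Python 2, BeautifulSoup 3, a serviceKey in the base url's query string, and the POST parameters documented in the Worcester post data above); the helper name fetch_results_page is hypothetical and not part of the commit.

import urllib
import urllib2
import urlparse
import cookielib

from cgi import parse_qs

from BeautifulSoup import BeautifulSoup

def fetch_results_page(base_url, date_string):
    # The serviceKey identifying the council lives in base_url's query string.
    service_key = parse_qs(urlparse.urlsplit(base_url)[3])['serviceKey'][0]

    cookie_jar = cookielib.CookieJar()

    # GET the search form first: this sets the session cookie and embeds
    # the securityToken we must echo back in the POST.
    get_request = urllib2.Request(base_url)
    get_response = urllib2.urlopen(get_request)
    cookie_jar.extract_cookies(get_response, get_request)

    token = BeautifulSoup(get_response.read()).find('input', {'name': 'securityToken'})['value']

    # POST the token, the serviceKey, and a one-day date range.
    search_data = urllib.urlencode({"serviceKey": service_key,
                                    "securityToken": token,
                                    "STEP": "Planet_SearchCriteria",
                                    "X.pageNumber": "0",
                                    "X.searchCriteria_StartDate": date_string,
                                    "X.searchCriteria_EndDate": date_string})
    post_request = urllib2.Request(base_url, search_data)
    cookie_jar.add_cookie_header(post_request)

    return BeautifulSoup(urllib2.urlopen(post_request).read())

if __name__ == '__main__':
    # One of the example urls from the diff above.
    soup = fetch_results_page("http://www2.elmbridge.gov.uk/Planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry", "21/05/2008")
    # The results table is located via its "Ref No" header cell, as in the scraper.
    print soup.find(text="Ref No").findNext("tr")

Note that parse_qs keys are case-sensitive, so a base url written with ServiceKey (as in the commented-out North Lincolnshire example) would raise a KeyError here, just as it would in PlanetParser itself.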
