Update PublicAccess scraper to work with BeautifulSoup.

Add all publicaccess sites to the python scraper.
пре 17 година · 7a5a50ed58
--- a/trunk/python_scrapers/PublicAccess.py
+++ b/trunk/python_scrapers/PublicAccess.py
@@ -1,9 +1,12 @@
 #!/usr/local/bin/python

 import urllib, urllib2
 import HTMLParser

 import urlparse
 import datetime, time
 import datetime
 import re

 import BeautifulSoup

 import cookielib

@@ -17,7 +20,13 @@ search_form_url_end = "DcApplication/application_searchform.aspx"
 search_results_url_end = "DcApplication/application_searchresults.aspx"
 comments_url_end = "DcApplication/application_comments_entryform.aspx"

 class PublicAccessParser(HTMLParser.HTMLParser):
 def index_or_none(a_list, item):
     """
     Returns the position of the first occurrence of item in a_list.

     When item does not occur in a_list the result is the (zero) occurrence
     count, i.e. a falsy 0 rather than None, despite the function's name.
     Callers that chain several lookups together with `or` depend on that
     falsy result.

     NOTE(review): a missing item and an item found in the first position
     both produce 0, so the two cases cannot be told apart by the caller.
     """
     occurrences = a_list.count(item)
     if not occurrences:
         # Nothing matched: hand back the falsy count itself.
         return occurrences
     return a_list.index(item)

 class PublicAccessParser:
    """This is the class which parses the PublicAccess search results page.
    """

@@ -27,183 +36,19 @@ class PublicAccessParser(HTMLParser.HTMLParser):
                 base_url,
                 debug=False):
        
 	HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # this will change to True when we enter the table of results
        self._in_results_table = False

        # this will be set to True when we have passed the header row
        # in the results table
        self._past_header_row = False

        # this will be true when we are in a <td> in the results table
        self._in_td = False

        # For each row, this will say how many tds we have seen so far
        self._td_count = 0

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening tag to the matching handle_start_* method.

        <table> tags are always passed on. <tr> and <td> tags are only
        passed on while we are inside the results table. <a> tags are only
        passed on while we are inside the 5th or 6th <td> of a row (the
        link is normally in the 6th cell, but Chiltern puts it in the 5th).
        Every other tag is ignored.
        """
        if tag == "table":
            self.handle_start_table(attrs)
        elif tag in ("tr", "td"):
            # Rows and cells only matter inside the results table.
            if self._in_results_table:
                if tag == "tr":
                    self.handle_start_tr(attrs)
                else:
                    self.handle_start_td(attrs)
        elif tag == "a":
            # Links only matter in the cell that holds the details button.
            if self._in_td and self._td_count in (5, 6):
                self.handle_start_a(attrs)

    def handle_endtag(self, tag):
        """Dispatch a closing tag to the matching handle_end_* method.

        Closing tags are ignored entirely unless we are currently inside
        the results table; inside it, only </table>, </tr> and </td> are
        acted upon.
        """
        if not self._in_results_table:
            return

        if tag == "table":
            self.handle_end_table()
        elif tag == "tr":
            self.handle_end_tr()
        elif tag == "td":
            self.handle_end_td()

    def handle_start_table(self, attrs):
        """Note that we have entered the results table, if this is it.

        attrs is the list of (name, value) pairs for the <table> tag. The
        results table is the one whose class attribute is exactly
        "cResultsForm"; any other table leaves the flag as it was.
        """
        is_results_table = any(
            name == "class" and value == "cResultsForm"
            for name, value in attrs
        )
        if is_results_table:
            self._in_results_table = True

    def handle_end_table(self):
        """Record that the results table has been closed.

        handle_endtag only calls this while we are inside the results
        table, so any </table> seen here ends it.
        """
        self._in_results_table = False
 	

    def handle_start_tr(self, attrs):
        """Begin a new candidate application for a non-header results row.

        The first row of the results table only holds column headings, so
        nothing is created for it (handle_end_tr sets the flag that marks
        the header row as done). For every later row a fresh
        PlanningApplication is started and the cell counter is reset.
        """
        if not self._past_header_row:
            # Still on the header row: no application to create.
            return

        self._current_application = PlanningApplication()
        self._td_count = 0

    def handle_end_tr(self):
        """Finish a row of the results table.

        The end of the first row (the column headings) just marks the
        header as passed. The end of every later row adds the application
        built up during that row to the result set.
        """
        if not self._past_header_row:
            # That was the header row; real data starts with the next one.
            self._past_header_row = True
            return

        self._results.addApplication(self._current_application)
 	
    def handle_start_td(self, attrs):
        """Record that a new cell of the current results row has opened.

        Bumps the per-row cell counter (so the first cell is number 1) and
        flags that we are inside a <td>.
        """
        self._td_count = self._td_count + 1
        self._in_td = True

    def handle_end_td(self):
        """Record that the current cell of the results row has closed."""
        self._in_td = False

    def handle_start_a(self, attrs):
        """Fill in the URLs and postcode of the current application.

        handle_starttag calls this for an <a> tag found in the 5th or 6th
        <td> of a results row. The first 'href' attribute of the tag is a
        relative link to the application's info page. From it we set, on
        self._current_application:

          info_url    - the href joined on to self.base_url
          comment_url - the comments page URL plus the href's query string
          postcode    - read from the property page (reached via the info
                        page), or, failing that, regexed out of the
                        application's address

        attrs is the list of (name, value) pairs for the tag. A tag with no
        usable href is ignored.

        This downloads two pages, so any urllib2 error propagates to the
        caller.
        """
        href = None
        for attr, value in attrs:
            if attr == "href":
                href = value
                # There is no need for us to look at any more attributes.
                break

        if href is None:
            return

        def fetch(url):
            # Download a page and fix its newlines, making sure the
            # connection is closed even if the read fails.
            response = urllib2.urlopen(url)
            try:
                return fixNewlines(response.read())
            finally:
                response.close()

        application = self._current_application

        # urlparse gives (scheme, netloc, path, params, query, fragment),
        # so index 4 is the query string.
        query_string = urlparse.urlparse(href)[4]

        # The info URL is the (relative) href resolved against the base URL.
        application.info_url = urlparse.urljoin(self.base_url, href)

        # The comments URL is the comments page with the same query string.
        comments_url = urlparse.urljoin(self.base_url, comments_url_end)
        application.comment_url = "?".join([comments_url, query_string])

        # While we're here, follow some links to find the postcode. It is
        # in an input tag on the property page, which is linked from the
        # info page.
        info_file_parser = PublicAccessInfoPageParser()
        info_file_parser.feed(fetch(application.info_url))

        property_page_url = urlparse.urljoin(application.info_url,
                                             info_file_parser.property_page_url)

        property_file_parser = PublicAccessPropertyPageParser()
        property_file_parser.feed(fetch(property_page_url))

        if property_file_parser.postcode is not None:
            application.postcode = property_file_parser.postcode
        else:
            # If there is no postcode on the property page, then we'll have
            # to make do with regexing one out of the address.
            application.postcode = getPostcodeFromText(application.address)
 	

    def handle_data(self, data):
        """Store the text of a results-table cell on the current application.

        Text outside a <td> is ignored. Inside one, the cell number decides
        where the text goes:

          1 - council reference
          2 - date received, given as dd/mm/yyyy and stored as a
              datetime.date (a ValueError is raised if it does not parse)
          3 - address
          4 - description

        Cell 5 is the status, which we don't need, and cell 6 holds a
        button whose <a> tag is dealt with by handle_start_a.
        """
        if not self._in_td:
            return

        # Cells whose text is stored as-is, keyed by cell number.
        plain_text_cells = {
            1: "council_reference",
            3: "address",
            4: "description",
        }

        cell_number = self._td_count

        if cell_number == 2:
            parsed = time.strptime(data, "%d/%m/%Y")
            self._current_application.date_received = datetime.date(*parsed[:3])
        elif cell_number in plain_text_cells:
            setattr(self._current_application,
                    plain_text_cells[cell_number],
                    data)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # First download the search form (in order to get a session cookie
        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
        search_form_response = urllib2.urlopen(search_form_request)
@@ -267,91 +112,54 @@ class PublicAccessParser(HTMLParser.HTMLParser):
        if self.debug:
            print contents

        self.feed(contents)

        return self._results


    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
        soup = BeautifulSoup.BeautifulSoup(contents)

        results_table = soup.find("table", {"class": "cResultsForm"})

        # First, we work out what column each thing of interest is in from the headings
        headings = [x.string for x in results_table.findAll("th")]

        ref_col = index_or_none(headings, "Application Ref.") or \
            index_or_none(headings, "Case Number") or \
            index_or_none(headings, "Application Number")

 class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
    """A parser to get the URL for the property details page out of the
       info page (this url is needed in order to get the postcode of the
       application.
       """
        address_col = headings.index("Address")
        description_col = headings.index("Proposal")

    def __init__(self):
 	HTMLParser.HTMLParser.__init__(self)
        comments_url = urlparse.urljoin(self.base_url, comments_url_end)

 	self.property_page_url = None

    def handle_starttag(self, tag, attrs):
        """The URL of the property details page is contained in an <a> tag in
        an attribute with key 'A_btnPropertyDetails'. There is some garbage on
        either side of it which we will have to clear up before storing it...
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()

        We go through the <a> tags looking for one with an attribute with
        key 'id' and value 'A_btnPropertyDetails'. When we find it we go through
        its attributes looking for one with key 'href' - the value of this attribute
        contains the URL we want, after a bit of tidying up.
            application.date_received = search_date

        Once we have got the URL, there is no need for us to look at any more <a> tags.
        """
 	if tag == "a" and self.property_page_url is None:
            
            #print attrs
 	    if attrs.count(("id","A_btnPropertyDetails")) > 0:
 		for attr,value in attrs:
 		    if attr == "href":
 			the_link = value
            tds = tr.findAll(re.compile("t[dh]"))

 			# this may have some garbage on either side of it...
 			# let's strip that off
            application.council_reference = tds[ref_col].string.strip()
            application.address = tds[address_col].string.strip()
            application.description = tds[description_col].string.strip()

                        # If the stripping fails, take the whole link
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                        # the garbage on the left is separated by whitespace.
                        # the garbage on the right is separated by a "'".
                        try:
                            self.property_page_url = the_link.split()[1].split("'")[0]
                        except IndexError:
                            self.property_page_url = the_link
            # We need the query string from this url to make the comments_url
            query_string = urlparse.urlsplit(application.info_url)[3]

            # This is probably slightly naughty, but I'm just going to add the querystring
            # on to the end manually
            application.comment_url = "%s?%s" %(comments_url, query_string)

 class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
    """A parser to get the postcode out of the property details page."""
    def __init__(self):
 	HTMLParser.HTMLParser.__init__(self)
            self._results.addApplication(application)

 	self.postcode = None

    def handle_starttag(self, tag, attrs):
        """The postcode is contained in an <input> tag.
        This tag has an attribute 'name' with value postcode.
        It also has an attribute 'value' with value the postcode of this application.

        We go through the input tags looking for one with an attribute with
        key 'name' and value 'postcode'. When we find one,
        we look through its attributes for one with key 'value' - we store the value of this
        attribute as self.postcode.
        return self._results

        Once we have the postcode, there is no need to look at any more input tags.
        """
        
 	if tag == "input" and self.postcode is None:
 	    if attrs.count(("name","postcode")) > 0:
 		for attr,value in attrs:
 		    if attr == "value":
 			self.postcode = value

    def getResults(self, day, month, year):
        """Run the search for one date and return the displayable results.

        day, month and year may be given as anything int() accepts (for
        example strings). They are converted to ints, passed to
        getResultsByDayMonthYear, and whatever displayXML() returns for the
        resulting object is returned.
        """
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year))
        return results.displayXML()

 if __name__ == '__main__':
    day = 20
    month = 11
    month = 12
    year = 2008

    #parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
@@ -360,6 +168,12 @@ if __name__ == '__main__':
    #parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
    #parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
 #    parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/")
    parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
 #    parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
 #    parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/")
 #    parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/")
 #    parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/")
 #    parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/")
 #    parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
    parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/")
    print parser.getResults(day, month, year)
    
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -8,7 +8,7 @@
 "Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "The Borough of Oadby and Wigston", "Oadby and Wigston", "http://pa.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -69,7 +69,7 @@
 "Denbighshire County Council", "Denbighshire", "http://planning.denbighshire.gov.uk/", "ApplicationSearchServletParser", "DenbighshireSearchParser"
 "Wear Valley District Council", "Wear Valley", "http://planning.wearvalley.gov.uk/", "ApplicationSearchServletParser", "WearValleySearchParser"
 "Chorley Borough Council", "Chorley", "http://planning.chorley.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/TDC/", "PublicAccess", "PublicAccessParser"
 "Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "London Borough Of Newham", "Newham", "http://pacaps.newham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "North West Leicestershire District Council", "NW Leicestershire", "http://paccess.nwleics.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Redditch Borough Council", "Redditch", "http://access.redditchbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -103,7 +103,7 @@
 "New Forest National Park", "New Forest NP", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BoltonLikeParser"
 "Bridgnorth District Council", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BridgnorthParser"
 "Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/PlanData/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Newcastle City Council", "Newcastle",  "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Newcastle City Council", "Newcastle",  "http://gisccs013.newcastle.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "North Wiltshire District Council", "North Wiltshire", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser"
 "Renfrewshire Council", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BoltonLikeParser"
@@ -131,7 +131,7 @@
 "Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Hinckley and Bosworth Borough Council", "Hinckley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Argyl And Bute Council", "Argyl and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Argyll And Bute Council", "Argyll and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Havant Borough Council", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Rochford District Council", "Rochford", "http://www.rochford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -275,3 +275,30 @@
 "Mendip District Council", "Mendip", "", "Mendip", "MendipParser"
 "Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser"
 "Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser"
 "Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Bedford Borough Council", "Bedford", "http://www.publicaccess.bedford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Bradford Metropolitan District Council", "Bradford", "http://www.planning4bradford.com/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Cambridge City Council", "Cambridge", "http://www.cambridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Dartford Borough Council", "Dartford", "http://publicaccess.dartford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "East Cambridgeshire District Council", "East Cambridgeshire", "http://pa.eastcambs.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "East Riding of Yorkshire Council", "East Riding", "http://www.eastriding.gov.uk/PublicAccess731c/tdc/", "PublicAccess", "PublicAccessParser"
 "Gloucester City Council", "Gloucester", "http://www.glcstrplnng11.co.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Horsham District Council", "Horsham", "http://publicaccess.horsham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "London Borough of Lambeth", "Lambeth", "http://planning.lambeth.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Leeds City Council", "Leeds", "http://planningapplications.leeds.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Mid Sussex District Council", "Mid Sussex", "http://dc.midsussex.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "North East Derbyshire District Council", "North East Derbyshire", "http://planapps-online.ne-derbyshire.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Norwich City Council", "Norwich", "http://publicaccess.norwich.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Richmondshire District Council", "Richmondshire", "http://publicaccess.richmondshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Salford City Council", "Salford", "http://publicaccess.salford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Sandwell Metropolitan Borough Council", "Sandwell", "http://webcaps.sandwell.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Scottish Borders Council", "Scottish Borders", "http://eplanning.scotborders.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Stafford Borough Council", "Stafford", "http://www3.staffordbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Swindon Borough Council", "Swindon", "http://194.73.99.13/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Three Rivers District Council", "Three Rivers", "http://www2.threerivers.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Torridge District Council", "Torridge", "http://www.torridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Tunbridge Wells Borough Council", "Tunbridge Wells", "http://secure.tunbridgewells.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Wakefield Metropolitan District Council", "Wakefield", "http://planning.wakefield.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "West Wiltshire District Council", "West Wiltshire", "http://planning.westwiltshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Worthing Borough Council", "Worthing", "http://planning.worthing.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Wycombe District Council", "Wycombe", "http://planningpa.wycombe.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"