From 086b04b1c12178c7dcf29b4141e68fb29748da41 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sat, 3 Jan 2009 15:34:26 +0000 Subject: [PATCH] Update PublicAccess scraper to work with BeautifulSoup. Add all publicaccess sites to the python scraper. --- python_scrapers/PublicAccess.py | 284 +++++----------------------- python_scrapers/SitesToGenerate.csv | 35 +++- 2 files changed, 80 insertions(+), 239 deletions(-) diff --git a/python_scrapers/PublicAccess.py b/python_scrapers/PublicAccess.py index 823d0cf..19403df 100644 --- a/python_scrapers/PublicAccess.py +++ b/python_scrapers/PublicAccess.py @@ -1,9 +1,12 @@ #!/usr/local/bin/python import urllib, urllib2 -import HTMLParser + import urlparse -import datetime, time +import datetime +import re + +import BeautifulSoup import cookielib @@ -17,7 +20,13 @@ search_form_url_end = "DcApplication/application_searchform.aspx" search_results_url_end = "DcApplication/application_searchresults.aspx" comments_url_end = "DcApplication/application_comments_entryform.aspx" -class PublicAccessParser(HTMLParser.HTMLParser): +def index_or_none(a_list, item): + """ + Returns the index of item in a_list, or None, if it isn't in the list. + """ + return a_list.count(item) and a_list.index(item) + +class PublicAccessParser: """This is the class which parses the PublicAccess search results page. """ @@ -27,183 +36,19 @@ class PublicAccessParser(HTMLParser.HTMLParser): base_url, debug=False): - HTMLParser.HTMLParser.__init__(self) - self.authority_name = authority_name self.authority_short_name = authority_short_name self.base_url = base_url self.debug = debug - # this will change to True when we enter the table of results - self._in_results_table = False - - # this will be set to True when we have passed the header row - # in the results table - self._past_header_row = False - - # this will be true when we are in a in the results table - self._in_td = False - - # For each row, this will say how many tds we have seen so far - self._td_count = 0 - # The object which stores our set of planning application results self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) - # This will store the planning application we are currently working on. - self._current_application = None - - def handle_starttag(self, tag, attrs): - if tag == "table": - self.handle_start_table(attrs) - # we are only interested in tr tags if we are in the results table - elif self._in_results_table and tag == "tr": - self.handle_start_tr(attrs) - # we are only interested in td tags if we are in the results table - elif self._in_results_table and tag == "td": - self.handle_start_td(attrs) - # we are only interested in tags if we are in the 6th td in - # the results table. - # UPDATE: It seems that, in the case of Chiltern, we are interested in - # td 5. - elif self._in_td and (self._td_count == 5 or self._td_count == 6) and tag == "a": - self.handle_start_a(attrs) - # If the tag is not one of these then we aren't interested - - def handle_endtag(self, tag): - # we only need to consider end tags if we are in the results table - if self._in_results_table: - if tag == "table": - self.handle_end_table() - if tag == "tr": - self.handle_end_tr() - if tag == "td": - self.handle_end_td() - - def handle_start_table(self, attrs): - for attr,value in attrs: - if attr == "class": - if value == "cResultsForm": - self._in_results_table = True - break - - def handle_end_table(self): - # If we see an end table tag, then note that we have left the - # results table. This method is only called if we are in that table. - self._in_results_table = False - - - def handle_start_tr(self, attrs): - # The first tr we meet in the results table is just headers - # We will set a flag at the end of that tr to avoid creating - # a blank PlanningApplication - if self._past_header_row: - # Create a candidate result object - self._current_application = PlanningApplication() - self._td_count = 0 - - def handle_end_tr(self): - # If we are in the results table, and not finishing the header row - # append the current result to the results list. - if self._past_header_row: - self._results.addApplication(self._current_application) - else: - # The first row of the results table is headers - # We want to do nothing until after it - self._past_header_row = True - - def handle_start_td(self, attrs): - # increase the td count by one - self._td_count += 1 - - # note that we are now in a td - self._in_td = True - - def handle_end_td(self): - # note that we are now not in a td - self._in_td = False - - def handle_start_a(self, attrs): - # this method is only getting called if we are in the - # 6th td of a non-header row of the results table. - - # go through the attributes of the looking for one - # named 'href' - for attr,value in attrs: - if attr == "href": - # the value of this tag is a relative url. - # parse it so we can get the query string from it - parsed_info_url = urlparse.urlparse(value) - - # the 4th part of the tuple is the query string - query_string = parsed_info_url[4] - - # join this query string to the search URL, and store this as - # the info URL of the current planning application - self._current_application.info_url = urlparse.urljoin(self.base_url, value) - - # Join this query string to the comments URL, and store this as - # the comments URL of the current planning application - comments_url = urlparse.urljoin(self.base_url, comments_url_end) - self._current_application.comment_url = "?".join([comments_url, query_string]) - - # while we're here, let's follow some links to find the postcode... - # the postcode is in an input tag in the property page. This page - # can be found by following the info url. - # The newlines in the info page need fixing. - info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read()) - - info_file_parser = PublicAccessInfoPageParser() - info_file_parser.feed(info_file_contents) - - property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url) - - # the newlines in this page need fixing - property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read()) - - property_file_parser = PublicAccessPropertyPageParser() - property_file_parser.feed(property_file_contents) - - # Set the postcode on the current planning application from the - # one found on the property page - if property_file_parser.postcode is not None: - self._current_application.postcode = property_file_parser.postcode - else: - # If there is no postcode in here, then we'll have to make do with regexing one out of the address. - self._current_application.postcode = getPostcodeFromText(self._current_application.address) - - # There is no need for us to look at any more attributes. - break - - - def handle_data(self, data): - if self._in_td: - # The first td contains the reference - if self._td_count == 1: - self._current_application.council_reference = data - - # The second td contains the date the application was received - elif self._td_count == 2: - year, month, day = time.strptime(data, "%d/%m/%Y")[:3] - received_date = datetime.date(year, month, day) - - self._current_application.date_received = received_date - - # The third td contains the address - elif self._td_count == 3: - #data = data.replace("^M","\n") - self._current_application.address = data - - # The fourth td contains the description - elif self._td_count == 4: - self._current_application.description = data - # 5 is status - we don't need it. - # 6 is a button - this is where we will get our postcode, - # comment_url, and info_url from (when handling the tag). - def getResultsByDayMonthYear(self, day, month, year): + search_date = datetime.date(year, month, day) + # First download the search form (in order to get a session cookie search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end)) search_form_response = urllib2.urlopen(search_form_request) @@ -267,91 +112,54 @@ class PublicAccessParser(HTMLParser.HTMLParser): if self.debug: print contents - self.feed(contents) - - return self._results - - - def getResults(self, day, month, year): - return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + soup = BeautifulSoup.BeautifulSoup(contents) + results_table = soup.find("table", {"class": "cResultsForm"}) + # First, we work out what column each thing of interest is in from the headings + headings = [x.string for x in results_table.findAll("th")] + ref_col = index_or_none(headings, "Application Ref.") or \ + index_or_none(headings, "Case Number") or \ + index_or_none(headings, "Application Number") -class PublicAccessInfoPageParser(HTMLParser.HTMLParser): - """A parser to get the URL for the property details page out of the - info page (this url is needed in order to get the postcode of the - application. - """ + address_col = headings.index("Address") + description_col = headings.index("Proposal") - def __init__(self): - HTMLParser.HTMLParser.__init__(self) + comments_url = urlparse.urljoin(self.base_url, comments_url_end) - self.property_page_url = None - def handle_starttag(self, tag, attrs): - """The URL of the property details page is contained in an tag in - an attribute with key 'A_btnPropertyDetails'. There is some garbage on - either side of it which we will have to clear up before storing it... + for tr in results_table.findAll("tr")[1:]: + application = PlanningApplication() - We go through the tags looking for one with an attribute with - key 'id' and value 'A_btnPropertyDetails'. When we find it we go through - its attributes looking for one with key 'href' - the value of this attribute - contains the URL we want, after a bit of tidying up. + application.date_received = search_date - Once we have got the URL, there is no need for us to look at any more tags. - """ - if tag == "a" and self.property_page_url is None: - - #print attrs - if attrs.count(("id","A_btnPropertyDetails")) > 0: - for attr,value in attrs: - if attr == "href": - the_link = value + tds = tr.findAll(re.compile("t[dh]")) - # this may have some garbage on either side of it... - # let's strip that off + application.council_reference = tds[ref_col].string.strip() + application.address = tds[address_col].string.strip() + application.description = tds[description_col].string.strip() - # If the stripping fails, take the whole link + application.info_url = urlparse.urljoin(self.base_url, tr.a['href']) - # the garbage on the left is separated by whitespace. - # the garbage on the right is separated by a "'". - try: - self.property_page_url = the_link.split()[1].split("'")[0] - except IndexError: - self.property_page_url = the_link + # We need the query string from this url to make the comments_url + query_string = urlparse.urlsplit(application.info_url)[3] + # This is probably slightly naughty, but I'm just going to add the querystring + # on to the end manually + application.comment_url = "%s?%s" %(comments_url, query_string) -class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): - """A parser to get the postcode out of the property details page.""" - def __init__(self): - HTMLParser.HTMLParser.__init__(self) + self._results.addApplication(application) - self.postcode = None - - def handle_starttag(self, tag, attrs): - """The postcode is contained in an tag. - This tag has an attribute 'name' with value postcode. - It also has an attribute 'value' with value the postcode of this application. - - We go through the input tags looking for one with an attribute with - key 'name' and value 'postcode'. When we find one, - we look through its attributes for one with key 'value' - we store the value of this - attribute as self.postcode. + return self._results - Once we have the postcode, there is no need to look at any more input tags. - """ - - if tag == "input" and self.postcode is None: - if attrs.count(("name","postcode")) > 0: - for attr,value in attrs: - if attr == "value": - self.postcode = value + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() if __name__ == '__main__': day = 20 - month = 11 + month = 12 year = 2008 #parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True) @@ -360,6 +168,12 @@ if __name__ == '__main__': #parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True) #parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True) # parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/") - parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/") +# parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/") +# parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/") +# parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/") +# parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/") +# parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/") +# parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/") + parser = PublicAccessParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/") print parser.getResults(day, month, year) diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index aceefcf..8de830b 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -8,7 +8,7 @@ "Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Bristol City Council", "Bristol", "http://e2eweb.bristol-city.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Portsmouth City Council", "Portsmouth", "http://planning.portsmouth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" -"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://web.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"The Borough of Oadby and Wigston", "Oadby and Wigston", "http://pa.owbc.net/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Test Valley Borough Council", "Test Valley", "http://publicaccess.testvalley.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Kings Lynn and West Norfolk Borough Council", "West Norfolk", "http://online.west-norfolk.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Sunderland City Council", "Sunderland", "http://www.sunderland.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" @@ -69,7 +69,7 @@ "Denbighshire County Council", "Denbighshire", "http://planning.denbighshire.gov.uk/", "ApplicationSearchServletParser", "DenbighshireSearchParser" "Wear Valley District Council", "Wear Valley", "http://planning.wearvalley.gov.uk/", "ApplicationSearchServletParser", "WearValleySearchParser" "Chorley Borough Council", "Chorley", "http://planning.chorley.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" -"Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/TDC/", "PublicAccess", "PublicAccessParser" +"Gravesham Borough Council", "Gravesham", "http://plan.gravesham.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "London Borough Of Newham", "Newham", "http://pacaps.newham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "North West Leicestershire District Council", "NW Leicestershire", "http://paccess.nwleics.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Redditch Borough Council", "Redditch", "http://access.redditchbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" @@ -103,7 +103,7 @@ "New Forest National Park", "New Forest NP", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BoltonLikeParser" "Bridgnorth District Council", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BridgnorthParser" "Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/PlanData/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" -"Newcastle City Council", "Newcastle", "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Newcastle City Council", "Newcastle", "http://gisccs013.newcastle.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "North Wiltshire District Council", "North Wiltshire", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser" "Renfrewshire Council", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BoltonLikeParser" @@ -131,7 +131,7 @@ "Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Hinckley and Bosworth Borough Council", "Hinckley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" -"Argyl And Bute Council", "Argyl and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Argyll And Bute Council", "Argyll and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Havant Borough Council", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "Rochford District Council", "Rochford", "http://www.rochford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" @@ -275,3 +275,30 @@ "Mendip District Council", "Mendip", "", "Mendip", "MendipParser" "Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser" "Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser" +"Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Bedford Borough Council", "Bedford", "http://www.publicaccess.bedford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Bradford Metropolitan District Council", "Bradford", "http://www.planning4bradford.com/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Cambridge City Council", "Cambridge", "http://www.cambridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Dartford Borough Council", "Dartford", "http://publicaccess.dartford.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"East Cambridgeshire District Council", "East Cambridgeshire", "http://pa.eastcambs.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"East Riding of Yorkshire Council", "East Riding", "http://www.eastriding.gov.uk/PublicAccess731c/tdc/", "PublicAccess", "PublicAccessParser" +"Gloucester City Council", "Gloucester", "http://www.glcstrplnng11.co.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Horsham District Council", "Horsham", "http://publicaccess.horsham.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"London Borough of Lambeth", "Lambeth", "http://planning.lambeth.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Leeds City Council", "Leeds", "http://planningapplications.leeds.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Mid Sussex District Council", "Mid Sussex", "http://dc.midsussex.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"North East Derbyshire District Council", "North East Derbyshire", "http://planapps-online.ne-derbyshire.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Norwich City Council", "Norwich", "http://publicaccess.norwich.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Richmondshire District Council", "Richmondshire", "http://publicaccess.richmondshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Salford City Council", "Salford", "http://publicaccess.salford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Sandwell Metropolitan Borough Council", "Sandwell", "http://webcaps.sandwell.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Scottish Borders Council", "Scottish Borders", "http://eplanning.scotborders.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Stafford Borough Council", "Stafford", "http://www3.staffordbc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Swindon Borough Council", "Swindon", "http://194.73.99.13/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Three Rivers District Council", "Three Rivers", "http://www2.threerivers.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Torridge District Council", "Torridge", "http://www.torridge.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Tunbridge Wells Borough Council", "Tunbridge Wells", "http://secure.tunbridgewells.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Wakefield Metropolitan District Council", "Wakefield", "http://planning.wakefield.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"West Wiltshire District Council", "West Wiltshire", "http://planning.westwiltshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Worthing Borough Council", "Worthing", "http://planning.worthing.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Wycombe District Council", "Wycombe", "http://planningpa.wycombe.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"