From 96070fd3a1a1780a2077c3ef6cc5881b3f161a58 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sun, 17 Feb 2008 15:35:47 +0000 Subject: [PATCH] Add scraper for the WAM sites (apart from Westminster, which is a bit different). --- trunk/python_scrapers/OtherFilesToCopy.csv | 1 + trunk/python_scrapers/PlanningExplorer.py | 4 +- trunk/python_scrapers/SitesToGenerate.csv | 11 ++ trunk/python_scrapers/WAM.py | 206 +++++++++++++++++++++ 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 trunk/python_scrapers/WAM.py diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv index 432bb93..249c247 100644 --- a/trunk/python_scrapers/OtherFilesToCopy.csv +++ b/trunk/python_scrapers/OtherFilesToCopy.csv @@ -19,3 +19,4 @@ "SouthSomerset.cgi", "493" "WestDorset.cgi", "493" "Christchurch.cgi", "493" +"WAM.py", "420" diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py index 74ef700..208e4ab 100644 --- a/trunk/python_scrapers/PlanningExplorer.py +++ b/trunk/python_scrapers/PlanningExplorer.py @@ -548,7 +548,7 @@ if __name__ == '__main__': #parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") #parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") - #parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") + parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") #parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") #parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/") #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") @@ -576,3 +576,5 @@ if __name__ == '__main__': # Investigate catching unavailable message: # Charnwood + 
+# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site... diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 390c95e..0e2e700 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -190,3 +190,14 @@ "Canterbury City Council", "Canterbury", "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "CanterburyParser" "London Borough of Merton", "Merton", "http://planning.merton.gov.uk/", "PlanningExplorer", "MertonParser" "Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser" +"Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser" +"Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" +"Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser" +"East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" +"North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser" +"Nottingham City Council", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" +"Poole Borough Council", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", "WAM", "PooleParser" +"Rother District Council", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser" +"South Gloucestershire Council", "South Gloucestershire", 
"http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", "WAM", "BraintreeParser"
+"London Borough of Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", "WAM", "BraintreeParser"
diff --git a/trunk/python_scrapers/WAM.py b/trunk/python_scrapers/WAM.py
new file mode 100644
index 0000000..8168a35
--- /dev/null
+++ b/trunk/python_scrapers/WAM.py
@@ -0,0 +1,236 @@
+"""Scraper for planning applications published on WAM sites.
+
+Each parser posts a date search to a council's WAM site, reads the
+results table, and fetches each application's info page for its
+description and comment URL.
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime
+import time
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+def clean_string(a_string):
+    """Return a_string stripped, with whitespace runs collapsed to one space."""
+    # NOTE(review): as written, the inner split(" ")/join changes nothing
+    # before the whitespace-collapsing split(); presumably it once replaced
+    # a "&nbsp;" entity or non-breaking space - confirm before simplifying.
+    return ' '.join(' '.join(a_string.split(" ")).strip().split())
+
+def remove_params(url):
+    """Return url without its ";params" part (e.g. ";jsessionid=...").
+
+    The query string and fragment are kept.
+    """
+    # urlparse() returns (scheme, netloc, path, params, query, fragment);
+    # dropping index 3 leaves the five parts that urlunsplit() expects.
+    parsed_url = urlparse.urlparse(url)
+    return urlparse.urlunsplit(parsed_url[:3] + parsed_url[4:])
+
+class WAMParser:
+    """Scraper for the application search of a WAM site.
+
+    Subclasses override the class attributes or _get_search_data for
+    sites whose results table or search form differ.
+    """
+
+    # Index of the results-table column holding the address.
+    address_column = 2
+    # strptime format of the date in the first results-table column.
+    date_format = "%d/%b/%Y"
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+        """Store the authority details and create an empty result set.
+
+        base_url is the search URL the form data is sent to; with
+        debug=True a message is printed when something cannot be scraped.
+        """
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.debug = debug
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def _get_search_data(self, year, month, day):
+        """Return the search form fields as a tuple of (name, value) pairs."""
+        timestamp = time.mktime((year, month, day, 0,0,0,0,0,0))
+
+        # The parameter endDate appears to be 1000*a timestamp
+        time_input = str(int(timestamp*1000))
+
+        #http://wam.boroughofpoole.com/WAM/pas/searchApplications.do;jsessionid=BCC7DFD1C42DC210A7BE5BA616683CDE
+        # areaCode=%25&sortOrder=1&endDate=1197213359015&applicationType=%25&Button=Search
+
+        return (
+            ("areaCode", "%"),
+            ("sortOrder", "1"),
+            ("endDate", time_input),
+            ("applicationType", "%"),
+            ("Button", "Search"),
+            )
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Scrape the applications returned by a search for the given date.
+
+        Returns this parser's PlanningAuthorityResults.  An application
+        that cannot be scraped is skipped instead of aborting the run,
+        and a page without the results heading yields no applications.
+        """
+        search_data = urllib.urlencode(self._get_search_data(year, month, day))
+
+        # Supplying data makes urlopen send a POST rather than a GET.
+        response = urllib2.urlopen(self.base_url, search_data)
+        soup = BeautifulSoup(response.read())
+
+        # FIXME - deal with later pages of results
+
+        results_heading = soup.find(text=re.compile("Your search returned the following"))
+        if results_heading is None:
+            # Presumably what a search with no applications looks like;
+            # previously this raised AttributeError on None.
+            if self.debug:
+                print "No results table found at %s" %(self.base_url)
+            return self._results
+
+        # Skip the first row of the table (presumably the column headings).
+        trs = results_heading.findNext("table").findAll("tr")[1:]
+
+        self._current_application = PlanningApplication()
+
+        for tr in trs:
+            try:
+                self._scrape_application(tr)
+                self._results.addApplication(self._current_application)
+            except Exception:
+                # It seems a shame to miss out on all the apps from an authority just because one breaks...
+                # (this used to catch SystemExit, which scraping errors never raise)
+                if self.debug:
+                    if self._current_application.council_reference:
+                        print "Failed to add %s" %(self._current_application.council_reference)
+                    else:
+                        print "Failed to add an application"
+
+            # Start each row with a fresh application object.
+            self._current_application = PlanningApplication()
+
+        return self._results
+
+    def _scrape_application(self, tr):
+        """Fill self._current_application from results row tr and its info page."""
+        tds = tr.findAll("td")
+
+        date_received_string = tds[0].contents[0].strip()
+        self._current_application.date_received = datetime.datetime.strptime(clean_string(date_received_string), self.date_format)
+
+        # Drop the jsessionid, then join on the base url to make it absolute
+        info_url_no_params = remove_params(tr.a['href'])
+        self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_no_params)
+
+        self._current_application.council_reference = tr.a.string
+
+        address = clean_string(tds[self.address_column].string)
+        self._current_application.address = address
+        self._current_application.postcode = getPostcodeFromText(address)
+
+        # Fetch the info page
+        info_response = urllib2.urlopen(self._current_application.info_url)
+        info_soup = BeautifulSoup(info_response.read())
+
+        comment_link = info_soup.find("a", href=re.compile("createComment.do"))
+        if comment_link is None:
+            # Not every application has a comment link.
+            if self.debug:
+                print "No comment url for %s" %(self._current_application.council_reference)
+            self._current_application.comment_url = "None"
+        else:
+            comment_url_no_params = remove_params(comment_link['href'])
+            self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_no_params)
+
+        # Some WAM sites have the description in the results page,
+        # but since they don't all have it there, we'll get it from here...
+        description_td = info_soup.find(text="Development:").findNext("td")
+
+        # Sometimes the description is in a span in the td, sometimes it is directly there.
+        self._current_application.description = (description_td.string or description_td.span.string).strip()
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the date; arguments may be strings."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+
+class PooleParser(WAMParser):
+    """WAM parser for a results table with the address in column index 1."""
+    address_column = 1
+
+class BraintreeParser(WAMParser):
+    """WAM parser for sites that want the showWeeklyList action and use dates like "08 Feb 2008"."""
+    date_format = "%d %b %Y"
+
+    def _get_search_data(self, year, month, day):
+        """Return the WAMParser search fields preceded by the showWeeklyList action."""
+        # Braintree
+        # action=showWeeklyList&areaCode=%25&sortOrder=1&endDate=1203249969656&applicationType=%25&Button=Search
+        search_data = WAMParser._get_search_data(self, year, month, day)
+
+        return (("action", "showWeeklyList"),) + search_data
+
+
+if __name__ == '__main__':
+    #parser = WAMParser("Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", debug=True)
+    #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
+    # Camden
+    #parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    #Chichester - Done as PublicAccess
+    #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
+    #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
+    #parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
+    #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
+    #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
+    #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
+    #parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
+
+    print parser.getResults(8,2,2008)
+
+# Left to fix
+
+# All:
+# Paging
+# Coping with no apps
+
+
+# Barking and Dagenham - done
+# Braintree - done
+# Camden - also has a PlanningExplorer, which is done (so not bothering)
+# Castle Point - done
+# Chichester - not needed (PublicAccess site done)
+# Colchester - done. like Braintree
+# East Lothian - done
+# North Somerset - done. like Braintree
+# Nottingham - done (sometimes no comments)
+# Poole - done
+# Rother - done
+# South Gloucestershire - done. like Braintree
+# South Norfolk - Works, but no postcodes. Also, the search link here points to PlanningExplorer. I think we should assume this is the current site.
+# Tower Hamlets - done. Like Braintree.
+# Westminster - not done: clearly WAM underneath, but with a wrapper.
+