From 48ec82b48508330c8bfe7e598643c3f861db6ebe Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Mon, 8 Sep 2008 01:35:40 +0000
Subject: [PATCH] Add scraper for Herefordshire. Alter PlanningUtils to CDATA everything, scrapping the xmlquote function.

---
 trunk/python_scrapers/Herefordshire.py     | 104 ++++++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |   1 +
 trunk/python_scrapers/PlanningExplorer.py  |   6 +-
 trunk/python_scrapers/PlanningUtils.py     |  21 ++----
 trunk/python_scrapers/SitesToGenerate.csv  |   1 +
 5 files changed, 116 insertions(+), 17 deletions(-)
 create mode 100644 trunk/python_scrapers/Herefordshire.py

diff --git a/trunk/python_scrapers/Herefordshire.py b/trunk/python_scrapers/Herefordshire.py
new file mode 100644
index 0000000..19cfbfa
--- /dev/null
+++ b/trunk/python_scrapers/Herefordshire.py
@@ -0,0 +1,104 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class HerefordshireParser:
+    """Scraper for the Herefordshire planning application search.
+
+    Fetches the search results page for one day, then follows each
+    application's info page for the full description and comment url.
+    """
+
+    # NOTE(review): this is a Barnsley address, which looks like a
+    # copy-and-paste leftover. Nothing in this file reads it - confirm
+    # the right address for Herefordshire before relying on it.
+    comments_email_address = "Developmentcontrol@barnsley.gov.uk"
+
+    def __init__(self, *args):
+        """Set up the authority details and an empty results container.
+
+        Any positional arguments are accepted and ignored.
+        """
+        self.authority_name = "Herefordshire County Council"
+        self.authority_short_name = "Herefordshire"
+        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
+
+        # As we are going to the info page, we may as well pick up the comment url from there.
+#        self.comment_url = "http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s" # This needs the reference inserting
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Scrape the applications listed for the given date.
+
+        day, month and year are ints. Returns this parser's
+        PlanningAuthorityResults, with one PlanningApplication added for
+        each row of the search results table.
+        """
+        search_day = datetime.date(year, month, day)
+
+        # Now get the search page
+        response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
+
+        soup = BeautifulSoup(response.read())
+
+        if not soup.find(text=re.compile("Sorry, no matches found")):
+            # There were apps for this date
+
+            # Skip the first two rows (presumably headings - TODO confirm).
+            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]
+
+            for tr in trs:
+                application = PlanningApplication()
+                application.date_received = search_day
+
+                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+                application.council_reference = tr.a.string
+                # application.comment_url = self.comment_url %(application.council_reference)
+
+                tds = tr.findAll("td")
+
+                application.address = tds[1].string
+                application.postcode = getPostcodeFromText(application.address)
+
+                # This just gets us an initial segment of the description.
+                # We are going to have to download the info page...
+                #application.description = tds[2].string.strip()
+
+                # Use urllib2, as for the search page, so that an HTTP error
+                # raises rather than an error page being parsed.
+                info_response = urllib2.urlopen(application.info_url)
+
+                info_soup = BeautifulSoup(info_response.read())
+
+                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
+                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])
+
+                self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the given date.
+
+        day, month and year may be ints or numeric strings.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = HerefordshireParser()
+    print parser.getResults(31,8,2008)
+
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index dafa174..1bf9904 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -55,3 +55,4 @@
 "Halton.py", "420"
 "Hampshire.py", "420"
 "Hastings.py", "420"
+"Herefordshire.py", "420"
diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py index 2da3961..5c3e2ec 100644 --- a/trunk/python_scrapers/PlanningExplorer.py +++ b/trunk/python_scrapers/PlanningExplorer.py @@ -631,7 +631,7 @@ if __name__ == '__main__': # NOTE - 04/11/2007 is a sunday # I'm using it to test that the scrapers behave on days with no apps. 
- parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") +# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") @@ -639,7 +639,7 @@ if __name__ == '__main__': # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") -# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") + parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") @@ -655,7 +655,7 @@ if __name__ == '__main__': # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") - print parser.getResults(3, 7, 2008) + print parser.getResults(4, 9, 2008) # To Do diff --git a/trunk/python_scrapers/PlanningUtils.py b/trunk/python_scrapers/PlanningUtils.py index 569d892..6d854e0 100644 --- a/trunk/python_scrapers/PlanningUtils.py +++ 
b/trunk/python_scrapers/PlanningUtils.py @@ -4,13 +4,6 @@ import re date_format = "%d/%m/%Y" - -def xmlQuote(text): - # Change &s to &s - # I suspect there is probably some standard python - # function I should be using for this... - return text.replace('&', '&') - def fixNewlines(text): # This can be used to sort out windows newlines return text.replace("\r\n","\n") @@ -112,13 +105,13 @@ class PlanningApplication: #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received contents = [ - u"%s" %xmlQuote(self.council_reference), - u"
%s
" %xmlQuote(self.address), - u"%s" %self.postcode, - u"%s" %xmlQuote(self.description), - u"%s" %xmlQuote(self.info_url), - u"%s" %xmlQuote(self.comment_url), - u"%s" %self.date_received.strftime(date_format), + u"" %(self.council_reference), + u"
" %(self.address), + u"" %self.postcode, + u"" %(self.description), + u"" %(self.info_url), + u"" %(self.comment_url), + u"" %self.date_received.strftime(date_format), ] if self.osgb_x: contents.append(u"%s" %(self.osgb_x)) diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 5de335b..7e36475 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -259,3 +259,4 @@ "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" +"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"