From b5dcd82e60b3e789c2bd78a6751e8e3eeab6affa Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 13 Jun 2008 14:38:12 +0000
Subject: [PATCH] Add scraper for Flintshire. I've included the OSGB x,y coordinates in the hope that we can start using these where we don't have postcodes.

---
 trunk/python_scrapers/Flintshire.py        | 93 ++++++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |  1 +
 trunk/python_scrapers/PlanningUtils.py     | 30 ++++---
 trunk/python_scrapers/SitesToGenerate.csv  |  1 +
 4 files changed, 115 insertions(+), 10 deletions(-)
 create mode 100644 trunk/python_scrapers/Flintshire.py

diff --git a/trunk/python_scrapers/Flintshire.py b/trunk/python_scrapers/Flintshire.py
new file mode 100644
index 0000000..845e2f6
--- /dev/null
+++ b/trunk/python_scrapers/Flintshire.py
@@ -0,0 +1,93 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class FlintshireParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Flintshire County Council"
+        self.authority_short_name = "Flintshire"
+
+        # I've removed some extra variables from this, it seems to be happy without them, and now doesn't need to paginate...
+        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
+        response = urllib2.urlopen(self.base_url %{"end_date": search_date.strftime(date_format),
+                                                   "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
+        soup = BeautifulSoup(response.read())
+
+        # Each app is stored in it's own table
+        result_tables = soup.findAll("table", border="1")
+
+        # For the moment, we'll have to ignore the first result (see TODO list).
+        for table in result_tables[1:]:
+            application = PlanningApplication()
+
+            # It's not clear to me why this next one isn't the string of the next sibling. This works though!
+            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]
+
+            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+
+            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])
+
+            # Let's go to the info_page and get the OSGB and the date_received
+            info_request = urllib2.Request(application.info_url)
+
+            # We need to add the language header in order to get UK style dates
+            info_request.add_header("Accept-Language", "en-gb,en")
+            info_response = urllib2.urlopen(info_request)
+            info_soup = BeautifulSoup(info_response.read())
+
+            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
+            x_element = grid_reference_td.font
+
+            application.osgb_x = x_element.string.strip()
+            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()
+
+            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
+
+            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))
+
+            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()
+
+
+            # There is a link to comment from the info page, though I can't click it.
+            application.comment_url = application.info_url
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = FlintshireParser()
+    print parser.getResults(22,5,2008)
+
+# TODO
+
+# 1) Email the council about broken first result.
+# This is always
+# slightly broken (two s for one of the s and upsets beautiful
+# soup.
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv index 6059344..79e74c2 100644 --- a/trunk/python_scrapers/OtherFilesToCopy.csv +++ b/trunk/python_scrapers/OtherFilesToCopy.csv @@ -27,3 +27,4 @@ "Kensington.py", "420" "Fife.py", "420" "ForestOfDean.py", "420" +"Flintshire.py", "420" diff --git a/trunk/python_scrapers/PlanningUtils.py b/trunk/python_scrapers/PlanningUtils.py index f97affd..569d892 100644 --- a/trunk/python_scrapers/PlanningUtils.py +++ b/trunk/python_scrapers/PlanningUtils.py @@ -87,6 +87,11 @@ class PlanningApplication: # expecting this as a datetime.date object self.date_received = None + # If we can get them, we may as well include OSGB. + # These will be the entirely numeric version. + self.osgb_x = None + self.osgb_y = None + def __repr__(self): return self.displayXML() @@ -105,14 +110,19 @@ class PlanningApplication: def displayXML(self): #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received - return u"\n" +\ - u"%s\n" %xmlQuote(self.council_reference) +\ - u"
%s
\n" %xmlQuote(self.address) +\ - u"%s\n" %self.postcode +\ - u"%s\n" %xmlQuote(self.description) +\ - u"%s\n" %xmlQuote(self.info_url) +\ - u"%s\n" %xmlQuote(self.comment_url) +\ - u"%s\n" %self.date_received.strftime(date_format) +\ - u"
\n" - + contents = [ + u"%s" %xmlQuote(self.council_reference), + u"
%s
" %xmlQuote(self.address), + u"%s" %self.postcode, + u"%s" %xmlQuote(self.description), + u"%s" %xmlQuote(self.info_url), + u"%s" %xmlQuote(self.comment_url), + u"%s" %self.date_received.strftime(date_format), + ] + if self.osgb_x: + contents.append(u"%s" %(self.osgb_x)) + if self.osgb_y: + contents.append(u"%s" %(self.osgb_y)) + + return u"\n%s\n" %('\n'.join(contents)) diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 0f75ef1..53364fb 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -227,3 +227,4 @@ "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "Fife Council", "Fife", "", "Fife", "FifeParser" "Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser" +"Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"