From 66bb2955a8e01479f0ff4f2edaf31a7f9fcaf992 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 12 Jun 2008 23:02:04 +0000
Subject: [PATCH] Add Forest of Dean and Fife parsers.

---
 python_scrapers/Fife.py              | 88 ++++++++++++++++++++++++++++
 python_scrapers/ForestOfDean.py      | 75 ++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  2 +
 python_scrapers/SitesToGenerate.csv  |  3 +-
 4 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 python_scrapers/Fife.py
 create mode 100644 python_scrapers/ForestOfDean.py

diff --git a/python_scrapers/Fife.py b/python_scrapers/Fife.py
new file mode 100644
index 0000000..18df3d2
--- /dev/null
+++ b/python_scrapers/Fife.py
@@ -0,0 +1,88 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+class FifeParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Fife Council"
+        self.authority_short_name = "Fife"
+        self.base_url = "http://www.fifedirect.org.uk/topics/index.cfm"
+
+        self.comment_url = "http://www.ukplanning.com/ukp/showCaseFile.do?councilName=Fife+Council&appNumber=%s"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        search_data = urllib.urlencode(
+            [("fuseaction", "planapps.list"),
+             ("SUBJECTID", "104CC166-3ED1-4D22-B9F1E2FB8438478A"),
+             ("src_fromdayRec", day),
+             ("src_frommonthRec", month),
+             ("src_fromyearRec", year),
+             ("src_todayRec", day),
+             ("src_tomonthRec", month),
+             ("src_toyearRec", year),
+             ("findroadworks", "GO"),
+             ]
+            )
+
+        search_url = self.base_url + "?" + search_data
+
+        response = urllib2.urlopen(search_url)
+        soup = BeautifulSoup(response.read())
+
+        results_table = soup.find("table", id="results")
+
+        # Apart from the first tr, which contains headers, the trs come in pairs for each application
+
+        trs = results_table.findAll("tr")[1:]
+
+        tr_count = 0
+        while tr_count < len(trs):
+            tr = trs[tr_count]
+
+            if tr_count % 2 == 0:
+                application = PlanningApplication()
+                application.date_received = search_date
+
+                tds = tr.findAll("td")
+
+                application.council_reference = tds[0].a.string.strip()
+                application.comment_url = self.comment_url %(application.council_reference)
+
+                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
+                application.address = ', '.join([x.strip() for x in tds[1].findAll(text=True)])
+                application.postcode = getPostcodeFromText(application.address)
+            else:
+                # Get rid of the "Details: " at the beginning.
+                application.description = tr.td.string.strip()[9:]
+
+                self._results.addApplication(application)
+
+            tr_count += 1
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = FifeParser()
+    print parser.getResults(21,5,2008)
+
+# TODO
+
+# Paginates at 50. Unlikely on a single day, so we'll worry about it later.
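
Note on the Fife scraper above (a sketch, not part of the patch): each
application in the fifedirect results table spans two consecutive <tr>
elements, so getResultsByDayMonthYear walks tr_count and alternates between
starting a new PlanningApplication on even rows and, on odd rows, filling in
the description and adding the finished application. The same pairing can be
made explicit with a small helper; the pair_rows name is hypothetical, and
trs is assumed to be the result rows with the header row already dropped:

    def pair_rows(trs):
        # Return (summary_row, detail_row) pairs from the alternating rows.
        # The summary row carries the reference, links and address; the
        # detail row carries the single "Details: ..." cell that feeds
        # application.description.
        return zip(trs[::2], trs[1::2])
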
diff --git a/python_scrapers/ForestOfDean.py b/python_scrapers/ForestOfDean.py
new file mode 100644
index 0000000..07e0270
--- /dev/null
+++ b/python_scrapers/ForestOfDean.py
@@ -0,0 +1,75 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%Y%m%d"
+
+class ForestOfDeanParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Forest of Dean District Council"
+        self.authority_short_name = "Forest of Dean"
+        self.base_url = "http://www.fdean.gov.uk/content.asp"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        search_data = urllib.urlencode(
+            [
+             ("parent_directory_id", "200"),
+             ("nav", "679"),
+             ("id", "13266"),
+             ("RecStart", "1"),
+             ("RecCount", "100"),
+             ("SDate", search_date.strftime(date_format)),
+             ("EDate", search_date.strftime(date_format)),
+             ]
+            )
+
+        search_url = self.base_url + "?" + search_data
+
+        response = urllib2.urlopen(search_url)
+        soup = BeautifulSoup(response.read())
+
+        results_table = soup.find("table", summary="List of planning applications that match your query")
+
+        for tr in results_table.findAll("tr")[1:]:
+            application = PlanningApplication()
+
+            application.date_received = search_date
+
+            tds = tr.findAll("td")
+
+            application.council_reference = tds[0].a.string.strip()
+            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
+            application.comment_url = application.info_url
+
+            application.address = ' '.join(tds[1].string.strip().split())
+            application.postcode = getPostcodeFromText(application.address)
+
+            application.description = tds[2].string.strip()
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = ForestOfDeanParser()
+    print parser.getResults(21,5,2008)
+
+# TODO - looks like it paginates at 20
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 5a461ba..6059344 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -25,3 +25,5 @@
 "Barnsley.py", "420"
 "Shetland.py", "420"
 "Kensington.py", "420"
+"Fife.py", "420"
+"ForestOfDean.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 02ec342..0f75ef1 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -225,4 +225,5 @@
 "The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser"
 "Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
-
+"Fife Council", "Fife", "", "Fife", "FifeParser"
+"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
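
For reference, the two CSV edits wire the new scrapers into the existing
setup: OtherFilesToCopy.csv lists the source files to install (the "420"
column matches the other scrapers' entries and is presumably a file mode,
420 decimal being 0644 octal), and SitesToGenerate.csv maps each council to
the module and parser class to instantiate. A minimal sketch of a local
check that the new rows resolve to importable classes follows; it is
illustrative only, not part of the patch, and assumes it is run from the
repository root with BeautifulSoup and PlanningUtils importable:

    import csv
    import sys

    # Make the scraper modules importable, then confirm that the rows added
    # in this patch name parser classes that really exist.
    sys.path.insert(0, "python_scrapers")

    for row in csv.reader(open("python_scrapers/SitesToGenerate.csv"),
                          skipinitialspace=True):
        if len(row) < 5 or row[3] not in ("Fife", "ForestOfDean"):
            continue
        module_name, class_name = row[3], row[4]
        parser_class = getattr(__import__(module_name), class_name)
        print "%s -> %s.%s" % (row[0], module_name, class_name)

Both parsers can also be exercised directly, as their __main__ blocks show,
by calling getResults(day, month, year) and printing the resulting XML.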