From 09ca7050c35b24570c0c49bbe6bec96f30dde1d0 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Wed, 15 Oct 2008 16:33:58 +0000
Subject: [PATCH] Add scraper for Leicestershire County Council.

---
 python_scrapers/Leicestershire.py    | 76 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/SitesToGenerate.csv  |  1 +
 3 files changed, 78 insertions(+)
 create mode 100644 python_scrapers/Leicestershire.py

diff --git a/python_scrapers/Leicestershire.py b/python_scrapers/Leicestershire.py
new file mode 100644
index 0000000..c0add6a
--- /dev/null
+++ b/python_scrapers/Leicestershire.py
@@ -0,0 +1,76 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime
+import re
+
+import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+search_date_format = "%d%%2F%m%%2F%Y"
+
+class LeicestershireParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Leicestershire County Council"
+        self.authority_short_name = "Leicestershire"
+        self.base_url = "http://www.leics.gov.uk/index/environment/community_services_planning/planning_applications/index/environment/community_services_planning/planning_applications/eplanning_searchform/eplanning_resultpage.htm?sd=%(date)s&ed=%(date)s&kw=&map=f"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format)})
+        soup = BeautifulSoup.BeautifulSoup(response.read())
+
+        if not soup.find(text=re.compile("No Results Found")):
+
+            trs = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:]
+
+            for tr in trs:
+                tds = tr.findAll("td")
+
+                application = PlanningApplication()
+
+                # We can fill in the date received without actually looking at the data
+                application.date_received = search_date
+
+                application.council_reference = tds[0].a.string.strip()
+                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
+                application.address = ', '.join([x for x in tds[1].contents
+                                                 if isinstance(x, BeautifulSoup.NavigableString)])
+                application.postcode = getPostcodeFromText(application.address)
+                application.description = tds[2].string.strip()
+
+                # To get the comment link we need to fetch the info page
+
+                info_response = urllib2.urlopen(application.info_url)
+                info_soup = BeautifulSoup.BeautifulSoup(info_response.read())
+
+                base = info_soup.base['href']
+
+                application.comment_url = urlparse.urljoin(base,
+                                                           info_soup.find("a", target="Planning Application Consultation Form")['href'])
+
+                self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = LeicestershireParser()
+    print parser.getResults(1,9,2008)
+
+
+# TODO
+
+# I suppose we should think about pagination at some point,
+# though I've not managed to find a day with more than 1 app yet...
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index c3ccf12..193b34c 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -62,3 +62,4 @@
 "WestDorset.py", "420"
 "Kirklees.py", "420"
 "Lichfield.py", "420"
+"Leicestershire.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 0f2277c..7d7429f 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -268,3 +268,4 @@
 "West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
 "Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
 "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
+"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
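
A note on the pagination TODO above: if a search day ever returns more
than one page of results, the fetch could be factored into a loop along
these lines. This is only a sketch under assumptions: the "Next" link
text is a guess at markup the site may not actually have, and parse_rows
is a hypothetical stand-in for the per-row parsing that
getResultsByDayMonthYear already does.

    import re
    import urllib2
    import urlparse

    import BeautifulSoup

    def fetch_all_pages(start_url, parse_rows):
        # Follow "next page" links until none remain. parse_rows is a
        # hypothetical callback handed each page's soup; the "Next"
        # link text is an assumption made for illustration only.
        url = start_url
        while url is not None:
            soup = BeautifulSoup.BeautifulSoup(urllib2.urlopen(url).read())
            parse_rows(soup)

            # BeautifulSoup 3 returns a NavigableString from a text
            # search, so the enclosing <a> is reached via .parent.
            next_text = soup.find(text=re.compile("Next"))
            if next_text is not None and next_text.parent.get('href'):
                url = urlparse.urljoin(url, next_text.parent['href'])
            else:
                url = None

Keeping the per-row parsing as a callback would let the existing loop
body in getResultsByDayMonthYear be reused unchanged if pagination ever
turns out to be needed.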