From e39114078f5fe9f8cf011f94b7abcf317d22cc6f Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Tue, 14 Oct 2008 14:50:46 +0000
Subject: [PATCH] Add parser for Kirklees. Get rid of some unnecessary imports.

---
 python_scrapers/Aberdeenshire.py     |  2 +-
 python_scrapers/Berwick.py           |  2 +-
 python_scrapers/Kirklees.py          | 85 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/SitesToGenerate.csv  |  1 +
 5 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 python_scrapers/Kirklees.py

diff --git a/python_scrapers/Aberdeenshire.py b/python_scrapers/Aberdeenshire.py
index 7716230..abee6a0 100644
--- a/python_scrapers/Aberdeenshire.py
+++ b/python_scrapers/Aberdeenshire.py
@@ -3,7 +3,7 @@
 import urllib2
 import urllib
 import urlparse
-import datetime, time
+import datetime
 import cgi
 import re
 
diff --git a/python_scrapers/Berwick.py b/python_scrapers/Berwick.py
index 50c76b9..faff353 100644
--- a/python_scrapers/Berwick.py
+++ b/python_scrapers/Berwick.py
@@ -3,7 +3,7 @@
 import urllib2
 import urllib
 import urlparse
-import datetime, time
+import datetime
 import cgi
 
 from BeautifulSoup import BeautifulSoup
diff --git a/python_scrapers/Kirklees.py b/python_scrapers/Kirklees.py
new file mode 100644
index 0000000..370b47f
--- /dev/null
+++ b/python_scrapers/Kirklees.py
@@ -0,0 +1,85 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+search_date_format = "%d%%2F%m%%2F%Y"
+received_date_format = "%d %b %Y"
+
+class KirkleesParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Kirklees Council"
+        self.authority_short_name = "Kirklees"
+        self.base_url = "http://www.kirklees.gov.uk/business/planning/List.asp?SrchApp=&SrchName=&SrchPostCode=&SrchStreet=&SrchDetails=&SrchLocality=&RorD=A&SrchDteFr=%(date)s&SrchDteTo=%(date)s&Submit=Search&pageNum=%(pagenum)d"
+        self.comments_email_address = "planning.contactcentre@kirklees.gov.uk"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        pagenum = 1
+
+        while pagenum:
+            response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format),
+                                                       "pagenum": pagenum}
+                                       )
+            soup = BeautifulSoup.BeautifulSoup(response.read())
+
+            # This is not a nice way to find the results table, but I can't
+            # see anything good to use, and it works...
+
+            # There are two trs with style attributes per app. This will find all the first ones of the pairs.
+            trs = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="").findAll("tr", style=True)[::2]
+
+            for tr in trs:
+                tds = tr.findAll("td")
+                date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()
+
+                # Stop looking through the list if we have found one which is earlier than the date searched for.
+                if date_received < search_date:
+                    # If we break out, then we won't want the next page
+                    pagenum = None
+                    break
+
+                application = PlanningApplication()
+                application.date_received = date_received
+
+                application.council_reference = tds[0].small.string.strip()
+
+                # The second contains the address, split up with <br/>s
+                application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
+                application.postcode = getPostcodeFromText(application.address)
+
+                application.description = tds[2].string.strip()
+
+                application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
+                application.comment_url = self.comments_email_address
+
+                self._results.addApplication(application)
+            else:
+                # If we got through the whole list without breaking out,
+                # then we'll want to get the next page.
+                pagenum += 1
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = KirkleesParser()
+    print parser.getResults(1,10,2008)
+
+# TODO
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index dcdda49..965da04 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -60,3 +60,4 @@
 "Eastbourne.py", "420"
 "Gosport.py", "420"
 "WestDorset.py", "420"
+"Kirklees.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 2c4bfa8..d802505 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -266,3 +266,4 @@
 "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
 "West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
+"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"