From 7810f01d83cdff5751c9b0fb2d1b62c311721f80 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Wed, 15 Oct 2008 23:53:59 +0000
Subject: [PATCH] Add scraper for Calderdale.

---
 trunk/python_scrapers/Calderdale.py        | 121 ++++++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |   1 +
 trunk/python_scrapers/SitesToGenerate.csv  |   1 +
 3 files changed, 123 insertions(+)
 create mode 100644 trunk/python_scrapers/Calderdale.py

diff --git a/trunk/python_scrapers/Calderdale.py b/trunk/python_scrapers/Calderdale.py
new file mode 100644
index 0000000..74f1833
--- /dev/null
+++ b/trunk/python_scrapers/Calderdale.py
@@ -0,0 +1,121 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime
+
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+# strftime pattern giving dd/mm/yyyy with each "/" already URL-encoded as
+# %2F ("%%" is strftime's escape for a literal percent sign).
+date_format = "%d%%2F%m%%2F%Y"
+
+class CalderdaleParser:
+    """Scraper for the Calderdale Council planning application search."""
+
+    def __init__(self, *args):
+        """Set up the authority names, URL templates and results holder.
+
+        Any positional arguments are accepted and ignored.
+        """
+        self.authority_name = "Calderdale Council"
+        self.authority_short_name = "Calderdale"
+        self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"
+        self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Collect the applications listed for one date.
+
+        day, month and year are integers. Result pages are fetched one
+        after another by following each page's "Next" link; every
+        application found is added to self._results, which is returned.
+        """
+        search_date = datetime.date(year, month, day)
+
+        next_page_url = self.base_url %{"date": search_date.strftime(date_format)}
+
+        while next_page_url:
+            try:
+                response = urllib2.urlopen(next_page_url)
+            except urllib2.HTTPError:
+                # This is what seems to happen if there are no apps
+                break
+
+            # Close each response once it has been read, so connections
+            # are not left open while the remaining pages are fetched.
+            try:
+                soup = BeautifulSoup(response.read())
+            finally:
+                response.close()
+
+            next_link = soup.find(text="Next")
+            if next_link:
+                next_page_url = urlparse.urljoin(self.base_url, next_link.parent['href'])
+            else:
+                next_page_url = None
+
+            # There is an <h3> for each app that we can use
+            for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
+                self._results.addApplication(self._parseApplication(h3, search_date))
+
+        return self._results
+
+
+    def _labelledText(self, details, label):
+        """Return the stripped text straight after the tag whose text is label."""
+        return details.find(text=label).parent.nextSibling.strip()
+
+
+    def _parseApplication(self, h3, search_date):
+        """Build a PlanningApplication from a result heading.
+
+        h3 is the heading element for one application; its fields are read
+        from the first <div> after it. search_date is stored as the date
+        received.
+        """
+        # Every field is read from the same <div>, so find it once.
+        details = h3.findNext("div")
+
+        application = PlanningApplication()
+
+        application.date_received = search_date
+        application.council_reference = h3.string.split(": ")[1]
+        application.description = self._labelledText(details, "Proposal:")
+
+        # The address lines are separated by "\r". Trim each one and drop
+        # empties, so that "\r\n" breaks cannot leave newlines in the result.
+        address_lines = [line.strip() for line in self._labelledText(details, "Address of proposal:").split("\r")]
+        application.address = ', '.join([line for line in address_lines if line])
+        application.postcode = getPostcodeFromText(application.address)
+
+        application.comment_url = urlparse.urljoin(self.base_url, details.find(text=re.compile("Comment on Application")).parent['href'])
+
+        application.info_url = self.info_url %(urllib.quote(application.council_reference))
+
+        # split() must yield exactly two values here: x first, then y.
+        application.osgb_x, application.osgb_y = self._labelledText(details, "Grid Reference:").split()
+
+        return application
+
+
+    def getResults(self, day, month, year):
+        """Pass each argument through int(), scrape that date and return displayXML() of the results."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+if __name__ == '__main__':
+    parser = CalderdaleParser()
+    print parser.getResults(1,10,2008)
+
+# TODO
+
+# 1) Find a better way to deal with the no apps situation.
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index bc3bfb8..4a0873f 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -64,3 +64,4 @@
 "Lichfield.py", "420"
 "Leicestershire.py", "420"
 "Cairngorms.py", "420"
+"Calderdale.py", "420"
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index adb9a18..f8025a9 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -270,3 +270,4 @@
 "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
 "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
 "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
+"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"