From 2f155cbc6f2cab20a91b28a3522034157ea58f67 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Mon, 8 Sep 2008 11:31:31 +0000 Subject: [PATCH] Add scraper for Exmoor. Fix name of Herefordshire. --- python_scrapers/Exmoor.py | 70 ++++++++++++++++++++++++++++ python_scrapers/Herefordshire.py | 2 +- python_scrapers/OtherFilesToCopy.csv | 1 + python_scrapers/SitesToGenerate.csv | 3 +- 4 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 python_scrapers/Exmoor.py diff --git a/python_scrapers/Exmoor.py b/python_scrapers/Exmoor.py new file mode 100644 index 0000000..9dd437b --- /dev/null +++ b/python_scrapers/Exmoor.py @@ -0,0 +1,70 @@ +import urllib2 +import urllib +import urlparse + +import datetime, time +import cgi + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import PlanningApplication, \ + PlanningAuthorityResults, \ + getPostcodeFromText + +search_date_format = "%d+%b+%Y" +received_date_format = "%d %b %Y" + +class ExmoorParser: + def __init__(self, *args): + + self.authority_name = "Exmoor National Park" + self.authority_short_name = "Exmoor" + self.base_url = "http://www.exmoor-nationalpark.gov.uk/planning_weekly_list.htm?weeklylist=%s" + + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + + def getResultsByDayMonthYear(self, day, month, year): + search_day = datetime.date(year, month, day) + + response = urllib2.urlopen(self.base_url %(search_day.strftime(search_date_format))) + soup = BeautifulSoup(response.read()) + + # The first contains headers + trs = soup.table.findAll("tr")[1:] + + for tr in trs: + application = PlanningApplication() + + tds = tr.findAll("td") + + application.date_received = datetime.datetime.strptime(tds[0].string, received_date_format).date() + + application.info_url = urllib.unquote(urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href']))) + application.council_reference = tds[1].a.string.strip() + application.address = tds[2].a.string.strip() + application.postcode = getPostcodeFromText(application.address) + + # Now fetch the info url + + info_response = urllib.urlopen(application.info_url) + info_soup = BeautifulSoup(info_response.read()) + + application.description = info_soup.find(text="Proposal:").findNext("td").string.strip() + + try: + application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text="Comment").parent['href']) + except: + application.comment_url = "No Comments" + + self._results.addApplication(application) + + return self._results + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + +if __name__ == '__main__': + parser = ExmoorParser() + print parser.getResults(1,8,2008) + diff --git a/python_scrapers/Herefordshire.py b/python_scrapers/Herefordshire.py index 19cfbfa..138a042 100644 --- a/python_scrapers/Herefordshire.py +++ b/python_scrapers/Herefordshire.py @@ -20,7 +20,7 @@ class HerefordshireParser: def __init__(self, *args): - self.authority_name = "Herefordshire County Council" + self.authority_name = "Herefordshire Council" self.authority_short_name = "Herefordshire" self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0" #As we are going to the info page, we may as well pick up the comment url from there. diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 1bf9904..8620687 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -56,3 +56,4 @@ "Hampshire.py", "420" "Hastings.py", "420" "Herefordshire.py", "420" +"Exmoor.py", "420" diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 7e36475..57c7bcd 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -259,4 +259,5 @@ "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" -"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" +"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" +"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"