From 968b5c5040fd879177868eb3f027a66831d18d44 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 8 Aug 2008 11:31:17 +0000
Subject: [PATCH] Add Aberdeenshire parser.

---
 python_scrapers/Aberdeenshire.py     | 127 +++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/SitesToGenerate.csv  |   1 +
 3 files changed, 129 insertions(+)
 create mode 100644 python_scrapers/Aberdeenshire.py

diff --git a/python_scrapers/Aberdeenshire.py b/python_scrapers/Aberdeenshire.py
new file mode 100644
index 0000000..7716230
--- /dev/null
+++ b/python_scrapers/Aberdeenshire.py
@@ -0,0 +1,127 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+import re
+
+comment_re = re.compile("Submit Comment")
+mapref_re = re.compile("Map Ref")
+
+import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+
+def _clean_text(text):
+    """Return text with non-breaking spaces made plain and the ends stripped."""
+    # Cells are padded with non-breaking spaces; cope with both the literal
+    # entity and the already-decoded character.
+    return text.replace("&nbsp;", " ").replace(u"\xa0", " ").strip()
+
+
+def _fetch_soup(url):
+    """Download url and return it parsed as a BeautifulSoup document."""
+    response = urllib2.urlopen(url)
+    try:
+        return BeautifulSoup.BeautifulSoup(response.read())
+    finally:
+        # urllib2 responses are not context managers in Python 2.
+        response.close()
+
+
+class AberdeenshireParser:
+    """Scraper for the Aberdeenshire Council planning application search."""
+
+    def __init__(self, *args):
+        # Positional arguments are accepted but not used.
+        self.authority_name = "Aberdeenshire Council"
+        self.authority_short_name = "Aberdeenshire"
+        self.base_url = "http://www.aberdeenshire.gov.uk/planning/apps/search.asp?startDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&endDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&Submit=Search"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Collect the applications received on the given date.
+
+        day, month and year are integers.  Every page of search results is
+        followed and each application's info page is downloaded as well.
+        Returns the PlanningAuthorityResults object held on this parser.
+        """
+        search_day = datetime.date(year, month, day)
+
+        # Not called "next": that would shadow the builtin.
+        next_url = self.base_url % {"day": day,
+                                    "month": month,
+                                    "year": year,
+                                    }
+
+        while next_url:
+            soup = _fetch_soup(next_url)
+
+            # A page without a table has no result rows to read.
+            if soup.table is None:
+                break
+
+            for tr in soup.table.findAll("tr")[1:]:  # First one is just headers
+                self._results.addApplication(self._parseRow(tr, search_day))
+
+            # Carry on only while the "next" text is a link.
+            next_text = soup.find(text="next")
+            if next_text is not None and next_text.parent.name == 'a':
+                next_url = urlparse.urljoin(self.base_url, next_text.parent['href'])
+            else:
+                next_url = None
+
+        return self._results
+
+    def _parseRow(self, tr, search_day):
+        """Build a PlanningApplication from one row of the results table."""
+        application = PlanningApplication()
+
+        application.date_received = search_day
+        application.council_reference = tr.a.string
+        application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+
+        tds = tr.findAll("td")
+
+        # Compare the exact type: BeautifulSoup's Comment, CData etc. subclass
+        # NavigableString and must not end up in the address.
+        address_parts = [_clean_text(x) for x in tds[2].contents
+                         if type(x) is BeautifulSoup.NavigableString]
+        application.address = ' '.join([part for part in address_parts if part])
+        application.postcode = getPostcodeFromText(application.address)
+        application.description = _clean_text(tds[4].string)
+
+        # Get the info page in order to find the comment url
+        # we could do this without a download if it wasn't for the
+        # sector parameter - I wonder what that is?
+        info_soup = _fetch_soup(application.info_url)
+
+        comment_navstring = info_soup.find(text=comment_re)
+        if comment_navstring:
+            application.comment_url = urlparse.urljoin(self.base_url, comment_navstring.parent['href'])
+        else:
+            application.comment_url = "No Comments"
+
+        # While we're at it, let's get the OSGB.  An info page without a map
+        # reference leaves the coordinates as PlanningApplication() made them.
+        mapref_navstring = info_soup.find(text=mapref_re)
+        if mapref_navstring:
+            mapref = mapref_navstring.findNext("a").string
+            application.osgb_x, application.osgb_y = [x.strip() for x in mapref.strip().split(",")]
+
+        return application
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the date; arguments go through int()."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = AberdeenshireParser()
+    print parser.getResults(7,8,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index c84570e..c8b1be0 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -43,3 +43,4 @@
 "Redbridge.pl", "493"
 "Redbridge.cgi", "493"
 "AmberValley.py", "420"
+"Aberdeenshire.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index b1c4021..c825531 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -247,3 +247,4 @@
 "London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
 "London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
 "Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
+"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"