From df32136695bb34b1b098ade8ceb1289b3f849796 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sun, 30 Nov 2008 14:55:02 +0000 Subject: [PATCH] Add Thomas' scraper for Solihull. --- python_scrapers/OtherFilesToCopy.csv | 1 + python_scrapers/SitesToGenerate.csv | 1 + python_scrapers/Solihull.py | 81 ++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+) create mode 100644 python_scrapers/Solihull.py diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index c0a1657..d8e2a79 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -68,3 +68,4 @@ "Broxtowe.py", "420" "Mendip.py", "420" "Weymouth.py", "420" +"Solihull.py", "420" \ No newline at end of file diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 57739be..a473f3b 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -274,3 +274,4 @@ "Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser" "Mendip District Council", "Mendip", "", "Mendip", "MendipParser" "Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser" +"Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser" diff --git a/python_scrapers/Solihull.py b/python_scrapers/Solihull.py new file mode 100644 index 0000000..84a8a47 --- /dev/null +++ b/python_scrapers/Solihull.py @@ -0,0 +1,81 @@ +""" +This is the screenscraper for planning apps for +Solihull Metropolitan Borough Council. + +The apps for Solihull are displayed in html pages one per week, starting on Monday. 
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+# Day/month/year layout used to build the weekly-list URL and to parse the
+# "received" date found in each application table.
+date_format = "%d/%m/%Y"
+
+
+class SolihullParser:
+    """Scraper for the planning applications Solihull received on one day."""
+
+    def __init__(self, *args):
+        """Set up the authority names, the search URL and the result holder.
+
+        Any positional arguments are accepted and ignored.
+        """
+        self.authority_name = "Solihull Metropolitan Borough Council"
+        self.authority_short_name = "Solihull"
+        # %s is replaced by the Monday of the wanted week, formatted with
+        # date_format.
+        self.base_url = "http://www.solihull.gov.uk/planning/dc/weeklist.asp?SD=%s&ward=ALL"
+
+        # Created once per parser, so repeated searches on the same instance
+        # keep adding applications to the same results object.
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Collect the applications received on the given date.
+
+        day, month and year are ints. The weekly list that contains the date
+        is downloaded and every application whose received date equals the
+        requested date is added to the parser's results object, which is
+        returned.
+
+        Nothing is caught around the download or the date handling, so
+        urllib2 errors and ValueError (impossible date, or a received date
+        that does not match date_format) reach the caller.
+        """
+        search_day = datetime.date(year, month, day)
+
+        # The site publishes one page per week, addressed by the Monday on
+        # or before the date searched for.
+        monday_before = search_day - datetime.timedelta(search_day.weekday())
+
+        response = urllib2.urlopen(self.base_url % monday_before.strftime(date_format))
+        try:
+            soup = BeautifulSoup(response.read())
+        finally:
+            # Close the connection whether or not the page could be read.
+            response.close()
+
+        for table in soup.findAll("table", width="98%", cellpadding="2"):
+            trs = table.findAll("tr")
+
+            # Check the date first so applications from other days are
+            # dropped before any further work. The whole date is compared,
+            # not only the day of the month.
+            received_text = trs[3].findAll("td")[3].string.strip()
+            date_received = datetime.date(*time.strptime(received_text, date_format)[:3])
+            if date_received != search_day:
+                continue
+
+            application = PlanningApplication()
+            application.date_received = date_received
+            application.council_reference = trs[0].strong.string.strip()
+            application.info_url = urlparse.urljoin(self.base_url, trs[0].a['href'])
+
+            application.address = trs[1].findAll("td")[1].string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+            application.description = trs[2].findAll("td")[1].string.strip()
+
+            try:
+                relative_comment_url = trs[5].findAll("td")[1].a['href']
+            except (IndexError, KeyError, TypeError):
+                # The row, the cell or the link inside it is missing.
+                application.comment_url = "No Comment URL."
+            else:
+                application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url)
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the given date.
+
+        day, month and year may be ints or numeric strings; each is passed
+        through int() before the search.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+if __name__ == '__main__':
+    parser = SolihullParser()
+    # Fixed date for manual testing, following the Barnsley example.
+    # TODO: decide whether a real run should take the date from arguments.
+    print parser.getResults(27, 10, 2008)