From 33a1ee02abc7e671c07425cde1093c5eb2d1f108 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 5 Sep 2008 14:14:10 +0000
Subject: [PATCH] Add Hampshire scraper.

---
 python_scrapers/Hampshire.py         | 78 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |  1 +
 python_scrapers/SitesToGenerate.csv  |  1 +
 3 files changed, 80 insertions(+)
 create mode 100644 python_scrapers/Hampshire.py

diff --git a/python_scrapers/Hampshire.py b/python_scrapers/Hampshire.py
new file mode 100644
index 0000000..d29e052
--- /dev/null
+++ b/python_scrapers/Hampshire.py
@@ -0,0 +1,78 @@
+"""
+This is the scraper for Hampshire.
+
+There appears to be no way to search by date received, so what we'll do is
+go to the currently open for consultation page and just use that.
+
+I don't think we need to worry about pagination, as there are hardly any.
+
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class HampshireParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Hampshire County Council"
+        self.authority_short_name = "Hampshire"
+        self.base_url = "http://www3.hants.gov.uk/planning/mineralsandwaste/planning-applications/applications/applications-open.htm"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        # Now get the search page
+        response = urllib2.urlopen(self.base_url)
+        soup = BeautifulSoup(response.read())
+
+        trs = soup.table.table.findAll("tr", {"class": re.compile("(?:odd)|(?:even)")})
+
+
+        for tr in trs:
+            application = PlanningApplication()
+
+            tds = tr.findAll("td")
+
+            application.council_reference = tds[0].a.string.strip()
+            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
+            application.address = tds[2].string.strip()
+            application.postcode = getPostcodeFromText(application.address)
+            application.description = tds[3].string.strip()
+
+            # Fetch the info url in order to get the date received and the comment url
+
+            info_response = urllib2.urlopen(application.info_url)
+
+            info_soup = BeautifulSoup(info_response.read())
+
+            application.date_received = datetime.datetime.strptime(info_soup.find(text=re.compile("\s*Received:\s*")).findNext("td").string.strip(), date_format).date()
+
+            application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("input", value="Comment on this application").parent['action'])
+
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = HampshireParser()
+    print parser.getResults(21,5,2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 8aec0ed..4930687 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -53,3 +53,4 @@
 "Harrow.py", "420"
 "Westminster.py", "420"
 "Halton.py", "420"
+"Hampshire.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 311c2ed..079c7a0 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -257,3 +257,4 @@
 "London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
 "Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
 "Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
+"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"