From dec1a6984babd5c62f6f8e44f108314aad941055 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Fri, 23 Nov 2007 20:08:43 +0000 Subject: [PATCH] Add a RutlandLike scraper This will handle Rutland, Melton, and any other sites we find which look like them. East Northants and Harborough both also have sites of this sort, but I suspect they are being replaced by PublicAccess... --- python_scrapers/OtherFilesToCopy.csv | 3 +- python_scrapers/PublicAccess.py | 6 +- python_scrapers/RutlandLike.py | 97 ++++++++++++++++++++++++++++ python_scrapers/SitesToGenerate.csv | 4 ++ 4 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 python_scrapers/RutlandLike.py diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 5457d3c..b95ed11 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -10,4 +10,5 @@ "Broxbourne.cgi", "493" "EastHerts.cgi", "493" "NorthHerts.cgi", "493" -"Enfield.cgi", "493" \ No newline at end of file +"Enfield.cgi", "493" +"RutlandLike.py", "420" diff --git a/python_scrapers/PublicAccess.py b/python_scrapers/PublicAccess.py index c1b9a86..796a22d 100644 --- a/python_scrapers/PublicAccess.py +++ b/python_scrapers/PublicAccess.py @@ -350,10 +350,10 @@ class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): if __name__ == '__main__': - day = 31 - month = 8 + day = 20 + month = 11 year = 2007 - parser = PublicAccessParser("Bristol", "Bristol", "http://e2eweb.bristol-city.gov.uk/PublicAccess/tdc/", True) + parser = PublicAccessParser("Hambleton", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True) print parser.getResults(day, month, year) diff --git a/python_scrapers/RutlandLike.py b/python_scrapers/RutlandLike.py new file mode 100644 index 0000000..9401951 --- /dev/null +++ b/python_scrapers/RutlandLike.py @@ -0,0 +1,97 @@ +import urllib2 +import urllib +import urlparse + +import datetime +#import re + +from BeautifulSoup import 
BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+# Where the council reference fills the gap
+comment_url_end = "comment.asp?%s"
+
+#comment_regex = re.compile("Comment on this ")
+
+
+class RutlandLikeParser:
+    """Scraper for planning sites laid out like Rutland's search page.
+
+    A search restricted to a single day is sent to base_url as form data,
+    and the rows of the first matching results table are turned into
+    PlanningApplication objects.
+    """
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+        """Remember the authority details and create an empty result set.
+
+        authority_name       -- long name, passed to PlanningAuthorityResults
+        authority_short_name -- short name, passed to PlanningAuthorityResults
+        base_url             -- URL of the search page; also the base against
+                                which relative links in the results are resolved
+        debug                -- stored on the instance; not read anywhere in
+                                this class
+        """
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.debug = debug
+
+        # Created once per parser, so applications from successive searches
+        # on the same instance accumulate in this one object.
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Search for applications on the given date and collect them.
+
+        day, month and year are integers (datetime.date raises ValueError
+        for an impossible date).  Returns the parser's
+        PlanningAuthorityResults object with one PlanningApplication added
+        per usable results row.
+        """
+        search_date = datetime.date(year, month, day)
+        date_string = search_date.strftime(date_format)
+
+        # The same date is used for both ends of the range, so the search
+        # covers exactly one day.
+        search_data = urllib.urlencode({"reference": "",
+                                        "undecided": "yes",
+                                        "dateFrom": date_string,
+                                        "dateTo": date_string,
+                                        "Address": "",
+                                        "validate": "true",
+                                        })
+
+
+        # Supplying data makes urllib2 send the request as a POST.
+        request = urllib2.Request(self.base_url, search_data)
+        response = urllib2.urlopen(request)
+
+        try:
+            html = response.read()
+        finally:
+            # Release the connection even if reading the body fails.
+            response.close()
+
+        soup = BeautifulSoup(html)
+
+        tables = soup.findAll("table", {"style": "width:auto;"})
+
+        # No matching table on the page: there is nothing to add.
+        if not tables:
+            return self._results
+
+        # We don't want the first or last tr
+        trs = tables[0].findAll("tr")[1:-1]
+
+        for tr in trs:
+            app = PlanningApplication()
+
+            tds = tr.findAll("td")
+
+            # Only rows with exactly four cells are used; any other row
+            # is skipped.  The fourth cell is not read.
+            if len(tds) == 4:
+                local_info_url = tds[0].a['href']
+                app.info_url = urlparse.urljoin(self.base_url, local_info_url)
+                app.council_reference = tds[0].a.string
+
+                app.address = tds[1].string
+                app.postcode = getPostcodeFromText(app.address)
+
+                app.description = tds[2].string
+
+                app.comment_url = urlparse.urljoin(self.base_url, comment_url_end % app.council_reference)
+                # The search covered a single day, so that day is recorded
+                # as the date received.
+                app.date_received = search_date
+
+                self._results.addApplication(app)
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        """Run the search for the given date and return displayXML()'s output.
+
+        day, month and year may be anything int() accepts.
+        """
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+if
__name__ == '__main__':
+    # Manual check: run one search and print what getResults() returns.
+    # NOTE(review): the names say Rutland but the URL is the one listed for
+    # Melton in the SitesToGenerate.csv rows added below - confirm which
+    # site was intended.
+    rutland_parser = RutlandLikeParser("Rutland long", "Rutland", "http://www.meltononline.co.uk/planning/searchparam.asp")
+
+    print rutland_parser.getResults(15,11,2007)
+
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index ea1c042..c11b358 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -137,3 +137,7 @@
 "Chester-le-Street District Council", "Chester-le-Street", "http://planning.chester-le-street.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Vale of the White Horse District Council", "Vale of the White Horse", "http://planning.whitehorsedc.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Corby Borough Council", "Corby", "https://publicaccess.corby.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Rutland County Council", "Rutland", "http://www.rutland.gov.uk/wellandplanning/searchparam.asp", "RutlandLike", "RutlandLikeParser"
+"Melton Borough Council", "Melton", "http://www.meltononline.co.uk/planning/searchparam.asp", "RutlandLike", "RutlandLikeParser"
+"Harborough District Council", "Harborough", "http://pa.harborough.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"East Northamptonshire Council", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"