From 98f49f172f35faf88da5a870c36efaf27ba02e98 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Tue, 12 Aug 2008 12:34:20 +0000 Subject: [PATCH] Add scraper for Hounslow. --- python_scrapers/Hounslow.py | 72 ++++++++++++++++++++++++++++ python_scrapers/OtherFilesToCopy.csv | 1 + python_scrapers/SitesToGenerate.csv | 1 + 3 files changed, 74 insertions(+) create mode 100644 python_scrapers/Hounslow.py diff --git a/python_scrapers/Hounslow.py b/python_scrapers/Hounslow.py new file mode 100644 index 0000000..2a42076 --- /dev/null +++ b/python_scrapers/Hounslow.py @@ -0,0 +1,72 @@ +import urllib2 +import urllib +import urlparse + +import datetime, time +import cgi + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import PlanningApplication, \ + PlanningAuthorityResults, \ + getPostcodeFromText + +date_format = "%d/%m/%Y" + +class HounslowParser: + def __init__(self, *args): + + self.authority_name = "London Borough of Hounslow" + self.authority_short_name = "Hounslow" + self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500" + # Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty. + + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + + def getResultsByDayMonthYear(self, day, month, year): + search_day = datetime.date(year, month, day) + + # Now get the search page + response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)}) + soup = BeautifulSoup(response.read()) + + # Results are shown in a table each. The tables don't have any nice + # attributes, but they do all contain a NavString "Application", + # and nothing else does... 
+ nav_strings = soup.findAll(text="Application") + + for nav_string in nav_strings: + result_table = nav_string.findPrevious("table") + + application = PlanningApplication() + application.date_received = search_day + + links = result_table.findAll("a") + + # We can get OSGB coordinates from the link to streetmap + map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3]) + + application.osgb_x = map_qs_dict.get("x")[0] + application.osgb_y = map_qs_dict.get("y")[0] + + application.council_reference = links[1].string.strip() + application.info_url = urlparse.urljoin(self.base_url, links[1]['href']) + application.comment_url = urlparse.urljoin(self.base_url, links[2]['href']) + + application.address = ' '.join(links[0].previous.strip().split()) + application.postcode = getPostcodeFromText(application.address) + + application.description = links[2].previous.strip() + + self._results.addApplication(application) + + return self._results + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + +if __name__ == '__main__': + parser = HounslowParser() + print parser.getResults(1,8,2008) + diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 6fa19ef..4c0c81a 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -49,3 +49,4 @@ "Berwick.py", "420" "Birmingham.py", "420" "KingstonUponThames.py", "420" +"Hounslow.py", "420" diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index b6536f5..5ccb92f 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -253,3 +253,4 @@ "Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser" "Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser" "Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser" +"London 
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"