diff --git a/python_scrapers/Harrow.py b/python_scrapers/Harrow.py new file mode 100644 index 0000000..1398af5 --- /dev/null +++ b/python_scrapers/Harrow.py @@ -0,0 +1,74 @@ + +import urllib2 +import urllib +import urlparse + +import datetime, time +import cgi + +from BeautifulSoup import BeautifulSoup + +from PlanningUtils import PlanningApplication, \ + PlanningAuthorityResults, \ + getPostcodeFromText + +import re + +location_re = re.compile("Location:") +date_received_re = re.compile("Date first received:") + +date_format = "%d %b %Y" + +class HarrowParser: + def __init__(self, *args): + + self.authority_name = "London Borough of Harrow" + self.authority_short_name = "Harrow" + + # This is a link to the last seven days applications + # The day, month, and year arguments will be ignored. + self.base_url = "http://www.harrow.gov.uk/www4/planning/dcweek1.asp" + + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + + def getResultsByDayMonthYear(self, day, month, year): + # Now get the search page + response = urllib2.urlopen(self.base_url) + + soup = BeautifulSoup(response.read()) + + # Each application contains the nav string "Application: " + nav_strings = soup.findAll(text="Application: ") + + for nav_string in nav_strings: + application = PlanningApplication() + + application.council_reference = nav_string.findPrevious("tr").findAll("td", limit=2)[1].string.strip() + + application.address = nav_string.findNext(text=location_re).split(":")[1].strip() + application.postcode = getPostcodeFromText(application.address) + + application.description = nav_string.findNext(text="Proposal: ").findNext("td").string.strip() + + application.comment_url = urlparse.urljoin(self.base_url, nav_string.findNext(text="Proposal: ").findNext("a")['href']) + + application.date_received = datetime.datetime.strptime(nav_string.findNext(text=date_received_re).split(": ")[1], date_format).date() + + # FIXME: There is no appropriate info_url for 
the Harrow apps. + # I'll put the base url for the moment, but as that is + # a list of apps from the last 7 days, it will quickly be out of date. + + application.info_url = self.base_url + + self._results.addApplication(application) + + return self._results + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + +if __name__ == '__main__': + parser = HarrowParser() + print parser.getResults(21,5,2008) + diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 4c0c81a..fc33531 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -50,3 +50,4 @@ "Birmingham.py", "420" "KingstonUponThames.py", "420" "Hounslow.py", "420" +"Harrow.py", "420" diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 5ccb92f..023b500 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -254,3 +254,4 @@ "Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser" "Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser" "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser" +"London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"