From af50d991f3a0961d699a93eb12b5f00537155c7b Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Fri, 8 Aug 2008 15:54:31 +0000
Subject: [PATCH] Add scraper for Brent.

---
 trunk/python_scrapers/Brent.py             | 129 +++++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |   1 +
 trunk/python_scrapers/SitesToGenerate.csv  |   1 +
 3 files changed, 131 insertions(+)
 create mode 100644 trunk/python_scrapers/Brent.py

diff --git a/trunk/python_scrapers/Brent.py b/trunk/python_scrapers/Brent.py
new file mode 100644
index 0000000..fce1f94
--- /dev/null
+++ b/trunk/python_scrapers/Brent.py
@@ -0,0 +1,129 @@
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class BrentParser:
+    def __init__(self, *args):
+
+        self.authority_name = "London Borough of Brent"
+        self.authority_short_name = "Brent"
+#        self.base_url = "http://www.brent.gov.uk/servlet/ep.ext?extId=101149&byPeriod=Y&st=PL&periodUnits=day&periodMultiples=14"
+        self.base_url = "http://www.brent.gov.uk/servlet/ep.ext"
+
+        self._current_application = None
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        post_data = [
+            ("from", search_day.strftime(date_format)),
+            ("until", search_day.strftime(date_format)),
+            ("EXECUTEQUERY", "Query"),
+#            ("auth", "402"),
+            ("st", "PL"),
+            ("periodUnits", "day"),
+            ("periodMultiples", "14"),
+            ("title", "Search+by+Application+Date"),
+            ("instructions", "Enter+a+date+range+to+search+for+existing+applications+by+the+date+of+application.%0D%0A%3Cbr%3E%3Cbr%3E%0D%0A%3Cstrong%3ENote%3A%3C%2Fstrong%3E+Where+%27%28Applicant%27s+Description%29%27+appears+in+the+proposal%2C+the+text+may+subsequently+be+amended+when+the+application+is+checked."),
+            ("byFormat", "N"),
+            ("byOther1", "N"),
+            ("byOther2", "N"),
+            ("byOther3", "N"),
+            ("byOther4", "N"),
+            ("byOther5", "N"),
+            ("byPostcode", "N"),
+            ("byStreet", "N"),
+            ("byHouseNumber", "N"),
+            ("byAddress", "N"),
+            ("byPeriod", "Y"),
+            ("extId", "101149"), # I wonder what this is...
+            ("queried", "Y"),
+            ("other1Label", "Other1"),
+            ("other2Label", "Other2"),
+            ("other3Label", "Other3"),
+            ("other4Label", "Other4"),
+            ("other5Label", "Other5"),
+            ("other1List", ""),
+            ("other2List", ""),
+            ("other3List", ""),
+            ("other4List", ""),
+            ("other5List", ""),
+            ("periodLabel", "From"),
+            ("addressLabel", "Select+Address"),
+            ("print", "")
+            ]
+
+        # Now get the search page.
+        response = urllib2.urlopen(self.base_url, urllib.urlencode(post_data))
+
+        soup = BeautifulSoup(response.read())
+
+        trs = soup.find(text="Search Results").findNext("table").findAll("tr")[:-1]
+
+        # There are six trs per application, ish.
+
+        # The first contains the case no and the application date.
+        # The second contains the address.
+        # The third contains the description.
+        # The fourth contains the info page link.
+        # The fifth contains the comment link (or a note that comments are currently not being accepted).
+        # The sixth is a spacer.
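+
+        # count tracks our position within the current six-row block: it is
+        # reset to 1 on each "Case No:" row, so the modulo tests below map
+        # every following row onto the right field of the application.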
+
+        count = 0
+        for tr in trs:
+            count += 1
+
+            ref = tr.find(text=re.compile("Case No:"))
+
+            if ref:
+                self._current_application = PlanningApplication()
+                count = 1
+
+                self._current_application.council_reference = ref.split(":")[1].strip()
+                self._current_application.date_received = search_day
+
+            if count % 6 == 2:
+                self._current_application.address = tr.td.string.strip()
+                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
+            if count % 6 == 3:
+                self._current_application.description = tr.td.string.strip()
+            if count % 6 == 4:
+                self._current_application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+            if count % 6 == 5:
+                try:
+                    self._current_application.comment_url = urlparse.urljoin(self.base_url, tr.a['href'])
+                except (TypeError, KeyError):
+                    # No comment link - comments are not currently being accepted. We'll leave this app for the moment - we'll pick it up later if they start accepting comments.
+                    continue
+            if count % 6 == 0 and self._current_application.is_ready():
+                self._results.addApplication(self._current_application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = BrentParser()
+    print parser.getResults(6, 8, 2008)
+
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index c8b1be0..d47dc1d 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -44,3 +44,4 @@
 "Redbridge.cgi", "493"
 "AmberValley.py", "420"
 "Aberdeenshire.py", "420"
+"Brent.py", "420"
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index c825531..fdf069f 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -248,3 +248,4 @@
 "London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
 "Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
 "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
+"London Borough of Brent", "Brent", "", "Brent", "BrentParser"
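
As a standalone illustration (not part of the patch) of the six-rows-per-application
bookkeeping in getResultsByDayMonthYear(), the sketch below walks invented markup
through the same counter-and-modulo logic. The HTML, case number, and address are
made up for the example; the real rows come from brent.gov.uk and may differ in
detail. It runs under the same Python 2 / BeautifulSoup 3 setup the scraper uses.

import re
from BeautifulSoup import BeautifulSoup

# Invented markup mimicking one six-row application block from the results table.
sample = """<table>
<tr><td>Case No: 08/1234</td></tr>
<tr><td>1 Example Road, Wembley HA9 9ZZ</td></tr>
<tr><td>Single storey rear extension</td></tr>
<tr><td><a href="ep.ext?id=1">Details</a></td></tr>
<tr><td><a href="comment?id=1">Comment</a></td></tr>
<tr><td></td></tr>
</table>"""

labels = {1: "case no/date", 2: "address", 3: "description",
          4: "info link", 5: "comment link", 0: "spacer"}

count = 0
for tr in BeautifulSoup(sample).findAll("tr"):
    count += 1
    if tr.find(text=re.compile("Case No:")):
        count = 1  # a "Case No:" row starts a new application block
    print labels[count % 6], "->", tr.renderContents().strip()

Against real search results the same walk fills in PlanningApplication fields
instead of printing; row six closes the block, at which point the finished
application is added to the results set.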