From 42d99b3e7fde1d3b5c8700cc3d1c5304e48cf8fb Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 16 Oct 2008 10:24:44 +0000
Subject: [PATCH] Add scraper for Broxtowe.

---
 python_scrapers/Broxtowe.py          | 113 +++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/SitesToGenerate.csv  |   1 +
 3 files changed, 115 insertions(+)
 create mode 100644 python_scrapers/Broxtowe.py

diff --git a/python_scrapers/Broxtowe.py b/python_scrapers/Broxtowe.py
new file mode 100644
index 0000000..3f3dd2b
--- /dev/null
+++ b/python_scrapers/Broxtowe.py
@@ -0,0 +1,113 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class BroxtoweParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Broxtowe Borough Council"
+        self.authority_short_name = "Broxtowe"
+        self.base_url = "http://planning.broxtowe.gov.uk"
+
+        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"
+
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        # Now get the search page
+        get_response = urllib2.urlopen(self.base_url)
+        get_soup = BeautifulSoup(get_response.read())
+
+        # These are the inputs with a default value
+        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]
+
+        # Add the submit button
+        inputs_needed.append(('cmdWeeklyList', 'Search Database'))
+
+        # We also need to add the date we want to search for.
+        # This is the Friday after the date searched for.
+        # At weekends this will get you the Friday before, but that isn't
+        # a problem as there are no applications then.
+        friday = search_day + datetime.timedelta(4 - search_day.weekday())
+
+        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))
+
+        # We'd like as many results as we can get away with on one page.
+        # 50 is the largest option offered.
+        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))
+
+        post_data = dict(inputs_needed)
+        post_url = get_response.url
+
+        # In case something goes wrong here, let's break out of the loop after at most 10 passes.
+        passes = 0
+
+        while True:
+            passes += 1
+
+            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
+            post_soup = BeautifulSoup(post_response.read())
+
+            result_tables = post_soup.table.findAll("table")
+
+            for result_table in result_tables:
+                application = PlanningApplication()
+
+                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
+                application.postcode = getPostcodeFromText(application.address)
+
+                trs = result_table.findAll("tr")
+
+                application.council_reference = trs[0].findAll("td")[1].string.strip()
+                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
+                application.description = trs[3].findAll("td")[1].string.strip()
+
+                application.info_url = self.info_url % urllib.quote(application.council_reference)
+
+                # In order to avoid having to do a download for every app,
+                # I'm setting the comment url to be the same as the info_url.
+                # There is a comment page which can be reached by pressing a button there.
+                application.comment_url = application.info_url
+
+                self._results.addApplication(application)
+
+            # Which page are we on?
+            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
+            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)
+
+            if passes > 10 or page_no >= total_pages:
+                break
+
+            post_data = [
+                ("__EVENTTARGET", "hlbNext"),
+                ("__EVENTARGUMENT", ""),
+                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
+                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
+            ]
+
+            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = BroxtoweParser()
+    print parser.getResults(3, 10, 2008)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 4a0873f..2d225ac 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -65,3 +65,4 @@
 "Leicestershire.py", "420"
 "Cairngorms.py", "420"
 "Calderdale.py", "420"
+"Broxtowe.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index f8025a9..a7248e6 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -271,3 +271,4 @@
 "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
 "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
 "Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
+"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
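A note on the weekly-list date posted above: ddlWeeklyList expects the Friday that closes the week containing the search date. Since date.weekday() returns 0 for Monday through 6 for Sunday, the offset 4 - weekday() is zero or positive on weekdays and negative at weekends, so Saturday and Sunday dates fall back to the Friday just gone. A minimal standalone sketch of that behaviour (friday_for is an illustrative helper, not part of the patch):

import datetime

def friday_for(search_day):
    # Illustrative helper: 0 = Monday ... 6 = Sunday, so 4 - weekday()
    # is negative at weekends and those dates map to the previous Friday.
    return search_day + datetime.timedelta(4 - search_day.weekday())

assert friday_for(datetime.date(2008, 10, 1)) == datetime.date(2008, 10, 3)  # Wednesday -> that week's Friday
assert friday_for(datetime.date(2008, 10, 3)) == datetime.date(2008, 10, 3)  # Friday -> itself
assert friday_for(datetime.date(2008, 10, 4)) == datetime.date(2008, 10, 3)  # Saturday -> previous Friday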
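The paging loop reproduces the ASP.NET WebForms postback that clicking the "next" link (hlbNext) would trigger in a browser: it posts __EVENTTARGET and __EVENTARGUMENT together with the current page's __VIEWSTATE and __EVENTVALIDATION tokens back to the form's action URL. A minimal sketch of that pattern, factored out of the loop above (next_page is an illustrative helper, not part of the patch):

import urllib
import urllib2
import urlparse

from BeautifulSoup import BeautifulSoup

def next_page(current_url, current_soup, event_target="hlbNext"):
    # Illustrative helper: reproduce the __doPostBack(event_target, '')
    # that the "next" link runs, echoing back the server's state tokens.
    post_data = urllib.urlencode([
        ("__EVENTTARGET", event_target),
        ("__EVENTARGUMENT", ""),
        ("__VIEWSTATE", current_soup.find("input", id="__VIEWSTATE")['value']),
        ("__EVENTVALIDATION", current_soup.find("input", id="__EVENTVALIDATION")['value']),
    ])
    # The form's action may be relative, so resolve it against the page URL.
    post_url = urlparse.urljoin(current_url, current_soup.find("form")['action'])
    response = urllib2.urlopen(post_url, post_data)
    return response.url, BeautifulSoup(response.read())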