Browse Source

Add scraper for Broxtowe.

duncan.parkes 15 years ago
3 changed files with 115 additions and 0 deletions
  1. +113
  2. +1
  3. +1

+ 113
- 0
python_scrapers/ View File

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# strftime/strptime pattern (day/month/year). It is used both to format the
# weekly-list date submitted in the search form and to parse each
# application's received date.
date_format = "%d/%m/%Y"

class BroxtoweParser:
    """Scraper for the Broxtowe Borough Council planning application search.

    The parser loads the search page, submits its weekly-list form for the
    week containing the requested date, and walks the paged results, turning
    each result table into a PlanningApplication.
    """

    def __init__(self, *args):
        """Set up authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"

        # NOTE(review): both URLs are empty in this copy of the file; they
        # look like they were stripped when the page was extracted. base_url
        # must be the search page address, and info_url must contain a single
        # "%s" placeholder because it is filled in with the quoted council
        # reference. Restore the real values before running.
        self.base_url = ""

        self.info_url = ""

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    @staticmethod
    def _friday_of_week(search_day):
        """Return the Friday of the Monday-to-Sunday week containing search_day.

        For Monday to Friday this is the Friday on or after the date. For
        Saturday and Sunday it is the Friday just gone, which is acceptable
        because no applications are expected at weekends.
        """
        return search_day + datetime.timedelta(4 - search_day.weekday())

    @staticmethod
    def _fetch_soup(url, data=None):
        """Fetch url (POSTing data if given) and return (soup, final_url).

        The response is always closed, even if reading or parsing fails.
        """
        response = urllib2.urlopen(url, data)
        try:
            return BeautifulSoup(response.read()), response.url
        finally:
            response.close()

    def _parse_result_table(self, result_table):
        """Build a PlanningApplication from one result table of the listing."""
        application = PlanningApplication()

        # The address sits in the bold element preceding the table, with its
        # parts separated by carriage returns.
        application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
        application.postcode = getPostcodeFromText(application.address)

        trs = result_table.findAll("tr")

        # The value of each row is in its second cell.
        application.council_reference = trs[0].findAll("td")[1].string.strip()
        application.date_received = datetime.datetime.strptime(
            trs[1].findAll("td")[1].string.strip(), date_format).date()
        application.description = trs[3].findAll("td")[1].string.strip()

        application.info_url = self.info_url % (urllib.quote(application.council_reference))

        # In order to avoid having to do a download for every app, the
        # comment url is set to be the same as the info_url. There is a
        # comment page which can be reached by pressing a button there.
        application.comment_url = application.info_url

        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect the applications listed for the week containing the date.

        day, month and year are integers. Returns the parser's results
        container, which accumulates applications across calls on the same
        instance.
        """
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_soup, post_url = self._fetch_soup(self.base_url)

        # These are the inputs with a default value
        inputs_needed = [
            (field['id'], field['value'])
            for field in get_soup.form.findAll(
                "input", value=True, type=lambda input_type: input_type != "submit")
        ]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for, which is the
        # Friday of the week containing the requested date.
        friday = self._friday_of_week(search_day)
        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offered.
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)

        # In case something goes wrong here, stop paging once more than
        # 10 passes have been made.
        passes = 0

        while True:
            passes += 1

            post_soup, response_url = self._fetch_soup(post_url, urllib.urlencode(post_data))

            for result_table in post_soup.table.findAll("table"):
                # NOTE(review): assumes PlanningAuthorityResults exposes
                # addApplication(); confirm against PlanningUtils. Without
                # this call every parsed application is thrown away.
                self._results.addApplication(self._parse_result_table(result_table))

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

            # Ask for the next page by posting back the "next" link target
            # together with the state fields from the page just received.
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

            post_url = urlparse.urljoin(response_url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given date.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

# Manual smoke run: fetch the listing for 3 October 2008 and print the result.
if __name__ == '__main__':
    broxtowe_parser = BroxtoweParser()
    results_output = broxtowe_parser.getResults(3, 10, 2008)
    print(results_output)

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv View File

@@ -65,3 +65,4 @@
"", "420" "", "420"
"", "420" "", "420"
"", "420" "", "420"
"", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv View File

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser" "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser" "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser" "Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
