浏览代码

Add scraper for Broxtowe.

import/raw
duncan.parkes 17 年前
父节点
当前提交
77e9d3388f
共有 3 个文件被更改,包括 115 次插入和 0 次删除
  1. +113
    -0
      trunk/python_scrapers/Broxtowe.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 113
- 0
trunk/python_scrapers/Broxtowe.py 查看文件

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strptime/strftime pattern for dates on the Broxtowe planning site,
# e.g. "03/10/2008" (day/month/year).
date_format = "%d/%m/%Y"

class BroxtoweParser:
    """Scraper for planning applications published by Broxtowe Borough Council.

    The council site is an ASP.NET application searched by weekly list, so
    results for a single day are obtained by requesting the week containing
    that day and walking every page of the result set.
    """

    def __init__(self, *args):
        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"

        # Front page carrying the search form (also the first POST target).
        self.base_url = "http://planning.broxtowe.gov.uk"

        # Per-application detail page, keyed by the council reference.
        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name,
                                                 self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch all applications from the weekly list covering the given date.

        Returns the accumulated PlanningAuthorityResults object.
        """
        target_date = datetime.date(year, month, day)

        # Fetch the search form so its default hidden fields can be replayed.
        home_response = urllib2.urlopen(self.base_url)
        home_soup = BeautifulSoup(home_response.read())

        # Collect every non-submit input that carries a default value.
        form_fields = [(field['id'], field['value'])
                       for field in home_soup.form.findAll(
                           "input", value=True, type=lambda t: t != "submit")]

        # Press the weekly-list search button.
        form_fields.append(('cmdWeeklyList', 'Search Database'))

        # The weekly list is keyed by the Friday after the searched date.
        # At weekends this lands on the previous Friday instead, which is
        # harmless as no applications appear then.
        list_friday = target_date + datetime.timedelta(4 - target_date.weekday())
        form_fields.append(("ddlWeeklyList", list_friday.strftime(date_format)))

        # Ask for the largest page size the site offers (50).
        form_fields.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(form_fields)
        post_url = home_response.url

        # Safety valve so a broken pager can never loop forever.
        page_fetches = 0

        while True:
            page_fetches += 1

            page_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            page_soup = BeautifulSoup(page_response.read())

            # Each application sits in its own nested table.
            for app_table in page_soup.table.findAll("table"):
                application = PlanningApplication()

                # The address is the preceding bold element, CR-separated.
                address_text = app_table.findPrevious("b").string.strip()
                application.address = ', '.join(address_text.split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                rows = app_table.findAll("tr")

                application.council_reference = rows[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(
                    rows[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = rows[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url % (urllib.quote(application.council_reference))

                # Commenting happens via a button on the detail page, so the
                # info URL is reused rather than downloading every application.
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Where are we in the pager?
            current_page = int(page_soup.find("span", id="lblPageNo").b.string)
            page_total = int(page_soup.find("span", id="lblTotalPages").b.string)

            # Stop at the fetch limit (10 passes) or on the last page.
            if page_fetches > 10 or current_page >= page_total:
                break

            # Simulate clicking the "next page" link on the ASP.NET form.
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", page_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", page_soup.find("input", id="__EVENTVALIDATION")['value']),
                ]

            post_url = urlparse.urljoin(page_response.url, page_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        """Return the day's applications serialized as XML."""
        day_results = self.getResultsByDayMonthYear(int(day), int(month), int(year))
        return day_results.displayXML()

if __name__ == '__main__':
parser = BroxtoweParser()
print parser.getResults(3,10,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv 查看文件

@@ -65,3 +65,4 @@
"Leicestershire.py", "420" "Leicestershire.py", "420"
"Cairngorms.py", "420" "Cairngorms.py", "420"
"Calderdale.py", "420" "Calderdale.py", "420"
"Broxtowe.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv 查看文件

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser" "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser" "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser" "Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"

正在加载...
取消
保存