
Add scraper for Broxtowe.

import/raw
duncan.parkes 16 years ago
parent
commit
77e9d3388f
3 changed files with 115 additions and 0 deletions
  1. +113 -0  trunk/python_scrapers/Broxtowe.py
  2. +1 -0    trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1 -0    trunk/python_scrapers/SitesToGenerate.csv

+ 113 - 0   trunk/python_scrapers/Broxtowe.py

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class BroxtoweParser:
    def __init__(self, *args):
        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"

        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        # These are the inputs with a default value
        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]
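        # On this ASP.NET form, the scan above also picks up the hidden
        # __VIEWSTATE and __EVENTVALIDATION state fields, which the server
        # expects to be posted back with every form submission.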

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the Friday after the date searched for.
        # At weekends this will get you the Friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())
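        # weekday() runs from 0 (Monday) to 6 (Sunday), so 4 - weekday()
        # is the offset to Friday of the same week: e.g. 2 for a Wednesday,
        # and -1 for a Saturday, which steps back to the previous Friday.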

inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

# We'd like as many results as we can get away with on one page.
# 50 is the largest option offerend
inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

post_data = dict(inputs_needed)
post_url = get_response.url
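        # Post back to wherever the GET actually landed, so that any
        # redirect from the base URL is respected.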

        # In case something goes wrong here, let's break out of the loop
        # after at most 10 passes.
        passes = 0

        while True:
            passes += 1

            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            post_soup = BeautifulSoup(post_response.read())

            result_tables = post_soup.table.findAll("table")
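            # Each application in the weekly list is laid out as a nested
            # table inside the main results table, with its address held
            # in a preceding <b> element.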

            for result_table in result_tables:
                application = PlanningApplication()

                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                trs = result_table.findAll("tr")

                application.council_reference = trs[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = trs[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url % urllib.quote(application.council_reference)

                # In order to avoid having to do a download for every app,
                # set the comment url to be the same as the info_url.
                # There is a comment page, reachable by pressing a button
                # on the detail page.
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])
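            # Setting __EVENTTARGET to "hlbNext" and echoing back the fresh
            # __VIEWSTATE and __EVENTVALIDATION values reproduces the
            # javascript postback that the page's "next" link would fire.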

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = BroxtoweParser()
    print parser.getResults(3, 10, 2008)


+ 1 - 0   trunk/python_scrapers/OtherFilesToCopy.csv

@@ -65,3 +65,4 @@
"Leicestershire.py", "420"
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"

+ 1 - 0   trunk/python_scrapers/SitesToGenerate.csv

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
