Procházet zdrojové kódy

Add scraper for Broxtowe.

import/raw
duncan.parkes před 16 roky
rodič
revize
77e9d3388f
3 změnil soubory, kde provedl 115 přidání a 0 odebrání
  1. +113
    -0
      trunk/python_scrapers/Broxtowe.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 113
- 0
trunk/python_scrapers/Broxtowe.py Zobrazit soubor

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# Day/month/year date format. Used below both to format the week-ending
# date submitted in the search form and to parse the received date shown
# in each result table.
date_format = "%d/%m/%Y"

class BroxtoweParser:
    """Scraper for the weekly planning application list published by
    Broxtowe Borough Council at http://planning.broxtowe.gov.uk.

    Applications are collected into a single PlanningAuthorityResults
    object created in __init__; it is never reset, so repeated searches
    on the same instance accumulate into the same results object.
    """

    # Safety limit on the number of result pages fetched for one search,
    # in case the paging information on the site is ever misleading.
    max_passes = 10

    def __init__(self, *args):
        """Set up the authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"

        # Template for an application's detail page; %s is the URL-quoted
        # council reference.
        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetch_soup(self, url, data=None):
        """Fetch url (POSTing data if given) and return (final_url, soup).

        The HTTP response is always closed, even if reading or parsing
        raises.
        """
        response = urllib2.urlopen(url, data)
        try:
            return response.url, BeautifulSoup(response.read())
        finally:
            response.close()

    def _parse_application(self, result_table):
        """Build a PlanningApplication from one result table of the list."""
        application = PlanningApplication()

        # The address is in the bold element preceding the table, with its
        # lines separated by carriage returns.
        application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
        application.postcode = getPostcodeFromText(application.address)

        trs = result_table.findAll("tr")

        # The value of each row is in its second cell.
        application.council_reference = trs[0].findAll("td")[1].string.strip()
        application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
        application.description = trs[3].findAll("td")[1].string.strip()

        application.info_url = self.info_url % (urllib.quote(application.council_reference))

        # In order to avoid having to do a download for every app,
        # the comment url is set to be the same as the info_url.
        # There is a comment page which can be got to by pressing a button
        # on that page.
        application.comment_url = application.info_url

        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the weekly list covering the given date.

        day, month and year are integers. Every application on the weekly
        list for that week is added to the results; they are not filtered
        down to the single requested day.

        Returns the PlanningAuthorityResults object held by this parser.
        """
        search_day = datetime.date(year, month, day)

        # Now get the search page
        search_url, get_soup = self._fetch_soup(self.base_url)

        # These are the inputs with a default value
        inputs_needed = [
            (x['id'], x['value'])
            for x in get_soup.form.findAll(
                "input",
                value=True,
                type=lambda input_type: input_type != "submit",
            )
        ]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the friday after the date searched for.
        # At weekends this will get you the friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())

        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offered
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)
        post_url = search_url

        # In case something goes wrong here, break out of the loop after
        # at most max_passes passes.
        passes = 0

        while True:
            passes += 1

            response_url, post_soup = self._fetch_soup(post_url, urllib.urlencode(post_data))

            for result_table in post_soup.table.findAll("table"):
                self._results.addApplication(self._parse_application(result_table))

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes >= self.max_passes or page_no >= total_pages:
                break

            # Simulate pressing the "next page" link of the form.
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

            post_url = urlparse.urljoin(response_url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of
        the results.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape the weekly list covering 3 October 2008
    # and write the resulting output to stdout.
    broxtowe = BroxtoweParser()
    xml_output = broxtowe.getResults(3, 10, 2008)
    print(xml_output)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -65,3 +65,4 @@
"Leicestershire.py", "420"
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"

Načítá se…
Zrušit
Uložit