
Add scraper for Broxtowe.

master
duncan.parkes 16 years ago
parent
commit
42d99b3e7f
3 changed files with 115 additions and 0 deletions
  1. +113 -0  python_scrapers/Broxtowe.py
  2. +1 -0    python_scrapers/OtherFilesToCopy.csv
  3. +1 -0    python_scrapers/SitesToGenerate.csv

+113 -0  python_scrapers/Broxtowe.py

@@ -0,0 +1,113 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class BroxtoweParser:
    def __init__(self, *args):

        self.authority_name = "Broxtowe Borough Council"
        self.authority_short_name = "Broxtowe"
        self.base_url = "http://planning.broxtowe.gov.uk"

        self.info_url = "http://planning.broxtowe.gov.uk/ApplicationDetail.aspx?RefVal=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        get_response = urllib2.urlopen(self.base_url)
        get_soup = BeautifulSoup(get_response.read())

        # These are the inputs with a default value
        inputs_needed = [(x['id'], x['value']) for x in get_soup.form.findAll("input", value=True, type=lambda x: x != "submit")]

        # Add the submit button
        inputs_needed.append(('cmdWeeklyList', 'Search Database'))

        # We also need to add the date we want to search for.
        # This is the Friday after the date searched for.
        # At weekends this will get you the Friday before, but that isn't
        # a problem as there are no apps then.
        friday = search_day + datetime.timedelta(4 - search_day.weekday())

        inputs_needed.append(("ddlWeeklyList", friday.strftime(date_format)))

        # We'd like as many results as we can get away with on one page.
        # 50 is the largest option offered.
        inputs_needed.append(("ddlResultsPerPageWeeklyList", "50"))

        post_data = dict(inputs_needed)
        post_url = get_response.url

        # In case something goes wrong here, let's break out of the loop after at most 10 passes
        passes = 0

        while True:
            passes += 1

            post_response = urllib2.urlopen(post_url, urllib.urlencode(post_data))
            post_soup = BeautifulSoup(post_response.read())

            result_tables = post_soup.table.findAll("table")

            for result_table in result_tables:
                application = PlanningApplication()

                application.address = ', '.join(result_table.findPrevious("b").string.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                trs = result_table.findAll("tr")

                application.council_reference = trs[0].findAll("td")[1].string.strip()
                application.date_received = datetime.datetime.strptime(trs[1].findAll("td")[1].string.strip(), date_format).date()
                application.description = trs[3].findAll("td")[1].string.strip()

                application.info_url = self.info_url % (urllib.quote(application.council_reference))

                # In order to avoid having to do a download for every app,
                # I'm setting the comment url to be the same as the info_url.
                # There is a comment page which can be got to by pressing the button.
                application.comment_url = application.info_url

                self._results.addApplication(application)

            # Which page are we on?
            page_no = int(post_soup.find("span", id="lblPageNo").b.string)
            total_pages = int(post_soup.find("span", id="lblTotalPages").b.string)

            if passes > 10 or not page_no < total_pages:
                break

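            # Getting the next page means reproducing the ASP.NET postback that
            # the "next" link fires: __EVENTTARGET names the control (hlbNext),
            # and the __VIEWSTATE and __EVENTVALIDATION tokens scraped from the
            # page we just fetched have to be posted back along with it.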
            post_data = [
                ("__EVENTTARGET", "hlbNext"),
                ("__EVENTARGUMENT", ""),
                ("__VIEWSTATE", post_soup.find("input", id="__VIEWSTATE")['value']),
                ("__EVENTVALIDATION", post_soup.find("input", id="__EVENTVALIDATION")['value']),
            ]

            post_url = urlparse.urljoin(post_response.url, post_soup.find("form")['action'])

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = BroxtoweParser()
    print parser.getResults(3,10,2008)

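A note on the weekly-list date above: the expression search_day + datetime.timedelta(4 - search_day.weekday()) gives the Friday of the week containing the requested date (Monday through Friday), and the preceding Friday for a weekend date, as the comment in the scraper says. A minimal standalone check of that expression, using only the standard datetime module (not part of the scraper itself):

import datetime

def friday_for(search_day):
    # Same expression as in BroxtoweParser.getResultsByDayMonthYear:
    # Monday is 0 and Friday is 4, so weekdays land on the Friday of the
    # same week, and Saturday/Sunday land on the Friday just gone.
    return search_day + datetime.timedelta(4 - search_day.weekday())

# Wednesday 1 Oct 2008 falls in the weekly list dated Friday 3 Oct 2008,
# the same date the module's __main__ block queries.
assert friday_for(datetime.date(2008, 10, 1)) == datetime.date(2008, 10, 3)
# A Saturday maps back to the preceding Friday (no applications at weekends).
assert friday_for(datetime.date(2008, 10, 4)) == datetime.date(2008, 10, 3)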

+1 -0  python_scrapers/OtherFilesToCopy.csv

@@ -65,3 +65,4 @@
"Leicestershire.py", "420"
"Cairngorms.py", "420"
"Calderdale.py", "420"
"Broxtowe.py", "420"

+1 -0  python_scrapers/SitesToGenerate.csv

@@ -271,3 +271,4 @@
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
