Ver a proveniência

Adding scraper for Halton.

Also adding the pycurl scraper for Westminster, just in case it is useful to remind us how to do stuff later.
import/raw
duncan.parkes há 16 anos
ascendente
cometimento
420356966c
4 ficheiros alterados com 306 adições e 0 eliminações
  1. +134
    -0
      trunk/python_scrapers/Halton.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv
  4. +170
    -0
      trunk/python_scrapers/Westminster_pycurl.py

+ 134
- 0
trunk/python_scrapers/Halton.py Ver ficheiro

@@ -0,0 +1,134 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi


import cookielib

# Module-level cookie jar shared by every request this scraper makes;
# the site's pagination depends on cookies being carried between requests.
cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the dates sent in the search form
# (the DateApValFrom / DateApValTo fields).
#date_format = "%d-%m-%Y"
date_format = "%d/%m/%Y"
# strptime format for the "Date Received" cell of each result:
# day of month, full month name, four-digit year.
received_date_format = "%d %B %Y"

import re

# We're going to use this for a re.split
# A whitespace char, "of" or "at" (case independent), and then a whitespace char.
# The alternation has to sit inside a single group: written as
# "\s(?:of)|(?:at)\s" it means "whitespace + of" OR "at + whitespace", which
# splits inside words such as "Flat " or " Offley" and mangles the address.
address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)

class HaltonParser:
    """Screen scraper for the Halton Borough Council planning search.

    Applications found by getResultsByDayMonthYear are added to a single
    PlanningAuthorityResults object created in __init__, so repeated calls
    on the same instance accumulate into the same results object.
    """

    def __init__(self, *args):
        """Set up the authority details; any positional arguments are ignored."""
        self.authority_name = "Halton Borough Council"
        self.authority_short_name = "Halton"
        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    # Example of the post data for a search on this site:
    #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every results page for the given date and collect the applications.

        day, month and year are integers. Returns the PlanningAuthorityResults
        object held by this instance, with the applications found added to it.
        """
        search_day = datetime.date(year, month, day)

        # It seems dates are interpreted as midnight at the start of the day,
        # so the search runs from the requested day up to the following day.
        # NOTE(review): inferred from the one-day-later "DateApValTo" below - confirm.
        post_data = urllib.urlencode(
            [
                ("DateApValFrom", search_day.strftime(date_format)),
                ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
                ("DropWeekDate", "0"),
                ("DropAppealStatus", "0"),
                ("PageSize", "10"),
                ("Action", "Search"),
            ]
        )

        request = urllib2.Request(self.base_url, post_data)

        while request is not None:
            # We need to deal with cookies, since pagination depends on them.
            cookie_jar.add_cookie_header(request)
            response = urllib2.urlopen(request)
            try:
                cookie_jar.extract_cookies(response, request)
                soup = BeautifulSoup(response.read())
            finally:
                # Don't leave the connection open while we parse and paginate.
                response.close()

            # Each "Case No:" label marks one application on the current page.
            for caseno_string in soup.findAll(text="Case No:"):
                self._results.addApplication(self._parse_application(caseno_string))

            request = self._next_page_request(soup)

        return self._results

    def _parse_application(self, caseno_string):
        """Build a PlanningApplication from the "Case No:" text node of one result."""
        application = PlanningApplication()

        application.council_reference = caseno_string.findNext("td").string
        application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()

        received_text = caseno_string.findNext(text="Date Received").findNext("td").string
        application.date_received = datetime.datetime.strptime(received_text, received_date_format).date()

        # The address is only available as part of the description, so as a
        # heuristic take whatever follows the last " of " or " at ".
        # re.split always returns at least one element, so when neither word
        # is present this simply yields the whole description again.
        application.address = re.split(address_finder_re, application.description)[-1].strip()

        # Get the postcode from the description rather than the address,
        # in case the address heuristic has gone wrong.
        application.postcode = getPostcodeFromText(application.description)

        application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])

        # There is no way to link to a specific application, so the info url
        # is just the search page.
        application.info_url = self.base_url

        return application

    def _next_page_request(self, soup):
        """Return the Request for the next results page, or None on the last page."""
        next_form = soup.find("form", id="formNext")
        if next_form is None:
            return None

        action = next_form['action']
        # The HTML is borked - the inputs are outside the form, they are all
        # in a td which follows it.
        inputs = next_form.findNext("td").findAll("input")
        post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
        return urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (4 August 2008) and dump the result.
    parser = HaltonParser()
    print(parser.getResults(4, 8, 2008))


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Ver ficheiro

@@ -52,3 +52,4 @@
"Hounslow.py", "420"
"Harrow.py", "420"
"Westminster.py", "420"
"Halton.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Ver ficheiro

@@ -256,3 +256,4 @@
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
"London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"

+ 170
- 0
trunk/python_scrapers/Westminster_pycurl.py Ver ficheiro

@@ -0,0 +1,170 @@
"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess underneath all this, but
it only has the apps in for which they are accepting comments, so I think
we may as well use this url and get the lot...

This is the PublicAccess url:
http://publicaccess.westminster.gov.uk/publicaccess/
"""

import urllib
import urlparse

import pycurl
import StringIO

import datetime, time
import cgi

import sys

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime turns each "%%" into a literal "%", so a date comes out as
# DD%2FMM%2FYYYY - i.e. DD/MM/YYYY with the slashes already percent-encoded,
# ready to be dropped into the hand-built POST body.
date_format = "%d%%2F%m%%2F%Y"

class WestminsterParser:
    """Screen scraper for the Westminster planning search, using pycurl.

    Applications found by getResultsByDayMonthYear are added to a single
    PlanningAuthorityResults object created in __init__, so repeated calls
    on the same instance accumulate into the same results object.
    Progress messages are written to stderr throughout.
    """

    def __init__(self, *args):
        """Set up the authority details; any positional arguments are ignored."""
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every results page for the given date and collect the applications.

        day, month and year are integers. Each result also triggers a fetch
        of its info page in order to find the comment url. Returns the
        PlanningAuthorityResults object held by this instance.
        """
        search_day = datetime.date(year, month, day)

        # The first request body is written out by hand rather than built
        # with urllib.urlencode: the date is formatted with its slashes
        # already percent-encoded, and "Start+Search" is already encoded too.
        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

        while post_data:
            # Now get the search page
            sys.stderr.write("Fetching: %s\n" %self.base_url)
            sys.stderr.write("post data: %s\n" %post_data)

            curlobj = pycurl.Curl()
            try:
                curlobj.setopt(pycurl.URL, self.base_url)
                curlobj.setopt(pycurl.POST, True)
                curlobj.setopt(pycurl.POSTFIELDS, post_data)
                curlobj.setopt(pycurl.FOLLOWLOCATION, True)
                curlobj.setopt(pycurl.MAXREDIRS, 10)

                page = self._perform(curlobj)

                sys.stderr.write("Got it\n")
                soup = BeautifulSoup(page)
                sys.stderr.write("Created soup\n")

                results_form = soup.find("form", {"name": "currentsearchresultsNext"})

                sys.stderr.write("Found form containing results\n")

                # Sort out the post_data for the next page, if there is one.
                # If there is no next page then there will be no inputs in the
                # form. In this case, post_data will be '', which is false.
                post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

                sys.stderr.write("Got post data\n")

                # Each result has one link, and they are the only links in the form
                links = results_form.findAll("a")

                sys.stderr.write("Got list of links\n")

                for link in links:
                    self._results.addApplication(self._parse_application(link, search_day, curlobj))
                    sys.stderr.write("Finished that link\n")
            finally:
                # A new handle is created for every results page, so release
                # this one rather than leaking it.
                curlobj.close()

        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results

    def _perform(self, curlobj):
        """Run the transfer configured on curlobj and return the response body."""
        # This gives us something to use as the write callback.
        fakefile = StringIO.StringIO()
        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
        try:
            curlobj.perform()
            return fakefile.getvalue()
        finally:
            # We may as well free up the memory used by fakefile
            fakefile.close()

    def _parse_application(self, link, search_day, curlobj):
        """Build a PlanningApplication from one result link.

        link is the <a> element of the result, search_day the date searched
        for, and curlobj the handle used for the results page, which is
        reused here to GET the application's info page.
        """
        sys.stderr.write("Working on link: %s\n" %link['href'])

        application = PlanningApplication()

        application.date_received = search_day
        application.info_url = urlparse.urljoin(self.base_url, link['href'])
        application.council_reference = link.string.strip()

        application.address = link.findNext("td").string.strip()
        application.postcode = getPostcodeFromText(application.address)

        application.description = link.findNext("tr").findAll("td")[-1].string.strip()

        # To get the comment url, we're going to have to go to each info url :-(
        sys.stderr.write("Fetching: %s\n" %application.info_url)

        curlobj.setopt(pycurl.HTTPGET, True)
        # We have to convert the info url to ascii for curl
        curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

        info_page = self._perform(curlobj)

        sys.stderr.write("Got it\n")

        info_soup = BeautifulSoup(info_page)

        comment_nav_string = info_soup.find(text="Comment on this case")
        if comment_nav_string:
            # Example of a comment url:
            #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
            application.comment_url = comment_nav_string.parent['href']
        else:
            application.comment_url = "No Comments"

        return application

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (1 August 2008) and dump the result.
    parser = WestminsterParser()
    print(parser.getResults(1, 8, 2008))


Carregando…
Cancelar
Guardar