Переглянути джерело

Adding scraper for Halton.

Also adding the pycurl scraper for Westminster, just in case it is useful to remind us how to do stuff later.
master
duncan.parkes 16 роки тому
джерело
коміт
1510528a8a
4 змінених файлів з 306 додано та 0 видалено
  1. +134
    -0
      python_scrapers/Halton.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv
  4. +170
    -0
      python_scrapers/Westminster_pycurl.py

+ 134
- 0
python_scrapers/Halton.py Переглянути файл

@@ -0,0 +1,134 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi


import cookielib

# Module-level cookie jar shared by every request this scraper makes.
# Cookies are carried between requests because the site's pagination
# depends on them.
cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the DateApValFrom / DateApValTo search fields (dd/mm/yyyy).
# The commented-out line below is a dash-separated alternative, kept for reference.
#date_format = "%d-%m-%Y"
date_format = "%d/%m/%Y"
# strptime format used to parse the "Date Received" cell of each result
# (day, full month name, four-digit year).
received_date_format = "%d %B %Y"

import re

# We're going to use this for a re.split.
# The separator is a whitespace char, then "of" or "at" (case independent),
# and then another whitespace char.
# The alternation is grouped so that the surrounding whitespace applies to
# both words; ungrouped, the pattern means (whitespace + "of") OR
# ("at" + whitespace), which also matches inside words such as "Flat ".
address_finder_re = re.compile(r"\s(?:of|at)\s", re.I)

class HaltonParser:
    """Screen scraper for the Halton Borough Council planning search.

    Searches are POSTed to ``base_url``; every application found is added
    to a single PlanningAuthorityResults object that is created in
    ``__init__`` and therefore accumulates across calls on one instance.
    """

    def __init__(self, *args):
        """Set up the authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Halton Borough Council"
        self.authority_short_name = "Halton"
        self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    # Example of the query string the site's own search form produces:
    #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape every application for the given day, following pagination.

        day, month and year are integers. Returns the
        PlanningAuthorityResults object held on this instance, with one
        PlanningApplication added per case found.
        """
        search_day = datetime.date(year, month, day)
        day_after = search_day + datetime.timedelta(1)

        # The search range runs from the requested day to the following day.
        # NOTE(review): the original comment here was cut off ("It seems
        # dates are interpreted as midnight on"); presumably the site treats
        # each date as midnight at the start of that day - TODO confirm.
        #
        # Fields the form also offers but which are deliberately not sent:
        # CaseNo, AppName, AdrsNo, StName, StTown, DateAppealValFrom,
        # DateAppealValTo.
        post_data = urllib.urlencode(
            [
                ("DateApValFrom", search_day.strftime(date_format)),
                ("DateApValTo", day_after.strftime(date_format)),
                # "0" rather than a formatted date (search_day.strftime(date_format)).
                ("DropWeekDate", "0"),
                ("DropAppealStatus", "0"),
                ("PageSize", "10"),
                ("Action", "Search"),
            ]
        )

        request = urllib2.Request(self.base_url, post_data)

        while request is not None:
            soup = self._fetch_soup(request)

            # This should find us each Case on the current page.
            for caseno_string in soup.findAll(text="Case No:"):
                self._results.addApplication(self._parse_application(caseno_string))

            # Carry on with the next page, if there is one.
            request = self._next_page_request(soup)

        return self._results

    def _fetch_soup(self, request):
        """Perform the request with cookies attached and return the parsed page."""
        # We need to deal with cookies, since pagination depends on them.
        cookie_jar.add_cookie_header(request)
        response = urllib2.urlopen(request)
        try:
            cookie_jar.extract_cookies(response, request)
            page = response.read()
        finally:
            # Release the connection even if reading fails.
            response.close()

        return BeautifulSoup(page)

    def _parse_application(self, caseno_string):
        """Build a PlanningApplication from the cells following a "Case No:" label."""
        application = PlanningApplication()

        application.council_reference = caseno_string.findNext("td").string
        application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()

        received_text = caseno_string.findNext(text="Date Received").findNext("td").string
        application.date_received = datetime.datetime.strptime(received_text, received_date_format).date()

        # The address here is included in the description. We'll have to do
        # some heuristics to try to work out where it starts.
        # As a first go, we split the description on the separator and take
        # whatever follows its last occurrence. re.split always returns at
        # least one element, so when no separator is found this is simply
        # the whole description again, which is better than nothing.
        application.address = re.split(address_finder_re, application.description)[-1].strip()

        # We may as well get the postcode from the description rather than
        # the address, in case things have gone wrong.
        application.postcode = getPostcodeFromText(application.description)

        application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])

        # There is no way to link to a specific app, so the info url is just
        # the search page.
        application.info_url = self.base_url

        return application

    def _next_page_request(self, soup):
        """Return the Request for the next results page, or None on the last page."""
        # Find the form with id "formNext", if there is one.
        next_form = soup.find("form", id="formNext")
        if next_form is None:
            return None

        action = next_form['action']
        # The HTML is borked - the inputs are outside the form, they are all
        # in a td which follows it.
        inputs = next_form.findNext("td").findAll("input")
        post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
        return urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)

    def getResults(self, day, month, year):
        """Scrape the given day and return displayXML() of the results.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape 4 August 2008 and print the result.
    halton_parser = HaltonParser()
    print(halton_parser.getResults(4, 8, 2008))


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Переглянути файл

@@ -52,3 +52,4 @@
"Hounslow.py", "420"
"Harrow.py", "420"
"Westminster.py", "420"
"Halton.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Переглянути файл

@@ -256,3 +256,4 @@
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
"London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"

+ 170
- 0
python_scrapers/Westminster_pycurl.py Переглянути файл

@@ -0,0 +1,170 @@
"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess system underneath all this,
but it only contains the applications on which they are accepting comments,
so I think we may as well use this url and get the lot...

This is the PublicAccess url:
http://publicaccess.westminster.gov.uk/publicaccess/
"""

import urllib
import urlparse

import pycurl
import StringIO

import datetime, time
import cgi

import sys

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the search dates. "%%" yields a literal "%", so the
# output is dd%2Fmm%2Fyyyy: the slashes are already URL-encoded because the
# value is interpolated straight into a hand-built POST body.
date_format = "%d%%2F%m%%2F%Y"

class WestminsterParser:
    """Screen scraper for Westminster planning applications, using pycurl.

    Search results are fetched by POSTing to ``base_url``; each result's
    info page is then fetched to look for a comment link. Applications are
    added to a single PlanningAuthorityResults object created in
    ``__init__``, so they accumulate across calls on one instance.
    Progress messages are written to stderr throughout.
    """

    def __init__(self, *args):
        """Set up the authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape every application received on the given day.

        day, month and year are integers. Returns the
        PlanningAuthorityResults object held on this instance, with one
        PlanningApplication added per result link found.
        """
        search_day = datetime.date(year, month, day)

        # The first POST body is written out by hand, with the search day
        # used as both ends of the received-date range (DTErec / DTErecTo).
        # date_format already produces URL-encoded slashes.
        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}

        while post_data:

            # Now get the search page

            sys.stderr.write("Fetching: %s\n" %self.base_url)
            sys.stderr.write("post data: %s\n" %post_data)

            # This gives us something to use as the callback
            fakefile = StringIO.StringIO()

            # A fresh handle is created for every results page; the
            # try/finally makes sure it is released again, even on error.
            curlobj = pycurl.Curl()
            try:
                curlobj.setopt(pycurl.URL, self.base_url)
                curlobj.setopt(pycurl.POST, True)
                curlobj.setopt(pycurl.POSTFIELDS, post_data)
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
                curlobj.setopt(pycurl.FOLLOWLOCATION, True)
                curlobj.setopt(pycurl.MAXREDIRS, 10)

                curlobj.perform()

                sys.stderr.write("Got it\n")
                soup = BeautifulSoup(fakefile.getvalue())

                # We may as well free up the memory used by fakefile
                fakefile.close()

                sys.stderr.write("Created soup\n")

                results_form = soup.find("form", {"name": "currentsearchresultsNext"})

                # Sort out the post_data for the next page, if there is one.
                # If there is no next page then there will be no inputs in
                # the form. In this case, post_data will be '', which is
                # false, and the while loop ends after this page.

                sys.stderr.write("Found form containing results\n")

                post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

                sys.stderr.write("Got post data\n")

                # Each result has one link, and they are the only links in the form

                links = results_form.findAll("a")

                sys.stderr.write("Got list of links\n")

                for link in links:

                    sys.stderr.write("Working on link: %s\n" %link['href'])

                    application = PlanningApplication()

                    application.date_received = search_day
                    application.info_url = urlparse.urljoin(self.base_url, link['href'])
                    application.council_reference = link.string.strip()

                    application.address = link.findNext("td").string.strip()
                    application.postcode = getPostcodeFromText(application.address)

                    application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                    # To get the comment url, we're going to have to go to each info url :-(

                    sys.stderr.write("Fetching: %s\n" %application.info_url)

                    fakefile = StringIO.StringIO()

                    # Reuse this page's handle, switched from POST to GET.
                    curlobj.setopt(pycurl.HTTPGET, True)
                    curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                    # We have to convert the info url to ascii for curl
                    curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

                    curlobj.perform()

                    sys.stderr.write("Got it\n")

                    info_soup = BeautifulSoup(fakefile.getvalue())

                    fakefile.close()

                    # The comment link is found by its text; if the page has
                    # none, a placeholder string is stored instead of a url.
                    comment_nav_string = info_soup.find(text="Comment on this case")
                    if comment_nav_string:
                        application.comment_url = comment_nav_string.parent['href']
                    else:
                        application.comment_url = "No Comments"

                    # Example comment url:
                    #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                    self._results.addApplication(application)

                    sys.stderr.write("Finished that link\n")
            finally:
                curlobj.close()

        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results

    def getResults(self, day, month, year):
        """Scrape the given day and return displayXML() of the results.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape 1 August 2008 and print the result.
    westminster_parser = WestminsterParser()
    print(westminster_parser.getResults(1, 8, 2008))


Завантаження…
Відмінити
Зберегти