ソースを参照

Oops - didn't want this one.

master
duncan.parkes 17年前
コミット
dfb1c6893d
1個のファイルの変更0行の追加263行の削除
  1. +0
    -263
      CGI/SouthOxfordshire.py

+ 0
- 263
CGI/SouthOxfordshire.py ファイルの表示

@@ -1,263 +0,0 @@
#!/usr/bin/python

import cgi
import cgitb; cgitb.enable(display=0, logdir="/tmp")

import urllib, urllib2
import HTMLParser
import urlparse
import datetime, time

# This needs a page number inserting
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# This needs the council reference
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"


from PlanningUtils import fixNewlines, \
getPostcodeFromText, \
PlanningAuthorityResults, \
PlanningApplication

class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Parse South Oxfordshire's weekly planning application list page.

    One page lists a whole week of applications, so we record the requested
    date (a datetime.date) and only download the per-application info page
    for applications received on exactly that day, avoiding needless
    downloads for the other days in the week's file.
    """
    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # The date we were asked for; set by getResultsByDayMonthYear.
        self._requested_date = None

        # Count of <table> tags seen so far.
        # All the interesting stuff is in table 3.
        self._table_count = 0

        # Count of <td>s seen while inside table 3. Contents by position:
        #   1 - reference (with a link to the info page)
        #   3 - place and description
        #   5 - date received
        # (2 and 4 are just padding)
        self._td_count = 0

        # Flag: the next run of character data is the council reference.
        self._get_reference = False

        # Accumulator for character data that may arrive in several pieces.
        self._data = ''

        # The application currently being built (one per <tr>).
        self._current_application = None
        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        # Count tables so we know when we are inside table 3.
        if tag == 'table':
            self._table_count += 1
        # Other tags only matter inside table 3.
        if self._table_count == 3:
            # Each <tr> starts a new application.
            if tag == 'tr':
                self._current_application = PlanningApplication()

            if tag == 'td':
                self._td_count += 1

            # A link in the first <td> points at this application's info page.
            if tag == 'a' and self._td_count == 1:
                for key, value in attrs:
                    if key == 'href':
                        self._current_application.info_url = urlparse.urljoin(search_url, value)
                        # The next bit of data is the council reference.
                        self._get_reference = True
                        # href is the only attribute we are interested in.
                        break

    def handle_endtag(self, tag):
        # There is no need to do anything unless we are in table 3.
        if self._table_count == 3:
            # The end </tr> finishes the current application. Fetch its
            # info page (a slow network round trip) for the address,
            # postcode and description - but only if it has a reference
            # (the header row has none) and was received on the requested
            # date.
            if tag == 'tr' and \
                    self._current_application is not None and \
                    self._current_application.council_reference is not None and \
                    self._current_application.date_received == self._requested_date:
                info_page_parser = SouthOxfordshireInfoURLParser()
                info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

                self._current_application.address = info_page_parser.address
                self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
                self._current_application.description = info_page_parser.description

                # Add the current application to the results set.
                self._results.addApplication(self._current_application)

            # At the end of the 5th <td>, self._data holds the received
            # date of the application; parse it and reset the per-row state.
            if tag == 'td' and self._td_count == 5:
                app_year, app_month, app_day = time.strptime(self._data, "%d %B %Y")[:3]
                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
                self._data = ''
                self._td_count = 0

    def handle_data(self, data):
        # There is no need to do anything if we aren't in table 3.
        if self._table_count == 3:
            # In the first <td>, once we have seen the link, the next data
            # is the council reference.
            if self._td_count == 1 and self._get_reference:
                self._current_application.council_reference = data

                # The comment url depends only on the reference.
                # On this site, the link to the comment page is only
                # displayed once the planning authority has decided who is
                # handling this application and has opened consultations.
                # The link below works straight away, and also works for
                # apps for which the consultation period is over. I have no
                # idea if anything is actually done with these comments if
                # it is followed too early...
                self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}

                # Set the get_reference flag back to False.
                self._get_reference = False

            # The 5th <td>'s data may arrive in pieces; collect it all
            # together here - it is actually processed in handle_endtag.
            if self._td_count == 5:
                self._data += data

    def handle_entityref(self, ref):
        # We might have some entity_refs to clear up; there is no need to
        # bother unless we are in the date cell of the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '

    def getResultsByDayMonthYear(self, day, month, year):
        """Return a PlanningAuthorityResults object containing the
        applications received on the date passed in."""
        today = datetime.date.today()
        # Bug fix: this used to be assigned to self.requested_date (no
        # underscore), so the comparison against self._requested_date in
        # handle_endtag never matched and no applications were returned.
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # Each page covers one week, page 1 being the current week, so the
        # page we need is (days ago // 7) + 1.
        page_number = delta.days/7 + 1

        response = urllib2.urlopen(search_url %page_number)

        self.feed(response.read())

        return self._results

    def getResults(self, day, month, year):
        """CGI entry point: return the results for the given date as XML."""
        # Bug fix: the call was missing "self." and raised a NameError.
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """Pull the description and address out of the info page for a
    South Oxfordshire planning application.

    The page lays its fields out in <td> cells: a cell whose text is
    'Description' or 'Location' is a label, and the cell that closes
    after it holds the corresponding value.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Results, filled in as the page is parsed.
        self.address = None
        self.description = None

        # Per-field state machine:
        #   0 - label not seen yet
        #   1 - label seen; the next closed <td> holds the value
        #   2 - value captured, nothing more to do
        self._address_state = 0
        self._description_state = 0

        # True while we are inside a <td>.
        self._in_td = False

        # Accumulates character data, which can arrive in several pieces.
        self._text = ''

    def _wanted(self):
        # Is there still at least one field left to capture?
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        # A new <td> while we still need data: start collecting afresh.
        if tag == 'td' and self._wanted():
            self._in_td = True
            self._text = ''

    def handle_endtag(self, tag):
        if tag != 'td' or not self._wanted():
            return

        if self._description_state == 1:
            # This cell held the description; capture it and finish.
            self.description = self._text
            self._description_state = 2
        elif self._address_state == 1:
            # This cell held the address; capture it and finish.
            self.address = self._text
            self._address_state = 2
        else:
            # Otherwise the cell may be one of the labels we care about.
            label = self._text.strip()
            if label == 'Description':
                self._description_state = 1
            elif label == 'Location':
                self._address_state = 1

        # We are leaving the <td>.
        self._in_td = False

    def handle_data(self, data):
        # Collect data only inside a <td>, and only while something is
        # still wanted.
        if self._in_td and self._wanted():
            self._text += data


# TODO

# find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.

# Can we check that scraped email address really is
# an email address?

if __name__ == "__main__":
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

parser = SouthOxfordshireParser()

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

読み込み中…
キャンセル
保存