Selaa lähdekoodia

add the generated scrapers for fastweb sites

master
duncan.parkes 17 vuotta sitten
vanhempi
commit
7df8e7ea93
11 muutettua tiedostoa jossa 441 lisäystä ja 13 poistoa
  1. +29
    -0
      cgi-bin/Craven.cgi
  2. +29
    -0
      cgi-bin/Eastleigh.cgi
  3. +29
    -0
      cgi-bin/Eden.cgi
  4. +207
    -0
      cgi-bin/FastWeb.py
  5. +29
    -0
      cgi-bin/Mansfield.cgi
  6. +1
    -0
      cgi-bin/PlanningUtils.py
  7. +29
    -0
      cgi-bin/South Lakeland.cgi
  8. +29
    -0
      cgi-bin/Sutton.cgi
  9. +29
    -0
      cgi-bin/Welwyn-Hatfield.cgi
  10. +29
    -0
      cgi-bin/Wyre Forest.cgi
  11. +1
    -13
      python_scrapers/FastWeb.py

+ 29
- 0
cgi-bin/Craven.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Craven District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Craven District Council"
authority_short_name = "Craven"
base_url = "http://www.planning.cravendc.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Eastleigh.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Eastleigh Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Eastleigh Borough Council"
authority_short_name = "Eastleigh"
base_url = "http://www.eastleigh.gov.uk/FastWEB/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Eden.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Eden District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Eden District Council"
authority_short_name = "Eden"
base_url = "http://eforms.eden.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 207
- 0
cgi-bin/FastWeb.py Näytä tiedosto

@@ -0,0 +1,207 @@

import urllib2
import HTMLParser
import urlparse
import datetime

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

# URL templates for a FastWeb site, all resolved relative to a council's
# base_url.  The search URL takes integer day/month/year plus a 1-based
# "scroll" (page) number; note the doubled %% which survives the first
# %-substitution as a literal %2F (an escaped "/") in the dates.
# example url
# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007

search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

# for testing paging
#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

# Per-application pages, keyed by the application's AltRef (App No.).
comment_url_end = "comment.asp?AltRef=%s"
info_url_end = "detail.asp?AltRef=%s"

class FastWeb:
    """Scraper for planning applications from councils running FastWeb.

    Fetches the results pages for a given received date and accumulates
    the parsed applications in a PlanningAuthorityResults object, which
    getResults() renders to XML.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        # Full and short names of the council, used to label the results.
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        # Root of the council's FastWeb installation; all request URLs
        # are resolved relative to this.
        self.base_url = base_url

        self.debug = debug

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch all applications received on the given date.

        day, month and year are ints.  Returns the accumulated
        PlanningAuthorityResults object.
        """
        requested_date = datetime.date(year, month, day)

        # The site shows 20 applications per "scroll" (page), so we keep
        # requesting pages, incrementing the scroll parameter, until
        # scroll * 20 covers the total the first page reported.
        scroll = 0
        first_time = True
        number_of_results = 0

        while first_time or scroll * 20 < number_of_results:
            scroll += 1
            this_search_url = search_form_url_end % {"scroll": scroll, "day": day, "month": month, "year": year}
            url = urlparse.urljoin(self.base_url, this_search_url)
            response = urllib2.urlopen(url)

            contents = response.read()

            if first_time:
                # The returned URL tells us if there were no results:
                # the site sends us back to the search page.
                # example URL of no results page:
                # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
                returned_url = response.geturl()
                if returned_url.count("search.asp"):
                    # We got back the search page - no results for this date.
                    break

            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
            results_page_parser.feed(contents)

            if first_time:
                # The parser reports None when the page carried no
                # RecCount input; treat that as "no further pages"
                # rather than crashing on int + None.
                if results_page_parser.number_of_results is not None:
                    number_of_results += results_page_parser.number_of_results
                first_time = False

        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: accepts string date parts, returns XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()



# States for FastWebResultsPageParser's scraping state machine.  The
# parser advances through these as it recognises landmarks in the
# results page; FINISHED means the end of the results has been seen.

STARTING = 1             # nothing recognised yet; watching for the first <input>
GOT_RESULTS_COUNT = 2    # passed the <input> carrying RecCount; watching for the results <table>
IN_RESULTS_TABLE = 3     # inside the outer results table
IN_RESULTS_TABLE_TD = 4  # inside a <td> of the outer table
IN_INNER_TABLE = 5       # inside a per-application inner table
FINISHED = -1            # outer table's cell closed; no more applications


class FastWebResultsPageParser(HTMLParser.HTMLParser):
    """State-machine parser for one FastWeb search results page.

    Each application found is appended to the supplied results object,
    and the page's reported total (from the hidden RecCount input) is
    recorded in self.number_of_results.
    """

    def __init__(self, results, requested_date, base_url):
        # Shared PlanningAuthorityResults object that applications go into.
        self.results = results
        # The date the search was run for; stamped on every application.
        self.requested_date = requested_date
        self.base_url = base_url

        HTMLParser.HTMLParser.__init__(self)

        # Total result count the site reports, or None until it is seen.
        self.number_of_results = None

        self._state = STARTING
        # Counts <td>s within the current application's inner table.
        self._td_count = None
        # Buffer of text fragments seen since the last flush.
        self._data_list = []
        # The planning application currently being assembled.
        self._current_application = None

    def get_data(self, flush=True):
        """Return the buffered text joined by spaces, flushing by default."""
        joined = " ".join(self._data_list)
        if flush:
            self.flush_data()
        return joined

    def flush_data(self):
        """Discard any buffered text."""
        self._data_list = []

    def handle_starttag(self, tag, attrs):
        if self._state == STARTING and tag == "input":
            # The first <input> on the page is where RecCount lives.
            self._state = GOT_RESULTS_COUNT
            attributes = dict(attrs)
            if attributes.get("id") == "RecCount":
                self.number_of_results = int(attributes.get("value"))

        elif self._state == GOT_RESULTS_COUNT and tag == "table":
            self._state = IN_RESULTS_TABLE

        elif self._state == IN_RESULTS_TABLE and tag == "td":
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
            # Each application sits in its own inner table; start a
            # fresh application dated with the requested date.
            self._state = IN_INNER_TABLE
            self._td_count = 0
            self._current_application = PlanningApplication()
            self._current_application.date_received = self.requested_date

        elif self._state == IN_INNER_TABLE and tag == "td":
            self._td_count += 1
            self.flush_data()

    def handle_endtag(self, tag):
        if self._state == IN_INNER_TABLE and tag == "table":
            # The next if should never be false, but it pays to be careful :-)
            if self._current_application.council_reference is not None:
                self.results.addApplication(self._current_application)
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
            # The outer table's cell has closed: no more applications.
            self._state = FINISHED

        elif self._state == IN_INNER_TABLE and tag == "td":
            if self._td_count == 2:
                # Second cell holds the App No., which also keys the
                # detail and comment pages.
                reference = self.get_data().strip()
                self._current_application.council_reference = reference
                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end %(reference))
                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %(reference))
            elif self._td_count == 4:
                # Fourth cell holds the site address.
                self._current_application.address = self.get_data().strip()
                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            elif self._td_count == 7:
                # Seventh cell holds the proposal description.
                self._current_application.description = self.get_data().strip()

    def handle_data(self, data):
        self._data_list.append(data)

# for debug purposes

#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")

#eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")


#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")

#print eastleighparser.getResults(10,8,2007)
#print cravenparser.getResults(25,12,2006)
#print suttonparser.getResults(10,8,2007)

#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")

#print south_lakeland_parser.getResults(27,11,2006)


+ 29
- 0
cgi-bin/Mansfield.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Mansfield District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Mansfield District Council"
authority_short_name = "Mansfield"
base_url = "http://www.mansfield.gov.uk/Fastweb23/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 1
- 0
cgi-bin/PlanningUtils.py Näytä tiedosto

@@ -90,6 +90,7 @@ class PlanningApplication:
return self.displayXML()
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
return "<application>\n" +\
"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
"<address>%s</address>\n" %xmlQuote(self.address) +\


+ 29
- 0
cgi-bin/South Lakeland.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for South Lakeland District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "South Lakeland District Council"
authority_short_name = "South Lakeland"
base_url = "http://www.southlakeland.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Sutton.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for London Borough of Sutton.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "London Borough of Sutton"
authority_short_name = "Sutton"
base_url = "http://82.43.4.135/FASTWEB/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Welwyn-Hatfield.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Welwyn-Hatfield District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Welwyn-Hatfield District Council"
authority_short_name = "Welwyn-Hatfield"
base_url = "https://fastweb.welhat.gov.uk/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Wyre Forest.cgi Näytä tiedosto

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Wyre Forest District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Wyre Forest District Council"
authority_short_name = "Wyre Forest"
base_url = "http://www.wyreforest.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 1
- 13
python_scrapers/FastWeb.py Näytä tiedosto

@@ -56,22 +56,15 @@ class FastWeb:
url = urlparse.urljoin(self.base_url, this_search_url)
response = urllib2.urlopen(url)

#print response.info()
#print response.geturl()

contents = response.read()
#print contents

if first_time:
# We can now use the returned URL to tell us if there were no results.
returned_url = response.geturl()
#parsed_returned_url = urlparse.urlparse(returned_url)

# example URL of no results page
# http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
#print parsed_returned_url
if returned_url.count("search.asp"):
#if parsed_returned_url[4] == "search.asp?Results=none&":
# We got back the search page, there were no results for this date
break
@@ -136,7 +129,7 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):
def handle_starttag(self, tag, attrs):
if self._state == STARTING and tag == "input":
self._state = GOT_RESULTS_COUNT
#print attrs
# This is where the number of results returned is stored
attr_dict = {}
@@ -145,7 +138,6 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):
if attr_dict.get("id") == "RecCount":
self.number_of_results = int(attr_dict.get("value"))
#print self.number_of_results

elif self._state == GOT_RESULTS_COUNT and tag == "table":
self._state = IN_RESULTS_TABLE
@@ -213,7 +205,3 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):

#print south_lakeland_parser.getResults(27,11,2006)

# To do

# 3) integrate with other scrapers
# 4) other fastweb sites

Ladataan…
Peruuta
Tallenna