duncan.parkes 17 years ago
Parent commit 8487a10a7d
6 changed files with 233 additions and 5 deletions
  1. python_scrapers/FastWeb.py (+219 -0)
  2. python_scrapers/OtherFilesToCopy.csv (+1 -0)
  3. python_scrapers/PlanningUtils.py (+1 -0)
  4. python_scrapers/PublicAccessSites.csv (+9 -1)
  5. python_scrapers/README (+2 -3)
  6. python_scrapers/generateCGIScripts.py (+1 -1)

python_scrapers/FastWeb.py (+219 -0)

@@ -0,0 +1,219 @@

import urllib2
import HTMLParser
import urlparse
import datetime

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

# example url
# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007

search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

# for testing paging
#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

comment_url_end = "comment.asp?AltRef=%s"
info_url_end = "detail.asp?AltRef=%s"


class FastWeb:
    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        requested_date = datetime.date(year, month, day)

        # What we should do:
        # 1) Work out if the page we get back is a results page or the
        #    search page again. The search page indicates no results for
        #    this day.
        # Assuming we have a results page:
        # 2) Get the total number of results out of it. We can use this
        #    to work out how many times we need to request the page, and
        #    with what scroll numbers.
        # 3) Iterate over scroll numbers.

        scroll = 0
        first_time = True
        number_of_results = 0

        # Results are paged 20 to a screen; "Scroll" selects the page.
        while first_time or scroll * 20 < number_of_results:
            scroll += 1
            this_search_url = search_form_url_end % {"scroll": scroll, "day": day, "month": month, "year": year}
            url = urlparse.urljoin(self.base_url, this_search_url)
            response = urllib2.urlopen(url)
            contents = response.read()

            if first_time:
                # We can now use the returned URL to tell us if there were no results.
                # example URL of no results page:
                # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
                returned_url = response.geturl()
                if returned_url.count("search.asp"):
                    # We got back the search page: there were no results for this date.
                    break

            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
            results_page_parser.feed(contents)

            if first_time:
                number_of_results += results_page_parser.number_of_results
                first_time = False

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


# States for FastWebResultsPageParser

STARTING = 1
GOT_RESULTS_COUNT = 2
IN_RESULTS_TABLE = 3
IN_RESULTS_TABLE_TD = 4
IN_INNER_TABLE = 5
FINISHED = -1


class FastWebResultsPageParser(HTMLParser.HTMLParser):
    def __init__(self, results, requested_date, base_url):
        self.results = results
        self.requested_date = requested_date
        self.base_url = base_url

        HTMLParser.HTMLParser.__init__(self)

        # We'll use this to store the number of results returned for this search
        self.number_of_results = None

        self._state = STARTING
        self._td_count = None

        self._data_list = []

        # This will store the planning application we are currently working on.
        self._current_application = None

    def get_data(self, flush=True):
        data = " ".join(self._data_list)
        if flush:
            self.flush_data()
        return data

    def flush_data(self):
        self._data_list = []

    def handle_starttag(self, tag, attrs):
        if self._state == STARTING and tag == "input":
            self._state = GOT_RESULTS_COUNT

            # This is where the number of results returned is stored
            attr_dict = {}
            for attr_name, attr_value in attrs:
                attr_dict[attr_name] = attr_value
            if attr_dict.get("id") == "RecCount":
                self.number_of_results = int(attr_dict.get("value"))

        elif self._state == GOT_RESULTS_COUNT and tag == "table":
            self._state = IN_RESULTS_TABLE

        elif self._state == IN_RESULTS_TABLE and tag == "td":
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
            self._state = IN_INNER_TABLE
            self._td_count = 0
            self._current_application = PlanningApplication()
            self._current_application.date_received = self.requested_date

        elif self._state == IN_INNER_TABLE and tag == "td":
            self._td_count += 1
            self.flush_data()

    def handle_endtag(self, tag):
        if self._state == IN_INNER_TABLE and tag == "table":
            # The next if should never be false, but it pays to be careful :-)
            if self._current_application.council_reference is not None:
                self.results.addApplication(self._current_application)
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
            self._state = FINISHED

        elif self._state == IN_INNER_TABLE and tag == "td":
            if self._td_count == 2:
                # This data is the App No.
                council_reference = self.get_data().strip()
                self._current_application.council_reference = council_reference

                # This also gives us everything we need for the info and comment urls
                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end % council_reference)
                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end % council_reference)
            elif self._td_count == 4:
                # This data is the address
                self._current_application.address = self.get_data().strip()
                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            elif self._td_count == 7:
                # This data is the description
                self._current_application.description = self.get_data().strip()

    def handle_data(self, data):
        self._data_list.append(data)


# for debug purposes

#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")
#eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")
#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")
#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")

#print eastleighparser.getResults(10,8,2007)
#print cravenparser.getResults(25,12,2006)
#print suttonparser.getResults(10,8,2007)
#print south_lakeland_parser.getResults(27,11,2006)

# To do
# 3) integrate with other scrapers
# 4) other fastweb sites
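
For reference, a runnable version of the commented-out debug examples above — a minimal sketch using the Craven base URL from the comments (any of the FastWeb base URLs added to PublicAccessSites.csv below would work the same way):

    craven = FastWeb("Craven District Council",
                     "Craven",
                     "http://www.planning.cravendc.gov.uk/fastweb/")

    # Fetch the applications received on 10 August 2007 and print them as XML.
    print craven.getResults(10, 8, 2007)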

python_scrapers/OtherFilesToCopy.csv (+1 -0)

@@ -6,3 +6,4 @@
"ApplicationSearchServletParser.py", "420"
"AcolnetParser.py", "420"
"MultipartPostHandler.py", "420"
"FastWeb.py", "420"

python_scrapers/PlanningUtils.py (+1 -0)

@@ -90,6 +90,7 @@ class PlanningApplication:
         return self.displayXML()
 
     def displayXML(self):
+        #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
         return "<application>\n" +\
             "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
             "<address>%s</address>\n" %xmlQuote(self.address) +\


python_scrapers/PublicAccessSites.csv (+9 -1)

@@ -112,4 +112,12 @@
"South Bedfordshire District Council", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "SouthBedfordshireParser"
"Suffolk Coastal District Council", "Suffolk Coastal", "https://apps3.suffolkcoastal.gov.uk/planningonline/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SuffolkCoastalParser"
"Surrey Heath Borough Council", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SurreyHeathParser"
"New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestDCParser"
"New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestDCParser"
"Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/", "FastWeb", "FastWeb"
"Eastleigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/", "FastWeb", "FastWeb"
"Eden District Council", "Eden", "http://eforms.eden.gov.uk/fastweb/", "FastWeb", "FastWeb"
"Mansfield District Council", "Mansfield", "http://www.mansfield.gov.uk/Fastweb23/", "FastWeb", "FastWeb"
"South Lakeland District Council", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/", "FastWeb", "FastWeb"
"London Borough of Sutton", "Sutton", "http://82.43.4.135/FASTWEB/", "FastWeb", "FastWeb"
"Welwyn-Hatfield District Council", "Welwyn-Hatfield", "https://fastweb.welhat.gov.uk/", "FastWeb", "FastWeb"
"Wyre Forest District Council", "Wyre Forest", "http://www.wyreforest.gov.uk/fastweb/", "FastWeb", "FastWeb"

python_scrapers/README (+2 -3)

@@ -1,10 +1,9 @@
-In order to generate the contents of the CGI directory (../CGI)
+In order to generate the contents of the CGI directory (../cgi-bin/)
 run
 
 ./createCGI.sh
 
-This script svn deletes the old CGI directory's contents,
-generates new cgi files in the CGI directory,
+This script generates new cgi files in the CGI directory,
 copies in some other files that are needed,
 and commits all these changes to svn.



python_scrapers/generateCGIScripts.py (+1 -1)

@@ -1,6 +1,6 @@
 #!/usr/local/bin/python
 
-list_of_sites_filename = "PublicAccessSites.csv"
+list_of_sites_filename = "SitesToGenerate.csv"
 other_files_to_copy_filename = "OtherFilesToCopy.csv"
 template_filename = "CGITemplate"
 python_location = "/usr/local/bin/python"
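
These module-level names suggest the shape of the generation step: read the sites CSV, fill in the CGITemplate once per site, and write out an executable CGI script per authority. A minimal sketch under those assumptions (the placeholder names are hypothetical; the real substitution keys live in the CGITemplate file):

    import csv
    import os

    template = open(template_filename).read()

    for name, short_name, base_url, module, klass in \
            csv.reader(open(list_of_sites_filename), skipinitialspace=True):
        cgi_filename = short_name + ".cgi"
        cgi_file = open(cgi_filename, "w")
        # Hypothetical placeholders; the actual ones are defined in CGITemplate.
        cgi_file.write(template % {"python_location": python_location,
                                   "module": module,
                                   "class": klass,
                                   "authority_name": name,
                                   "authority_short_name": short_name,
                                   "base_url": base_url})
        cgi_file.close()
        os.chmod(cgi_filename, 0755)  # CGI scripts must be executable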

