Przeglądaj źródła

Add scraper for the WAM sites (apart from Westminster, which is a bit different).

master
duncan.parkes 16 lat temu
rodzic
commit
3ba69e0a68
4 zmienionych plików z 221 dodań i 1 usunięć
  1. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  2. +3
    -1
      python_scrapers/PlanningExplorer.py
  3. +11
    -0
      python_scrapers/SitesToGenerate.csv
  4. +206
    -0
      python_scrapers/WAM.py

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Wyświetl plik

@@ -19,3 +19,4 @@
"SouthSomerset.cgi", "493"
"WestDorset.cgi", "493"
"Christchurch.cgi", "493"
"WAM.py", "420"

+ 3
- 1
python_scrapers/PlanningExplorer.py Wyświetl plik

@@ -548,7 +548,7 @@ if __name__ == '__main__':
#parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
#parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
#parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
#parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
#parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
#parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
@@ -576,3 +576,5 @@ if __name__ == '__main__':

# Investigate catching unavailable message:
# Charnwood

# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...

+ 11
- 0
python_scrapers/SitesToGenerate.csv Wyświetl plik

@@ -190,3 +190,14 @@
"Canterbury City Council", "Canterbury", "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "CanterburyParser"
"London Borough of Merton", "Merton", "http://planning.merton.gov.uk/", "PlanningExplorer", "MertonParser"
"Chichester District Council", "Chichester", "http://pa.chichester.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"London Borough of Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Braintree District Council", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", "WAM", "BraintreeParser"
"Castle Point Borough Council", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Colchester Borough Council", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", "WAM", "BraintreeParser"
"East Lothian Council", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"North Somerset Council", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", "WAM", "BraintreeParser"
"Nottingham City Council", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"Poole Borough Council", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", "WAM", "PooleParser"
"Rother District Council", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", "WAM", "WAMParser"
"South Gloucestershire Council", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", "WAM", "BraintreeParser"
"London Borough of Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", "WAM", "BraintreeParser"

+ 206
- 0
python_scrapers/WAM.py Wyświetl plik

@@ -0,0 +1,206 @@
import urllib2
import urllib
import urlparse

import datetime
import time
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

def clean_string(a_string):
    """Collapse every run of whitespace in a_string to a single space
    and strip leading/trailing whitespace."""
    # str.split() with no argument splits on any whitespace run and
    # discards leading/trailing whitespace, so one join does the job.
    return ' '.join(a_string.split())

def remove_params(url):
    """Return url with its path parameters removed (notably the
    ;jsessionid=... that WAM sites append to links)."""
    # urlparse gives a 6-tuple including the params component;
    # urlunsplit takes a 5-tuple WITHOUT it - rebuilding via urlunsplit
    # is a slightly naughty but effective way to drop the params.
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))

class WAMParser:
address_column = 2
date_format = "%d/%b/%Y"

def __init__(self,
authority_name,
authority_short_name,
base_url,
debug=False):

self.authority_name = authority_name
self.authority_short_name = authority_short_name
self.base_url = base_url

self.debug = debug

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

def _get_search_data(self, year, month, day):
timestamp = time.mktime((year, month, day, 0,0,0,0,0,0))
# The parameter endDate appears to be 1000*a timestamp
time_input = str(int(timestamp*1000))

#http://wam.boroughofpoole.com/WAM/pas/searchApplications.do;jsessionid=BCC7DFD1C42DC210A7BE5BA616683CDE
# areaCode=%25&sortOrder=1&endDate=1197213359015&applicationType=%25&Button=Search

search_data = (
("areaCode", "%"),
("sortOrder", "1"),
("endDate", time_input),
("applicationType", "%"),
("Button", "Search"),
)

return search_data

def getResultsByDayMonthYear(self, day, month, year):
search_data_tuple = self._get_search_data(year, month, day)
search_data = urllib.urlencode(search_data_tuple)

response = urllib2.urlopen(self.base_url, search_data)
html = response.read()

soup = BeautifulSoup(html)

results_table = soup.find(text=re.compile("Your search returned the following")).findNext("table")

# FIXME - deal with the empty results case
# FIXME - deal with later pages of results

trs = results_table.findAll("tr")[1:]

self._current_application = PlanningApplication()

for tr in trs:
try:

tds = tr.findAll("td")

date_received_string = tds[0].contents[0].strip()
self._current_application.date_received = datetime.datetime.strptime(clean_string(date_received_string), self.date_format)

relative_info_url = tr.a['href']
info_url_no_params = remove_params(relative_info_url)

#Now we join on the base url to make it absolute
self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_no_params)

self._current_application.council_reference = tr.a.string

address = clean_string(tds[self.address_column].string)
self._current_application.address = address
self._current_application.postcode = getPostcodeFromText(address)

# self._current_application.description = clean_string(tds[self.description_column].string)

# Fetch the info page

info_response = urllib2.urlopen(self._current_application.info_url)

info_html = info_response.read()
info_soup = BeautifulSoup(info_html)

try:
relative_comment_url = info_soup.find("a", href=re.compile("createComment.do"))['href']
comment_url_no_params = remove_params(relative_comment_url)

self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_no_params)
except: # FIXME - specialize the except
if self.debug:
print "No comment url for %s" %(self._current_application.council_reference)
self._current_application.comment_url = "None"

# Some WAM sites have the description in the results page,
# but since they don't all have it there, we'll get it from here...

description_td = info_soup.find(text="Development:").findNext("td")

# Sometimes the description is in a span in the td, sometimes it is directly there.
self._current_application.description = (description_td.string or description_td.span.string).strip()

self._results.addApplication(self._current_application)

except SystemExit:
# It seems a shame to miss out on all the apps from an authority just because one breaks...
if self._current_application.council_reference:
if self.debug:
print "Failed to add %s" %(self._current_application.council_reference)
else:
if self.debug:
print "Failed to add an application"
self._current_application = PlanningApplication()

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()



class PooleParser(WAMParser):
    # Poole's results table has the address in the second column,
    # not the third.
    address_column = 1

class BraintreeParser(WAMParser):
    """WAM variant used by Braintree (and several similar councils):
    the weekly-list search needs an extra 'action' parameter and uses a
    different date format."""

    date_format = "%d %b %Y"

    def _get_search_data(self, year, month, day):
        # Braintree example query string:
        # action=showWeeklyList&areaCode=%25&sortOrder=1&endDate=1203249969656&applicationType=%25&Button=Search
        common_data = WAMParser._get_search_data(self, year, month, day)
        return (("action", "showWeeklyList"),) + common_data

if __name__ == '__main__':
    # Ad-hoc test harness: uncomment exactly one parser line below to
    # scrape that council's WAM site.
    #parser = WAMParser("Barking and Dagenham", "Barking and Dagenham", "http://idoxwam.lbbd.gov.uk:8081/WAM/pas/searchApplications.do", debug=True)
    #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
    # Camden
    #parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do", debug=True)
    #Chichester - Done as PublicAccess
    #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
    #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
    #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
    #parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
    #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
    #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
    #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
    #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
    #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
    parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
    #parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)

    print parser.getResults(8,2,2008)

# Left to fix

# All:
# Paging
# Coping with no apps


# Barking and Dagenham - done
# Braintree - done
# Camden - also has a PlanningExplorer, which is done (so not bothering)
# Castle Point - done
# Chichester - not needed (PublicAccess site done)
# Colchester - done. like Braintree
# East Lothian - done
# North Somerset - done. like Braintree
# Nottingham - done (sometimes no comments)
# Poole - done
# Rother - done
# South Gloucestershire - done. like Braintree
# South Norfolk - Works, but no postcodes. Also, the search link here points to PlanningExplorer. I think we should assume this is the current site.
# Tower Hamlets - done. Like Braintree.
# Westminster - not done: clearly WAM underneath, but with a wrapper.

Ładowanie…
Anuluj
Zapisz