Переглянути джерело

Add a parser for sites with URLs ending in searchPageLoad.do

This includes:
Cumbria
Lincolnshire
West Sussex

It should also include:
Dorset
Somerset

but they are both down.

All of these sites are county councils, which are only responsible for a small number of applications, many of which are
for sites that don't have postcodes, so don't expect too much from these scrapers!
import/raw
duncan.parkes 17 роки тому
джерело
коміт
088a5387c1
4 змінених файлів з 129 додано та 0 видалено
  1. +112
    -0
      trunk/python_scrapers/AtriumePlanning.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +13
    -0
      trunk/python_scrapers/PlanningUtils.py
  4. +3
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 112
- 0
trunk/python_scrapers/AtriumePlanning.py Переглянути файл

@@ -0,0 +1,112 @@
import urllib2
import urllib
import urlparse

import datetime
#import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText


# Endpoint names that AtriumePlanningParser.__init__ resolves against the
# authority's base URL (with urlparse.urljoin) to build the per-application
# "full details" and "comment" links.
info_path = "loadFullDetails.do"
comment_path = "loadRepresentation.do"

class AtriumePlanningParser:
    """Scraper for planning sites that share the search interface used here.

    The search form is POSTed to ``base_url`` and the resulting HTML table
    is walked cell by cell; each application found is collected into a
    PlanningAuthorityResults object.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Store the authority details and derive the per-application URLs.

        authority_name       -- full name of the authority
        authority_short_name -- short name of the authority
        base_url             -- URL the search is POSTed to; info_path and
                                comment_path are resolved relative to it
        debug                -- stored on the instance; not otherwise used
                                by this class
        """
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.info_url = urlparse.urljoin(base_url, info_path)
        self.comment_url = urlparse.urljoin(base_url, comment_path)

        self.debug = debug

        # The application currently being assembled while walking the
        # results table; None until a "Date Registered" cell is seen.
        self._current_application = None

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications registered on the given date.

        day, month and year are integers. Returns the
        PlanningAuthorityResults held by this parser, with every complete
        application found by the search added to it. If the response has
        no results table, the results are returned unchanged.
        """
        # The end date for the search needs to be one day after the start
        # date - presumably the date is used as a timestamp at midnight
        search_start_date = datetime.date(year, month, day)
        search_end_date = search_start_date + datetime.timedelta(1)

        # NOTE(review): "%b" is locale dependent; the site presumably
        # expects English month abbreviations - confirm.
        search_data = urllib.urlencode({
            "dayRegStart": search_start_date.strftime("%d"),
            "monthRegStart": search_start_date.strftime("%b"),
            "yearRegStart": search_start_date.strftime("%Y"),
            "dayRegEnd": search_end_date.strftime("%d"),
            "monthRegEnd": search_end_date.strftime("%b"),
            "yearRegEnd": search_end_date.strftime("%Y"),
            "searchType": "current",
            "dispatch": "Search",
        })

        # Passing data makes this a POST. Always release the connection,
        # even if reading the body fails.
        response = urllib2.urlopen(self.base_url, search_data)
        try:
            html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(html)

        # The results table is the first table after the "Results" text.
        # Look the marker up once and bail out if either piece is missing.
        results_marker = soup.find(text="Results")
        if results_marker is None:
            return self._results

        results_table = results_marker.parent.findNext("table")
        if results_table is None:
            return self._results

        # Never carry a half-built application over from an earlier call.
        self._current_application = None

        for td in results_table.findAll("td"):
            label = td.string

            if label:
                label = label.strip()

                if label == "Date Registered":
                    # We are starting a new App
                    self._current_application = PlanningApplication()

                    date_text = td.findNext("td").string
                    if date_text:
                        # Strip first: surrounding whitespace in the cell
                        # would otherwise make strptime raise ValueError.
                        self._current_application.date_received = \
                            datetime.datetime.strptime(date_text.strip(), "%d-%m-%Y")

                elif self._current_application is None:
                    # A field cell before any "Date Registered" cell:
                    # there is no application to attach it to.
                    continue

                elif label == "Application Number":
                    self._current_application.council_reference = td.findNext("td").string

                elif label == "Location":
                    location = td.findNext("td").string
                    self._current_application.address = location

                    # Only look for a postcode when there is text to search.
                    if location:
                        postcode = getPostcodeFromText(location)
                        if postcode:
                            self._current_application.postcode = postcode

                elif label == "Proposal":
                    self._current_application.description = td.findNext("td").string

            elif td.a is not None and self._current_application is not None:
                # The anchor's string is None when the link wraps markup
                # rather than plain text, so check before stripping.
                link_text = td.a.string
                if link_text and link_text.strip() == "View Full Details":
                    # The info url is td.a
                    messy_info_url = td.a["href"]

                    # We need to get an id out of this url: keep only its
                    # query string and attach it to our own endpoints.
                    query_str = urlparse.urlsplit(messy_info_url)[3]

                    self._current_application.info_url = self.info_url + "?" + query_str
                    self._current_application.comment_url = self.comment_url + "?" + query_str

                    # This link is the last thing handled for an
                    # application here, so this is where it is stored.
                    if self._current_application.is_ready():
                        self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be strings; they are converted with int().
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


#if __name__ == '__main__':
# cumbria_parser = AtriumePlanningParser("Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do")

# print cumbria_parser.getResults(22,11,2007)
# lincolnshire_parser = AtriumePlanningParser("Lincolnshire County Council", "Lincolnshire", "")

# print cumbria_parser.getResults(22,11,2007)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Переглянути файл

@@ -12,3 +12,4 @@
"NorthHerts.cgi", "493"
"Enfield.cgi", "493"
"RutlandLike.py", "420"
"AtriumePlanning.py", "420"

+ 13
- 0
trunk/python_scrapers/PlanningUtils.py Переглянути файл

@@ -88,6 +88,19 @@ class PlanningApplication:

def __repr__(self):
return self.displayXML()

def is_ready(self):
    """Return True if the application is complete, False otherwise.

    An application is complete when council_reference, address,
    description, info_url, comment_url and date_received are all set
    to truthy values.
    """
    # Because of the postcode default, we can't really
    # check the postcode - make sure it is filled in when
    # you do the address.
    #
    # bool() makes this a real predicate: the bare "and" chain would
    # return whichever operand ended it (None, "", a date, ...).
    return bool(self.council_reference
                and self.address
                and self.description
                and self.info_url
                and self.comment_url
                and self.date_received)
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received


+ 3
- 0
trunk/python_scrapers/SitesToGenerate.csv Переглянути файл

@@ -141,3 +141,6 @@
"Melton Borough Council", "Melton", "http://www.meltononline.co.uk/planning/searchparam.asp", "RutlandLike", "RutlandLikeParser"
"Harborough District Council", "Harborough", "http://pa.harborough.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"East Northamptonshire Council", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
"Lincolnshire County Council", "Lincolnshire", "http://apps1.lincolnshire.gov.uk/ePlanning/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
"West Sussex County Council", "West Sussex", "http://eplanning.westsussex.gov.uk/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"

Завантаження…
Відмінити
Зберегти