Преглед на файлове

Add a parser for sites with URLs ending in searchPageLoad.do.

This includes:
Cumbria
Lincolnshire
West Sussex

It should also include:
Dorset
Somerset

but they are both down.

All of these sites are county councils, which are responsible for only a small number of applications, many of which
are for sites that do not have postcodes, so don't expect too much from these scrapers!
master
duncan.parkes преди 16 години
родител
ревизия
e74b8cb631
променени са 4 файла, в които са добавени 129 реда и са изтрити 0 реда
  1. +112
    -0
      python_scrapers/AtriumePlanning.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +13
    -0
      python_scrapers/PlanningUtils.py
  4. +3
    -0
      python_scrapers/SitesToGenerate.csv

+ 112
- 0
python_scrapers/AtriumePlanning.py Целия файл

@@ -0,0 +1,112 @@
import urllib2
import urllib
import urlparse

import datetime
#import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText


# Endpoint file names for an application's full-details page and its comment
# page. AtriumePlanningParser resolves them against the search URL (base_url)
# with urlparse.urljoin, so they replace the last path segment of that URL.
info_path = "loadFullDetails.do"
comment_path = "loadRepresentation.do"

class AtriumePlanningParser:
    """Scrape planning applications from an ePlanning search page.

    The parser POSTs a one-day "date registered" search to ``base_url`` and
    walks the cells of the table that follows the "Results" heading, building
    one PlanningApplication per "Date Registered" label it meets.

    Parameters:
        authority_name: full name of the planning authority.
        authority_short_name: short name of the planning authority.
        base_url: URL the search form is posted to; the full-details and
            comment URLs are derived from it with urljoin.
        debug: stored on the instance; not otherwise used by this class.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.info_url = urlparse.urljoin(base_url, info_path)
        self.comment_url = urlparse.urljoin(base_url, comment_path)

        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # The application currently being assembled, or None when no
        # "Date Registered" cell has been seen yet.
        self._current_application = None

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications registered on the given date.

        Parameters:
            day, month, year: integers identifying the date to search for.

        Returns:
            The PlanningAuthorityResults object of this parser, with every
            complete application found by this search added to it.

        Raises:
            ValueError: if the date is invalid or a registration date on the
                page is not in dd-mm-YYYY form.
            urllib2.URLError: if the search page cannot be fetched.
        """
        # The end date for the search needs to be one day after the start
        # date - presumably the date is used as a timestamp at midnight
        search_start_date = datetime.date(year, month, day)
        search_end_date = search_start_date + datetime.timedelta(1)

        # NOTE(review): "%b" gives the abbreviated month name of the current
        # locale - presumably the site expects English names; confirm.
        search_data = urllib.urlencode({
            "dayRegStart": search_start_date.strftime("%d"),
            "monthRegStart": search_start_date.strftime("%b"),
            "yearRegStart": search_start_date.strftime("%Y"),
            "dayRegEnd": search_end_date.strftime("%d"),
            "monthRegEnd": search_end_date.strftime("%b"),
            "yearRegEnd": search_end_date.strftime("%Y"),
            "searchType": "current",
            "dispatch": "Search",
        })

        response = urllib2.urlopen(self.base_url, search_data)
        try:
            html = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        soup = BeautifulSoup(html)

        # No "Results" heading means there is nothing to parse for this day.
        results_heading = soup.find(text="Results")
        if results_heading is None:
            return self._results

        results_table = results_heading.parent.findNext("table")
        if results_table is None:
            return self._results

        # Never carry a half-built application over from an earlier search.
        self._current_application = None

        for td in results_table.findAll("td"):
            if td.string:
                label = td.string.strip()

                if label == "Date Registered":
                    # We are starting a new App
                    self._current_application = PlanningApplication()
                    self._current_application.date_received = datetime.datetime.strptime(td.findNext("td").string, "%d-%m-%Y")
                elif self._current_application is None:
                    # A cell seen before any "Date Registered" label has no
                    # application to belong to, so there is nothing to fill.
                    continue
                elif label == "Application Number":
                    self._current_application.council_reference = td.findNext("td").string
                elif label == "Location":
                    location = td.findNext("td").string
                    self._current_application.address = location

                    postcode = getPostcodeFromText(location)
                    if postcode:
                        self._current_application.postcode = postcode
                elif label == "Proposal":
                    self._current_application.description = td.findNext("td").string
            elif td.a is not None and self._current_application is not None:
                link_text = td.a.string
                if link_text and link_text.strip() == "View Full Details":
                    # The info url is td.a
                    messy_info_url = td.a["href"]

                    # We need to get an id out of this url
                    query_str = urlparse.urlsplit(messy_info_url)[3]

                    self._current_application.info_url = self.info_url + "?" + query_str
                    self._current_application.comment_url = self.comment_url + "?" + query_str

            if self._current_application is not None \
                    and self._current_application.is_ready():
                self._results.addApplication(self._current_application)
                # Forget the finished application so that later cells cannot
                # add it to the results a second time.
                self._current_application = None

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as XML.

        The arguments are converted with int(), so they may be strings.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


#if __name__ == '__main__':
# cumbria_parser = AtriumePlanningParser("Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do")

# print cumbria_parser.getResults(22,11,2007)
# lincolnshire_parser = AtriumePlanningParser("Lincolnshire County Council", "Lincolnshire", "http://apps1.lincolnshire.gov.uk/ePlanning/loadResults.do")

# print lincolnshire_parser.getResults(22,11,2007)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Целия файл

@@ -12,3 +12,4 @@
"NorthHerts.cgi", "493"
"Enfield.cgi", "493"
"RutlandLike.py", "420"
"AtriumePlanning.py", "420"

+ 13
- 0
python_scrapers/PlanningUtils.py Целия файл

@@ -88,6 +88,19 @@ class PlanningApplication:

def __repr__(self):
    """Represent the application by the text that displayXML() produces."""
    xml_text = self.displayXML()
    return xml_text

def is_ready(self):
    """Tell whether every required field of the application is filled in.

    The postcode is left out of the check on purpose: it has a default
    value, so it cannot reveal a missing entry. Make sure it is filled in
    at the same time as the address.

    Returns the first empty (falsy) field value met, in the order
    council_reference, address, description, info_url, comment_url,
    or else the value of date_received. Use the result for its truth value.
    """
    value = self.council_reference
    for field_name in ("address", "description", "info_url",
                       "comment_url", "date_received"):
        # Stop at the first empty field, exactly as an "and" chain would.
        if not value:
            break
        value = getattr(self, field_name)
    return value
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received


+ 3
- 0
python_scrapers/SitesToGenerate.csv Целия файл

@@ -141,3 +141,6 @@
"Melton Borough Council", "Melton", "http://www.meltononline.co.uk/planning/searchparam.asp", "RutlandLike", "RutlandLikeParser"
"Harborough District Council", "Harborough", "http://pa.harborough.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"East Northamptonshire Council", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
"Lincolnshire County Council", "Lincolnshire", "http://apps1.lincolnshire.gov.uk/ePlanning/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
"West Sussex County Council", "West Sussex", "http://eplanning.westsussex.gov.uk/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"

Зареждане…
Отказ
Запис