Преглед изворни кода

Add Wychavon scraper from Thomas.

import/raw
duncan.parkes@gmail.com пре 15 година
родитељ
комит
e51eab174b
3 измењених фајлова са 131 додато и 1 уклоњено
  1. +2
    -1
      trunk/OtherFilesToCopy.csv
  2. +1
    -0
      trunk/SitesToGenerate.csv
  3. +128
    -0
      trunk/python_scrapers/Wychavon.py

+ 2
- 1
trunk/OtherFilesToCopy.csv Прегледај датотеку

@@ -68,4 +68,5 @@
"Broxtowe.py", "420"
"Mendip.py", "420"
"Weymouth.py", "420"
"Solihull.py", "420"
"Solihull.py", "420"
"Wychavon.py", "420"

+ 1
- 0
trunk/SitesToGenerate.csv Прегледај датотеку

@@ -317,6 +317,7 @@
"Wolverhampton City Council","Wolverhampton",,,,,,"http://planningonline.wolverhampton.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",
"Worcester City Council","Worcester",,,,,,"http://www.worcester.gov.uk:8080/planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry","Planet","PlanetParser",
"Worthing Borough Council","Worthing",,,,,,"http://planning.worthing.gov.uk/publicaccess/tdc/","PublicAccess","PublicAccessParser",
"Wychavon District Council","Wychavon",,,,,,,"Wychavon","WychavonParser",
"Wycombe District Council","Wycombe",,,,,,"http://planningpa.wycombe.gov.uk/publicaccess/tdc/","PublicAccess","PublicAccessParser",
"Wyre Forest District Council","Wyre Forest",,,,,,"http://www.wyreforest.gov.uk/fastweb/","FastWeb","FastWeb",
"City of York Council","York",,,,,,"http://planning.york.gov.uk/PublicAccess/tdc/","PublicAccess","PublicAccessParser",

+ 128
- 0
trunk/python_scrapers/Wychavon.py Прегледај датотеку

@@ -0,0 +1,128 @@
"""
This is the screenscraper for planning apps from Wychavon District Council.

This appears to be an Acolnet variant, and is searched by a block of months.
"""

import urllib
import urlparse

import datetime

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class WychavonParser:
    """Screen scraper for planning applications from Wychavon District Council.

    The council's search form is queried for a whole month at a time; every
    application found is wrapped in a PlanningApplication and collected in a
    PlanningAuthorityResults object.
    """

    # Common prefix of every comment-form URL.  It is a naked IP address
    # rather than a host on the council's own site, but that is where the
    # comment form lives.
    _COMMENT_URL_PREFIX = ("http://81.171.139.151/WAM/createComment.do"
                           "?action=CreateApplicationComment"
                           "&applicationType=PLANNING&appNumber=")

    # Application type (as displayed in the search results) -> suffix that the
    # comment form expects after the council reference.  These are all the
    # types that were found, except "Advice - Pre-app/Householder", whose
    # suffix is inconsistent; it could be obtained by scraping the description
    # page of each application.  Unknown types degrade silently to no suffix.
    _SUFFIX_BY_APPTYPE = {
        "Householder planning application": "PP",
        "Non-householder planning application": "PN",
        "Outline applications": "OU",
        "Change of use": "CU",
        "Listed Building consent": "LB",
        "Advertisement application": "AA",
        "Certificate of Lawfulness Existing": "LUE",
        "Approval of reserved matters": "VOC",
    }

    def __init__(self, *args):
        """Set up the authority names, the search URL and the result holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Wychavon"
        self.authority_short_name = "Wychavon"
        # Currently hard coded--if this address updates, we'll need to scrape
        # the search form to get it each time.
        self.base_url = (
            "http://www.e-wychavon.org.uk/scripts/plan2005/"
            "acolnetcgi.exe?ACTION=UNWRAP&WhereDescription=General%20Search&"
            "Whereclause3=%27%30%31%2F%7BEdtMonthEnd%7D%2F%7BEdtYearEnd%7D%27&"
            "RIPNAME=Root%2EPages%2EPgeDC%2EPgeListCases"
        )
        self._results = PlanningAuthorityResults(self.authority_name,
                                                 self.authority_short_name)

    def _fetch_search_page(self, month, year):
        """POST the search form for the given month/year and return the HTML."""
        form_data = ("EdtYearNo=&EdtCaseNo=&EdtApplicant=&EdtAgent=&EdtLocation"
                     "=&EdtWard=&EdtMonthStart1=%s&EdtYearStart=%s"
                     "&EdtMonthEnd=%s&EdtYearEnd=%s&submit=Search"
                     % (month, year, month, year))
        response = urllib.urlopen(self.base_url, form_data)
        try:
            return response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()

    def _build_comment_url(self, council_reference, apptype):
        """Return the comment-form URL for an application.

        Is all this really necessary?  It is assumed so: the form will appear
        without the suffix, but it is not known whether the council's backend
        would accept a comment submitted that way.
        """
        if apptype == "Telecommunications":
            return (self._COMMENT_URL_PREFIX + "T3/" + council_reference
                    + "/TC")
        suffix = self._SUFFIX_BY_APPTYPE.get(apptype, "")
        return self._COMMENT_URL_PREFIX + "W/" + council_reference + "/" + suffix

    def _parse_application(self, table):
        """Build a PlanningApplication from one results table."""
        rows = table.findAll("tr")
        # Look each row's cells up once instead of once per field.
        first_cells = rows[0].findAll("td")
        second_cells = rows[1].findAll("td")
        third_cells = rows[2].findAll("td")

        application = PlanningApplication()

        reference_cell = first_cells[1]
        application.council_reference = \
            reference_cell.font.font.font.string.strip()
        application.info_url = urlparse.urljoin(self.base_url,
                                                reference_cell.a['href'])

        application.address = second_cells[1].font.string.strip()
        application.postcode = getPostcodeFromText(application.address)

        # The description cell can be empty; fall back to an empty string.
        description = third_cells[1].font.string
        if description is None:
            application.description = ""
        else:
            application.description = description.strip()

        # The three slash-separated parts are read as month, day, year.
        # NOTE(review): confirm the site really renders dates in that order.
        rec_m, rec_d, rec_y = second_cells[3].font.string.strip().split("/")
        application.date_received = datetime.date(int(rec_y), int(rec_m),
                                                  int(rec_d))

        # The application type can be missing too (this can happen).
        apptype = first_cells[3].font.string
        if apptype is not None:
            apptype = apptype.strip()
        application.comment_url = self._build_comment_url(
            application.council_reference, apptype)

        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape all applications listed for the month containing the date.

        The site is searched by month, so ``day`` is not used in the query.
        Applications are appended to the results object created in
        ``__init__``, so repeated calls on the same parser accumulate.

        Returns the PlanningAuthorityResults object holding the applications.
        """
        soup = BeautifulSoup(self._fetch_search_page(month, year))
        # Each set of results has its own table
        for table in soup.findAll("table", cellpadding="2", cols="4"):
            self._results.addApplication(self._parse_application(table))

        return self._results

    def getResults(self, day, month, year):
        """Return the scraped results for the given date as XML.

        ``day``, ``month`` and ``year`` may be anything ``int()`` accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke run with a fixed date, following the Barnsley scraper's
    # example.  Fine for testing; open question whether a real run should
    # take the date from the command-line arguments instead.
    scraper = WychavonParser()
    print(scraper.getResults(16, 3, 2009))

Loading…
Откажи
Сачувај