
Add scraper for Kingston upon Thames.

import/raw
duncan.parkes 17 years ago
parent
commit
3c85f0d0dd
3 files changed with 72 additions and 0 deletions
  1. trunk/python_scrapers/KingstonUponThames.py  +70 -0
  2. trunk/python_scrapers/OtherFilesToCopy.csv  +1 -0
  3. trunk/python_scrapers/SitesToGenerate.csv  +1 -0

trunk/python_scrapers/KingstonUponThames.py  +70 -0

@@ -0,0 +1,70 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%b/%Y"

class KingstonParser:
    comments_email_address = "dc@rbk.kingston.gov.uk"

    def __init__(self, *args):
        self.authority_name = "Royal Borough of Kingston upon Thames"
        self.authority_short_name = "Kingston upon Thames"
        self.base_url = "http://maps.kingston.gov.uk/isis_main/planning/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strStreets=ALL&strStreetsTxt=All%%20Streets&strLimit=500"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each app is stored in a table of its own.
        # These tables don't have any nice distinguishing features,
        # but they do all contain a NavigableString "Application",
        # and nothing else in the page does.
        nav_strings = soup.findAll(text="Application")
        for nav_string in nav_strings:
            results_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            application.council_reference = results_table.a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, results_table.a['href'])
            application.address = results_table.findAll("td")[7].a.string.strip()

            application.postcode = getPostcodeFromText(application.address)
            application.description = results_table.findAll("td")[-1].contents[0].strip()

            # A few applications have comment urls, but most don't.
            # When they do, they have a case officer - I don't think we can
            # work out the other urls - even if they exist.
            # Best to use the email address.
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = KingstonParser()
    print parser.getResults(2, 8, 2008)
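
The scraper imports PlanningApplication, PlanningAuthorityResults and getPostcodeFromText from PlanningUtils, which is not part of this commit. A minimal sketch of the interface the code above assumes, inferred only from how it is called here; the field defaults, XML stub and postcode regex are guesses, not the real implementation:

import re

class PlanningApplication:
    def __init__(self):
        # Fields the Kingston scraper sets; defaults assumed.
        self.council_reference = None
        self.date_received = None
        self.address = None
        self.postcode = None
        self.description = None
        self.info_url = None
        self.comment_url = None

class PlanningAuthorityResults:
    def __init__(self, authority_name, authority_short_name):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.planning_applications = []

    def addApplication(self, application):
        self.planning_applications.append(application)

    def displayXML(self):
        # Stub: the real method presumably serialises the applications to XML.
        return "<planning authority_name=%r applications=%d />" % (
            self.authority_name, len(self.planning_applications))

def getPostcodeFromText(text):
    # Assumption: returns the first UK-postcode-looking substring, or None.
    match = re.search(r"[A-Z]{1,2}[0-9][A-Z0-9]? ?[0-9][A-Z]{2}", text or "")
    return match.group() if match else None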


trunk/python_scrapers/OtherFilesToCopy.csv  +1 -0

@@ -48,3 +48,4 @@
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420" "Berwick.py", "420"
"Birmingham.py", "420" "Birmingham.py", "420"
"KingstonUponThames.py", "420"

trunk/python_scrapers/SitesToGenerate.csv  +1 -0

@@ -252,3 +252,4 @@
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser" "Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser" "Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
