Ver código fonte

Add Kensington and Chelsea Scraper.

import/raw
duncan.parkes 18 anos atrás
pai
commit
2a0fe9d90a
3 arquivos alterados com 74 adições e 0 exclusões
  1. +72
    -0
      trunk/python_scrapers/Kensington.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 72
- 0
trunk/python_scrapers/Kensington.py Ver arquivo

@@ -0,0 +1,72 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strptime() pattern (day/month/year) used to parse the received-date
# column of the weekly results table.
date_format = "%d/%m/%Y"

class KensingtonParser:
    """Scraper for Kensington and Chelsea's weekly planning results page.

    The council's search form is queried by "week ending" date, so a
    request for a single day is answered with every application listed
    for the week that contains that day.
    """

    def __init__(self, *args):
        """Store the authority names, the search URL and a results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "The Royal Borough of Kensington and Chelsea"
        self.authority_short_name = "Kensington and Chelsea"
        self.base_url = "http://www.rbkc.gov.uk/Planning/scripts/weeklyresults.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    @staticmethod
    def _week_ending_friday(search_day):
        """Return the Friday of the Monday-to-Sunday week containing search_day.

        For a Saturday or Sunday this is the Friday just before it.
        """
        monday = search_day - datetime.timedelta(search_day.weekday())
        return monday + datetime.timedelta(4)

    @staticmethod
    def _build_post_data(week_end):
        """Return the url-encoded form body for the week ending on week_end."""
        # Not using urllib.urlencode as it insists on turning the "+" into "%2B"
        return "WeekEndDate=%d%%2F%d%%2F%d&order=Received+Date&submit=search" % (
            week_end.day, week_end.month, week_end.year)

    @staticmethod
    def _clean_reference(raw_reference):
        """Strip whitespace and turn entity-encoded slashes back into "/".

        The site emits the slashes in the reference as the numeric
        character entity "&#47;"; the parser leaves them as raw text, so
        they are converted back here.
        """
        return raw_reference.strip().replace("&#47;", "/")

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the applications for the week containing the given date.

        day, month and year are integers. Each data row of the results
        table is turned into a PlanningApplication and added to the
        parser's results object, which is then returned. Results
        accumulate on the instance across calls.

        Raises whatever urllib2 raises on network failure, and
        AttributeError / IndexError if the page does not have the
        expected table layout.
        """
        search_day = datetime.date(year, month, day)

        # The search is keyed on the Friday of the week being searched for.
        friday = self._week_ending_friday(search_day)
        post_data = self._build_post_data(friday)

        # Passing a data argument makes urllib2 issue a POST.
        response = urllib2.urlopen(self.base_url, post_data)
        try:
            html = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(html)

        results_table = soup.find("table", summary="Planning Application search results table")

        # The first row is skipped (presumably the header row).
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()

            tds = tr.findAll("td")
            link = tds[0].a

            application.council_reference = self._clean_reference(link.contents[1])
            application.info_url = urlparse.urljoin(self.base_url, link['href'])
            # No separate comment page is scraped; reuse the info page.
            application.comment_url = application.info_url

            received = time.strptime(tds[1].string.strip(), date_format)
            application.date_received = datetime.datetime(*(received[0:6]))

            application.address = tds[2].string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[3].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the week's results rendered by the results object's displayXML().

        day, month and year may be ints or numeric strings; they are
        converted with int() before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape the week containing 11 June 2008 and dump
    # whatever getResults() returns to stdout.
    scraper = KensingtonParser()
    output = scraper.getResults(11, 6, 2008)
    print(output)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Ver arquivo

@@ -24,3 +24,4 @@
"IsleOfWight.py", "420" "IsleOfWight.py", "420"
"Barnsley.py", "420" "Barnsley.py", "420"
"Shetland.py", "420" "Shetland.py", "420"
"Kensington.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Ver arquivo

@@ -222,3 +222,4 @@
"Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser" "Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser"
"Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser" "Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser" "Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser"
"The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser"

Carregando…
Cancelar
Salvar