Browse Source

Add parser for Kirklees. Get rid of some unnecessary imports.

import/raw
duncan.parkes 16 years ago
parent
commit
f0a0912836
5 changed files with 89 additions and 2 deletions
  1. +1
    -1
      trunk/python_scrapers/Aberdeenshire.py
  2. +1
    -1
      trunk/python_scrapers/Berwick.py
  3. +85
    -0
      trunk/python_scrapers/Kirklees.py
  4. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  5. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 1
- 1
trunk/python_scrapers/Aberdeenshire.py View File

@@ -3,7 +3,7 @@ import urllib2
import urllib import urllib
import urlparse import urlparse


import datetime, time
import datetime
import cgi import cgi
import re import re




+ 1
- 1
trunk/python_scrapers/Berwick.py View File

@@ -3,7 +3,7 @@ import urllib2
import urllib import urllib
import urlparse import urlparse


import datetime, time
import datetime
import cgi import cgi


from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup


+ 85
- 0
trunk/python_scrapers/Kirklees.py View File

@@ -0,0 +1,85 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime pattern for the date parameters of the search URL. "%%" yields a
# literal "%", so the output is dd%2Fmm%2Fyyyy, i.e. dd/mm/yyyy with the
# slashes already URL-encoded.
search_date_format = "%d%%2F%m%%2F%Y"
# strptime pattern used to parse the received-date cell of a results row:
# day of month, abbreviated month name, four-digit year.
received_date_format = "%d %b %Y"

class KirkleesParser:
    """Scraper for the Kirklees Council planning application search.

    Results are collected into a single PlanningAuthorityResults object that
    is created in __init__, so repeated searches on the same instance add to
    the same result set.
    """

    def __init__(self, *args):
        """Set up the authority details and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Kirklees Council"
        self.authority_short_name = "Kirklees"
        # The same date is substituted for both ends of the search range,
        # and pageNum selects the page of results.
        self.base_url = "http://www.kirklees.gov.uk/business/planning/List.asp?SrchApp=&SrchName=&SrchPostCode=&SrchStreet=&SrchDetails=&SrchLocality=&RorD=A&SrchDteFr=%(date)s&SrchDteTo=%(date)s&Submit=Search&pageNum=%(pagenum)d"
        self.comments_email_address = "planning.contactcentre@kirklees.gov.uk"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every application received on the given date.

        Pages of search results are requested one after another until a page
        has no result rows, or a row is found whose received date is earlier
        than the date searched for.

        day, month and year are integers. Returns the
        PlanningAuthorityResults object holding the applications found.
        """
        search_date = datetime.date(year, month, day)
        # The encoded date is the same for every page, so build it once.
        encoded_date = search_date.strftime(search_date_format)

        pagenum = 1

        while pagenum:
            response = urllib2.urlopen(self.base_url % {"date": encoded_date,
                                                        "pagenum": pagenum})
            try:
                soup = BeautifulSoup.BeautifulSoup(response.read())
            finally:
                # Always release the connection, even if parsing fails.
                response.close()

            # This is not a nice way to find the results table, but there is
            # nothing better to key on, and it works...
            results_table = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="")

            if results_table is None:
                # No results table on this page: treated as "nothing (more)
                # to read" rather than crashing on None.
                break

            # There are two trs with style attributes per app. This picks
            # the first one of each pair.
            trs = results_table.findAll("tr", style=True)[::2]

            if not trs:
                # An empty page means we have run out of results. Without
                # this check the for/else below would keep asking for the
                # next page forever.
                break

            for tr in trs:
                tds = tr.findAll("td")
                date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()

                # Stop looking through the list if we have found one which
                # is earlier than the date searched for.
                if date_received < search_date:
                    # If we break out, then we won't want the next page.
                    pagenum = None
                    break

                application = PlanningApplication()
                application.date_received = date_received

                application.council_reference = tds[0].small.string.strip()

                # The second <td> contains the address, split up with
                # <br/>s; keep only the text nodes.
                application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
                application.postcode = getPostcodeFromText(application.address)

                application.description = tds[2].string.strip()

                # The link to the application details is the first <a>
                # found after this row.
                application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
                # The comment "url" is set to the contact email address.
                application.comment_url = self.comments_email_address

                self._results.addApplication(application)
            else:
                # If we got through the whole list without breaking out,
                # then we'll want to get the next page.
                pagenum += 1

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as rendered by displayXML().

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check: fetch and print the results for 1 October 2008.
    kirklees = KirkleesParser()
    print(kirklees.getResults(1, 10, 2008))

# TODO


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -60,3 +60,4 @@
"Eastbourne.py", "420" "Eastbourne.py", "420"
"Gosport.py", "420" "Gosport.py", "420"
"WestDorset.py", "420" "WestDorset.py", "420"
"Kirklees.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -266,3 +266,4 @@
"Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser" "Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser" "West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"

Loading…
Cancel
Save