Bläddra i källkod

Add parser for Kirklees. Get rid of some unnecessary imports.

master
duncan.parkes 15 år sedan
förälder
incheckning
e39114078f
5 ändrade filer med 89 tillägg och 2 borttagningar
  1. +1
    -1
      python_scrapers/Aberdeenshire.py
  2. +1
    -1
      python_scrapers/Berwick.py
  3. +85
    -0
      python_scrapers/Kirklees.py
  4. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  5. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 1
- 1
python_scrapers/Aberdeenshire.py Visa fil

@@ -3,7 +3,7 @@ import urllib2
import urllib import urllib
import urlparse import urlparse


import datetime, time
import datetime
import cgi import cgi
import re import re




+ 1
- 1
python_scrapers/Berwick.py Visa fil

@@ -3,7 +3,7 @@ import urllib2
import urllib import urllib
import urlparse import urlparse


import datetime, time
import datetime
import cgi import cgi


from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup


+ 85
- 0
python_scrapers/Kirklees.py Visa fil

@@ -0,0 +1,85 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the SrchDteFr/SrchDteTo query parameters: dd/mm/yyyy
# with the slashes already URL-encoded as %2F ("%%" is strftime's escape
# for a literal "%").
search_date_format = "%d%%2F%m%%2F%Y"
# strptime format for the date cell of each results row, e.g. "01 Oct 2008".
received_date_format = "%d %b %Y"

class KirkleesParser:
    """Scraper for the planning application search on the Kirklees Council site.

    Results are accumulated in a PlanningAuthorityResults object that is
    created once in __init__, so repeated searches on the same instance
    add to the same result set.
    """

    def __init__(self, *args):
        """Set up the authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Kirklees Council"
        self.authority_short_name = "Kirklees"
        # Search URL template; %(date)s is used for both ends of the date
        # range and %(pagenum)d selects the page of results.
        self.base_url = "http://www.kirklees.gov.uk/business/planning/List.asp?SrchApp=&SrchName=&SrchPostCode=&SrchStreet=&SrchDetails=&SrchLocality=&RorD=A&SrchDteFr=%(date)s&SrchDteTo=%(date)s&Submit=Search&pageNum=%(pagenum)d"
        self.comments_email_address = "planning.contactcentre@kirklees.gov.uk"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetchPage(self, search_date, pagenum):
        """Download one page of search results and return it as parsed soup."""
        url = self.base_url % {"date": search_date.strftime(search_date_format),
                               "pagenum": pagenum}
        response = urllib2.urlopen(url)
        try:
            return BeautifulSoup.BeautifulSoup(response.read())
        finally:
            # Always release the connection, even if reading or parsing fails.
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect the applications listed for the given date.

        day, month and year are integers. Pages of results are fetched in
        turn until a page has no results table, a page has no result rows,
        or a row is found whose date is earlier than the date searched for.

        Returns the PlanningAuthorityResults object held by this parser.
        Raises ValueError if the arguments do not form a valid date or a
        date cell does not match received_date_format; errors raised by
        urllib2.urlopen are not caught.
        """
        search_date = datetime.date(year, month, day)

        pagenum = 1

        while pagenum:
            soup = self._fetchPage(search_date, pagenum)

            # This is not a nice way to find the results table, but there is
            # nothing better to key on in the page.
            table = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="")
            if table is None:
                # No results table on this page: nothing (more) to read.
                break

            # There are two trs with style attributes per app. This finds
            # the first one of each pair.
            trs = table.findAll("tr", style=True)[::2]
            if not trs:
                # An empty page would otherwise fall through to the else
                # branch below and request further pages forever.
                break

            for tr in trs:
                tds = tr.findAll("td")
                date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()

                # Stop looking through the list if we have found one which
                # is earlier than the date searched for.
                if date_received < search_date:
                    # If we break out, then we won't want the next page.
                    pagenum = None
                    break

                application = PlanningApplication()
                application.date_received = date_received

                application.council_reference = tds[0].small.string.strip()

                # The second <td> contains the address, split up with <br/>s,
                # so keep only the text nodes.
                application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
                application.postcode = getPostcodeFromText(application.address)

                application.description = tds[2].string.strip()

                application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
                # The comment "url" is set to the council's contact email address.
                application.comment_url = self.comments_email_address

                self._results.addApplication(application)
            else:
                # If we got through the whole list without breaking out,
                # then we'll want to get the next page.
                pagenum += 1

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be ints or numeric strings; each is passed
        through int() first.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: run the scraper for 1 October 2008 and print
    # whatever getResults returns.
    kirklees = KirkleesParser()
    print(kirklees.getResults(1, 10, 2008))

# TODO


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Visa fil

@@ -60,3 +60,4 @@
"Eastbourne.py", "420" "Eastbourne.py", "420"
"Gosport.py", "420" "Gosport.py", "420"
"WestDorset.py", "420" "WestDorset.py", "420"
"Kirklees.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Visa fil

@@ -266,3 +266,4 @@
"Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser" "Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser" "West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"

Laddar…
Avbryt
Spara