- import urllib2
- import urlparse
-
- import datetime
-
- import BeautifulSoup
-
- from PlanningUtils import PlanningApplication, \
-                           PlanningAuthorityResults, \
-                           getPostcodeFromText
-
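- # The doubled %% in search_date_format comes out of strftime as a literal
- # "%2F" (a URL-encoded "/"), e.g. "01%2F10%2F2008", ready for the
- # SrchDteFr/SrchDteTo query parameters.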
- search_date_format = "%d%%2F%m%%2F%Y"
- received_date_format = "%d %b %Y"
-
- class KirkleesParser:
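-     """Scrapes planning applications from the Kirklees Council website."""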
-     def __init__(self, *args):
-
-         self.authority_name = "Kirklees Council"
-         self.authority_short_name = "Kirklees"
-         self.base_url = "http://www.kirklees.gov.uk/business/planning/List.asp?SrchApp=&SrchName=&SrchPostCode=&SrchStreet=&SrchDetails=&SrchLocality=&RorD=A&SrchDteFr=%(date)s&SrchDteTo=%(date)s&Submit=Search&pageNum=%(pagenum)d"
-         self.comments_email_address = "planning.contactcentre@kirklees.gov.uk"
-
-         self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
-
-
-     def getResultsByDayMonthYear(self, day, month, year):
-         search_date = datetime.date(year, month, day)
-
-         pagenum = 1
-
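-         # Results come back a page at a time; keep fetching pages until a row
-         # dated before the search date turns up (pagenum is set to None below).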
-         while pagenum:
-             response = urllib2.urlopen(self.base_url % {"date": search_date.strftime(search_date_format),
-                                                         "pagenum": pagenum})
-             soup = BeautifulSoup.BeautifulSoup(response.read())
-
-             # This is not a nice way to find the results table, but I can't
-             # see anything good to use, and it works...
-
-             # Each application occupies two <tr>s with style attributes;
-             # this slice takes the first of each pair.
-             trs = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="").findAll("tr", style=True)[::2]
-
-             for tr in trs:
-                 tds = tr.findAll("td")
-                 date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()
-
-                 # Stop looking through the list if we have found one which is
-                 # earlier than the date searched for.
-                 if date_received < search_date:
-                     # If we break out, then we won't want the next page.
-                     pagenum = None
-                     break
-
-                 application = PlanningApplication()
-                 application.date_received = date_received
-
-                 application.council_reference = tds[0].small.string.strip()
-
-                 # The second <td> contains the address, split up with <br/>s.
-                 application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
-                 application.postcode = getPostcodeFromText(application.address)
-
-                 application.description = tds[2].string.strip()
-
-                 # The next <a> after this row links to the application's detail page.
-                 application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
-                 application.comment_url = self.comments_email_address
-
-                 self._results.addApplication(application)
-             else:
-                 # If we got through the whole list without breaking out,
-                 # then we'll want to get the next page.
-                 pagenum += 1
-
-         return self._results
-
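-     # Convenience wrapper: coerces the arguments to ints and returns the
-     # results serialized by displayXML().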
-     def getResults(self, day, month, year):
-         return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
-
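- # Quick manual check: fetch applications received on 1 October 2008.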
- if __name__ == '__main__':
-     parser = KirkleesParser()
-     print parser.getResults(1, 10, 2008)
-
- # TODO