Add parser for Kirklees. Get rid of some unnecessary imports.

před 17 roky · e39114078f
--- a/python_scrapers/Aberdeenshire.py
+++ b/python_scrapers/Aberdeenshire.py
@@ -3,7 +3,7 @@ import urllib2
 import urllib
 import urlparse

 import datetime, time
 import datetime
 import cgi
 import re

--- a/python_scrapers/Berwick.py
+++ b/python_scrapers/Berwick.py
@@ -3,7 +3,7 @@ import urllib2
 import urllib
 import urlparse

 import datetime, time
 import datetime
 import cgi

 from BeautifulSoup import BeautifulSoup
--- a/python_scrapers/Kirklees.py
+++ b/python_scrapers/Kirklees.py
@@ -0,0 +1,85 @@
 import urllib2
 import urllib
 import urlparse

 import datetime, time
 import cgi

 import BeautifulSoup

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 # strftime pattern for the date placed in the search URL's query string.
 # "%%" is strftime's escape for a literal "%", so the formatted value looks
 # like "01%2F10%2F2008" -- i.e. dd/mm/yyyy with the slashes URL-encoded.
 search_date_format = "%d%%2F%m%%2F%Y"
 # strptime pattern used to parse the date text read from the fourth cell of
 # each result row: day, abbreviated month name, four-digit year
 # (e.g. "01 Oct 2008").
 received_date_format = "%d %b %Y"

 class KirkleesParser:
    """Scraper for the Kirklees Council planning application search.

    Result pages are fetched with urllib2, parsed with BeautifulSoup and
    turned into PlanningApplication objects which are collected in a
    PlanningAuthorityResults instance.
    """

    def __init__(self, *args):
        """Set up authority details and an empty results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Kirklees Council"
        self.authority_short_name = "Kirklees"
        # Search URL template; "date" is used for both ends of the date
        # range and "pagenum" selects the page of results.
        self.base_url = "http://www.kirklees.gov.uk/business/planning/List.asp?SrchApp=&SrchName=&SrchPostCode=&SrchStreet=&SrchDetails=&SrchLocality=&RorD=A&SrchDteFr=%(date)s&SrchDteTo=%(date)s&Submit=Search&pageNum=%(pagenum)d"
        self.comments_email_address = "planning.contactcentre@kirklees.gov.uk"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect the applications listed for the given date.

        Pages are requested one at a time, starting at page 1, until either
        a row dated earlier than the search date is seen or a page yields no
        result rows.

        day, month and year are integers. Returns the instance's
        PlanningAuthorityResults object; it is created once in __init__, so
        repeated calls on the same parser keep adding to it.
        """
        search_date = datetime.date(year, month, day)
        # The formatted date is the same for every page, so build it once.
        formatted_date = search_date.strftime(search_date_format)

        pagenum = 1

        while pagenum:
            response = urllib2.urlopen(self.base_url % {"date": formatted_date,
                                                        "pagenum": pagenum})
            try:
                soup = BeautifulSoup.BeautifulSoup(response.read())
            finally:
                # Release the connection even if reading or parsing fails.
                response.close()

            # This is not a nice way to find the results table, but there is
            # nothing better to key on in the page.
            results_table = soup.find("table", border="0", cellpadding="0", cellspacing="2", width="100%", summary="")

            if results_table is None:
                # No results table on this page: nothing (more) to read.
                break

            # There are two trs with style attributes per app. This takes
            # the first one of each pair.
            trs = results_table.findAll("tr", style=True)[::2]

            if not trs:
                # An empty page must end the paging here; otherwise the
                # for/else below would take its else branch and keep
                # requesting further pages forever.
                break

            for tr in trs:
                tds = tr.findAll("td")
                date_received = datetime.datetime.strptime(tds[3].string.strip(), received_date_format).date()

                # Stop looking through the list once a row is earlier than
                # the date searched for (presumably the listing is ordered
                # newest first -- TODO confirm against the site).
                if date_received < search_date:
                    # Breaking out means the next page is not wanted either.
                    pagenum = None
                    break

                application = PlanningApplication()
                application.date_received = date_received

                application.council_reference = tds[0].small.string.strip()

                # The second <td> contains the address, split up with
                # <br/>s; keep only the text nodes and join them with spaces.
                application.address = ' '.join([x for x in tds[1].contents if isinstance(x, BeautifulSoup.NavigableString)])
                application.postcode = getPostcodeFromText(application.address)

                application.description = tds[2].string.strip()

                # The details link is the first <a> following this row.
                application.info_url = urlparse.urljoin(self.base_url, tr.findNext("a")['href'])
                # Comments go to an email address rather than a web page.
                application.comment_url = self.comments_email_address

                self._results.addApplication(application)
            else:
                # Every row on this page was kept, so the next page may hold
                # more applications for the same date.
                pagenum += 1

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date via displayXML().

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

 if __name__ == '__main__':
    # Manual smoke run: scrape a fixed date (1 October 2008) and dump the
    # resulting XML to stdout.
    kirklees = KirkleesParser()
    xml_output = kirklees.getResults(1, 10, 2008)
    print(xml_output)

 # TODO

--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -60,3 +60,4 @@
 "Eastbourne.py", "420"
 "Gosport.py", "420"
 "WestDorset.py", "420"
 "Kirklees.py", "420"
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -266,3 +266,4 @@
 "Caerphilly County Borough Council", "Caerphilly", "http://publicaccess.caerphilly.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Gosport Borough Council", "Gosport", "", "Gosport", "GosportParser"
 "West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
 "Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"