| @@ -0,0 +1,100 @@ | |||||
| import urllib2 | |||||
| import urllib | |||||
| import urlparse | |||||
| import datetime, time | |||||
| import cgi | |||||
| from BeautifulSoup import BeautifulSoup | |||||
| from PlanningUtils import PlanningApplication, \ | |||||
| PlanningAuthorityResults, \ | |||||
| getPostcodeFromText | |||||
| date_format = "%d/%m/%Y" | |||||
class ShetlandParser:
    """Scraper for planning applications from Shetland Islands Council.

    The council's search page always returns the last 14 days of
    applications, so the date arguments accepted by getResults() are
    ignored.
    """

    def __init__(self, *args):
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        # Search URL listing the most recent applications (time=14 days).
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self):
        """Fetch and parse the current list of planning applications.

        Returns the populated PlanningAuthorityResults object.
        Note that we don't take day, month and year parameters here:
        the search page always covers the last 14 days.
        """
        # First get the search page.
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)
        soup = BeautifulSoup(response.read())

        # The apps are in the 6th table on the page (index 5).
        # Not a very robust way to find it, but the page has no ids.
        results_table = soup.findAll("table")[5]

        # The first TR is just headers. After that, rows containing an
        # app alternate with display-only rows, until the third from
        # last; the final rows contain more rubbish. Slicing [1:-2:2]
        # therefore picks out exactly the application rows.
        for tr in results_table.findAll("tr")[1:-2:2]:
            application = PlanningApplication()

            application.council_reference = tr.a.string

            # BUGFIX: locate comment_url_element *before* using it to
            # read the received date. The original code dereferenced
            # comment_url_element two lines before assigning it, which
            # raised UnboundLocalError on the first iteration.
            comment_url_element = tr.find(text="comment on this planning application").parent
            application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

            # The received date is in the td following the comment link.
            application.date_received = datetime.datetime(
                *(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

            # Fetch the detail page for description, postcode and address.
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())
            info_table = info_soup.findAll("table")[2]

            application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
            application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

            # Now to get the address. It is split across several tds:
            # the first bit is in the td after the rowspan="4" cell,
            # then the first td of each of the next three trs (blank
            # lines are skipped), finishing with the postcode.
            address_start_td = info_table.find("td", rowspan="4")
            address_bits = [address_start_td.findNext("td").string.strip()]
            for address_tr in address_start_td.findAllNext("tr")[:3]:
                address_line = address_tr.td.string.strip()
                if address_line:
                    address_bits.append(address_line)
            address_bits.append(application.postcode)

            application.address = ', '.join(address_bits)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the scraped applications rendered as XML.

        The day/month/year arguments are accepted for interface
        compatibility with the other scrapers but are ignored here.
        """
        return self.getResultsByDayMonthYear().displayXML()
| if __name__ == '__main__': | |||||
| parser = ShetlandParser() | |||||
| print parser.getResults(21,5,2008) | |||||
| # TODO: Sort out pagination | |||||