Add mostly done shetland scraper.

16 years ago · 945133bdeb
--- a/python_scrapers/Shetland.py
+++ b/python_scrapers/Shetland.py
@@ -0,0 +1,100 @@
 import urllib2
 import urllib
 import urlparse

 import datetime, time
 import cgi

 from BeautifulSoup import BeautifulSoup

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 date_format = "%d/%m/%Y"

 class ShetlandParser:
    def __init__(self, *args):

        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self):
        # Note that we don't take the day, month and year parameters here.

        # First get the search page
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        soup = BeautifulSoup(response.read())

        # The apps are in the 5th table on the page (not a very good way to get it...)
        results_table = soup.findAll("table")[5]

        # Now we need to find the trs which contain the apps.
        # The first TR is just headers.
        # After that they alternate between containing an app and just some display graphics
        # until the third from last. After that, they contain more rubbish.

        trs = results_table.findAll("tr")[1:-2]

        for i in range(len(trs)):
            # We are only interested in the trs in even positions in the list.
            if i % 2 == 0:
                tr = trs[i]

                application = PlanningApplication()

                application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

                application.council_reference = tr.a.string

                comment_url_element = tr.find(text="comment on this planning application").parent
                application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                info_response = urllib2.urlopen(application.info_url)

                info_soup = BeautifulSoup(info_response.read())
                
                info_table = info_soup.findAll("table")[2]

                application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                # Now to get the address. This will be split across several tds.

                address_start_td = info_table.find("td", rowspan="4")

                # We need the first bit of the address from this tr
                address_bits = [address_start_td.findNext("td").string.strip()]

                # We will need the first td from the next three trs after this
                for address_tr in address_start_td.findAllNext("tr")[:3]:
                    address_line = address_tr.td.string.strip()
                    
                    if address_line:
                        address_bits.append(address_line)
                        
                address_bits.append(application.postcode)

                application.address = ', '.join(address_bits)

                self._results.addApplication(application)
                

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear().displayXML()

 if __name__ == '__main__':
    parser = ShetlandParser()
    print parser.getResults(21,5,2008)

 # TODO: Sort out pagination