Parcourir la source

Add mostly-done Shetland scraper.

duncan.parkes il y a 16 ans
1 fichiers modifiés avec 100 ajouts et 0 suppressions
  1. +100

+ 100
- 0
python_scrapers/ Voir le fichier

@@ -0,0 +1,100 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \

# strptime format (day/month/year) used to parse the received date scraped
# from the results table.
date_format = "%d/%m/%Y"

class ShetlandParser:
    """Scraper for the Shetland Islands Council planning application list.

    Applications are read from the search page at ``base_url``; each one's
    detail page is then fetched for its description, postcode and address.
    Results accumulate in the ``PlanningAuthorityResults`` created in
    ``__init__``.
    """

    def __init__(self, *args):
        """Set the authority names, search URL and an empty results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        # NOTE(review): the search-page URL is empty in this listing
        # (presumably lost in extraction). It must be filled in before
        # getResultsByDayMonthYear can fetch anything.
        self.base_url = ""

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self):
        """Scrape every application on the search page and return the results.

        Despite the name, no day, month or year is taken: whatever the page
        at ``base_url`` lists is collected. One extra HTTP request is made
        per application to read its detail page.

        Returns the ``PlanningAuthorityResults`` held in ``self._results``.
        """
        # First get the search page
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        # NOTE(review): the constructor argument was truncated in the source
        # listing; parsing the response body is assumed.
        soup = BeautifulSoup(response.read())

        # The apps are in the table at index 5 on the page
        # (not a very good way to get it...)
        results_table = soup.findAll("table")[5]

        # Now we need to find the trs which contain the apps.
        # The first TR is just headers.
        # After that they alternate between containing an app and just some
        # display graphics. The trailing rows contain more rubbish, so the
        # last two are dropped along with the header.
        trs = results_table.findAll("tr")[1:-2]

        # Only the trs in even positions of the list hold an application.
        for tr in trs[::2]:
            application = self._parse_application_row(tr)

            # NOTE(review): the statement that stores the application was
            # truncated in the source listing; addApplication is assumed to
            # be the PlanningAuthorityResults method for this - confirm.
            self._results.addApplication(application)

        return self._results

    def _parse_application_row(self, tr):
        """Build a PlanningApplication from one results-table row."""
        application = PlanningApplication()

        application.council_reference = tr.a.string

        # The comment link must be located before the date is parsed: the
        # received date is read from the td that follows this link.
        comment_url_element = tr.find(text="comment on this planning application").parent
        application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

        received_text = comment_url_element.findNext("td").string.strip()
        application.date_received = datetime.datetime(*(time.strptime(received_text, date_format)[0:6]))

        application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

        info_response = urllib2.urlopen(application.info_url)

        # NOTE(review): the constructor argument was truncated in the source
        # listing; parsing the response body is assumed.
        info_soup = BeautifulSoup(info_response.read())
        info_table = info_soup.findAll("table")[2]

        application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
        application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

        application.address = self._extract_address(info_table)

        return application

    def _extract_address(self, info_table):
        """Join the address fragments spread over several tds of info_table."""
        address_start_td = info_table.find("td", rowspan="4")

        # We need the first bit of the address from this tr
        address_bits = [address_start_td.findNext("td").string.strip()]

        # We will need the first td from the next three trs after this
        for address_tr in address_start_td.findAllNext("tr")[:3]:
            # NOTE(review): the right-hand side and the body of the `if`
            # were truncated in the source listing; reading the row's first
            # td and keeping non-empty lines is assumed.
            address_line = address_tr.td.string.strip()
            if address_line:
                address_bits.append(address_line)

        return ', '.join(address_bits)

    def getResults(self, day, month, year):
        """Return the scraped results rendered by ``displayXML()``.

        ``day``, ``month`` and ``year`` are accepted but not used.
        """
        return self.getResultsByDayMonthYear().displayXML()

if __name__ == '__main__':
    # Manual run: scrape the live site and dump the rendered results.
    # The date arguments are passed through to getResults, which ignores them.
    parser = ShetlandParser()
    rendered = parser.getResults(21, 5, 2008)
    print(rendered)

# TODO: Sort out pagination
