
Add mostly done shetland scraper.

import/raw
duncan.parkes 16 years ago
parent
commit
c532bc1be2
1 changed file with 100 additions and 0 deletions
trunk/python_scrapers/Shetland.py +100 −0

@@ -0,0 +1,100 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"
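# e.g. time.strptime("21/05/2008", date_format)[0:6] gives (2008, 5, 21, 0, 0, 0),
# which datetime.datetime(*...) below turns into a datetime object.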

class ShetlandParser:
    def __init__(self, *args):

        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self):
        # Note that we don't take the day, month and year parameters here.

        # First get the search page
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        soup = BeautifulSoup(response.read())

        # The apps are in the table at index 5 on the page (not a very good way to get it...)
        results_table = soup.findAll("table")[5]

        # Now we need to find the trs which contain the apps.
        # The first tr is just headers.
        # After that they alternate between containing an app and just some display graphics
        # until the third from last. After that, they contain more rubbish.

        trs = results_table.findAll("tr")[1:-2]

        for i in range(len(trs)):
            # We are only interested in the trs in even positions in the list.
            if i % 2 == 0:
                tr = trs[i]

                application = PlanningApplication()

                application.council_reference = tr.a.string

                # Find the comment link first: the date received is in the
                # td that follows it.
                comment_url_element = tr.find(text="comment on this planning application").parent
                application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

                application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                info_response = urllib2.urlopen(application.info_url)

                info_soup = BeautifulSoup(info_response.read())
                info_table = info_soup.findAll("table")[2]

                application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                # Now to get the address. This will be split across several tds.

                address_start_td = info_table.find("td", rowspan="4")

                # We need the first bit of the address from this tr
                address_bits = [address_start_td.findNext("td").string.strip()]

                # We will need the first td from each of the next three trs
                for address_tr in address_start_td.findAllNext("tr")[:3]:
                    address_line = address_tr.td.string.strip()
                    if address_line:
                        address_bits.append(address_line)
                address_bits.append(application.postcode)

                application.address = ', '.join(address_bits)

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear().displayXML()

if __name__ == '__main__':
    parser = ShetlandParser()
    print parser.getResults(21,5,2008)

# TODO: Sort out pagination
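A possible shape for the pagination TODO, assuming the offset query parameter in base_url is what advances through result pages. The page size of 20 and the empty-page stopping condition are guesses, not confirmed against the live site, and result_tables is a hypothetical helper:

import urllib
import urllib2
import urlparse
import cgi

from BeautifulSoup import BeautifulSoup

def result_tables(base_url, page_size=20):
    """Yield the results table from each page until a page has no app rows.

    page_size=20 is an assumed value; check how far the site actually
    advances per page before relying on it.
    """
    offset = 0
    while True:
        # Rebuild the URL with the current offset, keeping blank params
        # (Pref=, Address=, Applicant=) that the site may expect.
        scheme, netloc, path, query, fragment = urlparse.urlsplit(base_url)
        params = cgi.parse_qs(query, keep_blank_values=True)
        params["offset"] = [str(offset)]
        page_url = urlparse.urlunsplit((scheme, netloc, path, urllib.urlencode(params, doseq=True), fragment))

        soup = BeautifulSoup(urllib2.urlopen(page_url).read())
        results_table = soup.findAll("table")[5]

        # Same slicing as above: drop the header row and the trailing rubbish.
        if not results_table.findAll("tr")[1:-2]:
            break
        yield results_table
        offset += page_size

getResultsByDayMonthYear could then run its existing per-row loop once per yielded table instead of only on the first page.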
