Procházet zdrojové kódy

Add scraper for Shetland Islands.

Encode all output as UTF-8.
import/raw
duncan.parkes před 16 lety
rodič
revize
c7e8c06d7b
Změněno 5 souborů: 96 přidání a 65 odebrání
  1. +1
    -1
      trunk/python_scrapers/CGITemplate
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +18
    -15
      trunk/python_scrapers/PlanningUtils.py
  4. +75
    -49
      trunk/python_scrapers/Shetland.py
  5. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 1
- 1
trunk/python_scrapers/CGITemplate Zobrazit soubor

@@ -28,4 +28,4 @@ xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml
print xml.encode("utf-8") # print the xml

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -23,3 +23,4 @@
"Ocella.py", "420"
"IsleOfWight.py", "420"
"Barnsley.py", "420"
"Shetland.py", "420"

+ 18
- 15
trunk/python_scrapers/PlanningUtils.py Zobrazit soubor

@@ -65,12 +65,13 @@ class PlanningAuthorityResults:

applications_bit = "".join([x.displayXML() for x in self.planning_applications])

return "<planning>\n" +\
"<authority_name>%s</authority_name>\n" %self.authority_name +\
"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
"<applications>\n" + applications_bit +\
"</applications>\n" +\
"</planning>\n"
return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
u"<planning>\n" +\
u"<authority_name>%s</authority_name>\n" %self.authority_name +\
u"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
u"<applications>\n" + applications_bit +\
u"</applications>\n" +\
u"</planning>\n"



@@ -104,12 +105,14 @@ class PlanningApplication:
def displayXML(self):
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
return "<application>\n" +\
"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
"<address>%s</address>\n" %xmlQuote(self.address) +\
"<postcode>%s</postcode>\n" %self.postcode +\
"<description>%s</description>\n" %xmlQuote(self.description) +\
"<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
"<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
"<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
"</application>\n"
return u"<application>\n" +\
u"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
u"<address>%s</address>\n" %xmlQuote(self.address) +\
u"<postcode>%s</postcode>\n" %self.postcode +\
u"<description>%s</description>\n" %xmlQuote(self.description) +\
u"<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
u"<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
u"<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
u"</application>\n"


+ 75
- 49
trunk/python_scrapers/Shetland.py Zobrazit soubor

@@ -1,9 +1,13 @@
"""
The Shetland Isles site shows applications from the last 14 days.
These are paginated into groups of ten.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

@@ -13,88 +17,110 @@ from PlanningUtils import PlanningApplication, \

date_format = "%d/%m/%Y"

page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")

class ShetlandParser:
def __init__(self, *args):

self.authority_name = "Shetland Islands Council"
self.authority_short_name = "Shetland Islands"
self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"
self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self):
# Note that we don't take the day, month and year parameters here.
def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.datetime(year, month, day)

offset = 0

# First get the search page
request = urllib2.Request(self.base_url)
response = urllib2.urlopen(request)
response = urllib2.urlopen(self.base_url %(offset))
contents = response.read()

soup = BeautifulSoup(response.read())
# First let's find out how many records there are (they are displayed ten per page).
match = page_count_regex.search(contents)
app_count = int(match.groups()[0])

# The apps are in the sixth table on the page, i.e. index 5 (not a very good way to get it...)
results_table = soup.findAll("table")[5]
while offset < app_count:
if offset != 0:
contents = urllib2.urlopen(self.base_url %(offset)).read()

# Now we need to find the trs which contain the apps.
# The first TR is just headers.
# After that they alternate between containing an app and just some display graphics
# until the third from last. After that, they contain more rubbish.
soup = BeautifulSoup(contents)
# The apps are in the sixth table on the page, i.e. index 5 (not a very good way to get it...)
results_table = soup.findAll("table")[5]

trs = results_table.findAll("tr")[1:-2]
# Now we need to find the trs which contain the apps.
# The first TR is just headers.
# After that they alternate between containing an app and just some display graphics
# until the third from last. After that, they contain more rubbish.

for i in range(len(trs)):
# We are only interested in the trs in even positions in the list.
if i % 2 == 0:
tr = trs[i]
trs = results_table.findAll("tr")[1:-2]

application = PlanningApplication()
for i in range(len(trs)):
# We are only interested in the trs in even positions in the list.
if i % 2 == 0:
tr = trs[i]

application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
application = PlanningApplication()

application.council_reference = tr.a.string
comment_url_element = tr.find(text="comment on this planning application").parent
application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

comment_url_element = tr.find(text="comment on this planning application").parent
application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
# If the date of this application is earlier than the date
# we are searching for then don't download it.
# We could optimize this a bit more by not doing the later pages.

application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
if application.date_received < search_date:
break

info_response = urllib2.urlopen(application.info_url)
application.council_reference = tr.a.string

info_soup = BeautifulSoup(info_response.read())
info_table = info_soup.findAll("table")[2]
application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

# Now to get the address. This will be split across several tds.
info_response = urllib2.urlopen(application.info_url)

address_start_td = info_table.find("td", rowspan="4")
info_soup = BeautifulSoup(info_response.read())

# We need the first bit of the address from this tr
address_bits = [address_start_td.findNext("td").string.strip()]
info_table = info_soup.findAll("table")[2]

# We will need the first td from the next three trs after this
for address_tr in address_start_td.findAllNext("tr")[:3]:
address_line = address_tr.td.string.strip()
if address_line:
address_bits.append(address_line)
address_bits.append(application.postcode)
application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

# Now to get the address. This will be split across several tds.

address_start_td = info_table.find("td", rowspan="4")

application.address = ', '.join(address_bits)
# We need the first bit of the address from this tr
address_bits = [address_start_td.findNext("td").string.strip()]

self._results.addApplication(application)
# We will need the first td from the next three trs after this
for address_tr in address_start_td.findAllNext("tr")[:3]:
address_line = address_tr.td.string.strip()

if address_line:
address_bits.append(address_line)

address_bits.append(application.postcode)

application.address = ', '.join(address_bits)

self._results.addApplication(application)
offset += 10

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear().displayXML()
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = ShetlandParser()
print parser.getResults(21,5,2008)

# TODO: Sort out pagination
# Note: to test this, you will need to pick a current date.
print parser.getResults(9,6,2008)


+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -221,3 +221,4 @@
"Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
"Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser"
"Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser"

Načítá se…
Zrušit
Uložit