Kaynağa Gözat

Add Aberdeenshire parser.

import/raw
duncan.parkes 16 yıl önce
ebeveyn
işleme
782947887e
3 değiştirilmiş dosya ile 93 ekleme ve 0 silme
  1. +91
    -0
      trunk/python_scrapers/Aberdeenshire.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 91
- 0
trunk/python_scrapers/Aberdeenshire.py Dosyayı Görüntüle

@@ -0,0 +1,91 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

# Matches the text node of the "Submit Comment" link on an application's
# info page; the parser reads the href of that node's parent element.
comment_re = re.compile("Submit Comment")
# Matches the "Map Ref" label on the info page; the parser reads the
# comma-separated pair in the first <a> that follows it.
mapref_re = re.compile("Map Ref")

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class AberdeenshireParser:
    """Scraper for the Aberdeenshire Council planning application search.

    For a given day it walks every page of the search results, fetches each
    application's info page for the comment link and map reference, and
    accumulates the applications in a PlanningAuthorityResults object.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Aberdeenshire Council"
        self.authority_short_name = "Aberdeenshire"
        # The same date fills both ends of the search range. "%%2F" is a
        # URL-encoded "/" with the percent sign escaped for the % operator.
        self.base_url = "http://www.aberdeenshire.gov.uk/planning/apps/search.asp?startDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&endDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&Submit=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    @staticmethod
    def _clean_text(text):
        """Return text with non-breaking spaces made plain and both ends stripped.

        Non-breaking spaces are handled both as the raw "&nbsp;" entity
        (BeautifulSoup 3 does not decode entities unless asked to) and as
        the already-decoded U+00A0 character.
        """
        return text.replace("&nbsp;", " ").replace(u"\xa0", " ").strip()

    def _fetch_soup(self, url):
        """Download url and return it parsed, always closing the connection."""
        response = urllib2.urlopen(url)
        try:
            return BeautifulSoup.BeautifulSoup(response.read())
        finally:
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect every application listed for the given date.

        day, month and year are integers. Each results page is fetched in
        turn by following its "next" link, and each application's info page
        is fetched as well. Returns the accumulated results object.

        Raises whatever urllib2 raises on a failed download, and
        AttributeError/ValueError if an info page lacks a parsable map
        reference.
        """
        search_day = datetime.date(year, month, day)

        next_url = self.base_url % {"day": day,
                                    "month": month,
                                    "year": year,
                                    }

        while next_url:
            # Now get the search page
            soup = self._fetch_soup(next_url)

            # A page without any table simply contributes no rows.
            table = soup.table
            if table is not None:
                trs = table.findAll("tr")[1:]  # First one is just headers
            else:
                trs = []

            for tr in trs:
                application = PlanningApplication()

                application.date_received = search_day
                application.council_reference = tr.a.string
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                tds = tr.findAll("td")

                # Only plain text nodes contribute to the address. The exact
                # type comparison is kept on purpose: isinstance would also
                # admit NavigableString subclasses.
                address_parts = [self._clean_text(node)
                                 for node in tds[2].contents
                                 if type(node) == BeautifulSoup.NavigableString]
                # Drop fragments that are empty once cleaned so they do not
                # leave doubled spaces in the joined address.
                application.address = ' '.join([part for part in address_parts if part])
                application.postcode = getPostcodeFromText(application.address)
                application.description = self._clean_text(tds[4].string)

                # Get the info page in order to find the comment url
                # we could do this without a download if it wasn't for the
                # sector parameter - I wonder what that is?
                info_soup = self._fetch_soup(application.info_url)

                comment_navstring = info_soup.find(text=comment_re)
                if comment_navstring:
                    # Reuse the node already found rather than searching again.
                    application.comment_url = urlparse.urljoin(self.base_url, comment_navstring.parent['href'])
                else:
                    application.comment_url = "No Comments"

                # While we're at it, let's get the OSGB
                mapref_link = info_soup.find(text=mapref_re).findNext("a")
                application.osgb_x, application.osgb_y = [
                    coord.strip() for coord in mapref_link.string.strip().split(",")]

                self._results.addApplication(application)

            # Follow the "next" link only when it really is a link. A page
            # with no "next" text at all ends the paging instead of raising
            # and losing everything collected so far.
            next_url = None
            next_text = soup.find(text="next")
            if next_text is not None:
                next_element = next_text.parent
                if next_element is not None and next_element.name == 'a':
                    next_url = urlparse.urljoin(self.base_url, next_element['href'])

        return self._results

    def getResults(self, day, month, year):
        """Return the output of displayXML() on the results for the given date.

        day, month and year may be anything int() accepts (e.g. strings
        from a query string).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape 7 August 2008 and dump the result to stdout.
    aberdeenshire = AberdeenshireParser()
    print(aberdeenshire.getResults(7, 8, 2008))


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Dosyayı Görüntüle

@@ -43,3 +43,4 @@
"Redbridge.pl", "493"
"Redbridge.cgi", "493"
"AmberValley.py", "420"
"Aberdeenshire.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Dosyayı Görüntüle

@@ -247,3 +247,4 @@
"London Borough of Merton", "Merton", "http://planning.merton.gov.uk", "PlanningExplorer", "MertonParser"
"London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"

Yükleniyor…
İptal
Kaydet