Переглянути джерело

Add Forest of Dean and Fife parsers.

import/raw
duncan.parkes 16 роки тому
джерело
коміт
821296b942
4 змінених файлів з 167 додано та 1 видалено
  1. +88
    -0
      trunk/python_scrapers/Fife.py
  2. +75
    -0
      trunk/python_scrapers/ForestOfDean.py
  3. +2
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  4. +2
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 88
- 0
trunk/python_scrapers/Fife.py Переглянути файл

@@ -0,0 +1,88 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class FifeParser:
    """Scraper for the planning application search on the Fife Council site.

    Parsed applications are added to ``self._results``, which is created
    once in ``__init__``; repeated searches on the same instance therefore
    accumulate into the same result set.
    """

    # Text that prefixes the description cell of each application's second row.
    _details_prefix = "Details: "

    def __init__(self, *args):
        """Set up authority names, URLs and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Fife Council"
        self.authority_short_name = "Fife"
        self.base_url = "http://www.fifedirect.org.uk/topics/index.cfm"

        # Template taking the council reference of the application.
        self.comment_url = "http://www.ukplanning.com/ukp/showCaseFile.do?councilName=Fife+Council&appNumber=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications listed for a single day.

        ``day``, ``month`` and ``year`` are integers; they are used both as
        the start and the end of the searched date range.

        Returns ``self._results`` with one PlanningApplication added per
        complete pair of result rows. If the page contains no results
        table, nothing is added.
        """
        search_date = datetime.date(year, month, day)

        search_data = urllib.urlencode(
            [("fuseaction", "planapps.list"),
             ("SUBJECTID", "104CC166-3ED1-4D22-B9F1E2FB8438478A"),
             ("src_fromdayRec", day),
             ("src_frommonthRec", month),
             ("src_fromyearRec", year),
             ("src_todayRec", day),
             ("src_tomonthRec", month),
             ("src_toyearRec", year),
             ("findroadworks", "GO"),
             ]
            )
        search_url = self.base_url + "?" + search_data

        # Always release the connection, even if reading or parsing fails.
        response = urllib2.urlopen(search_url)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            response.close()

        results_table = soup.find("table", id="results")
        if results_table is None:
            # No results table on the page: there is nothing to parse.
            return self._results

        # Apart from the first tr, which contains headers, the trs come in
        # pairs for each application: a summary row, then a details row.
        trs = results_table.findAll("tr")[1:]

        for summary_tr, details_tr in zip(trs[0::2], trs[1::2]):
            application = PlanningApplication()
            application.date_received = search_date
            tds = summary_tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.comment_url = self.comment_url % (application.council_reference)

            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.address = ', '.join([x.strip() for x in tds[1].findAll(text=True)])
            application.postcode = getPostcodeFromText(application.address)

            # Get rid of the "Details: " at the beginning, but only when it
            # is really there, so a description is never truncated by mistake.
            description = details_tr.td.string.strip()
            if description.startswith(self._details_prefix):
                description = description[len(self._details_prefix):]
            application.description = description

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's results as the output of ``displayXML()``.

        ``day``, ``month`` and ``year`` may be anything ``int()`` accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check: scrape a fixed date and print the resulting XML.
    fife_parser = FifeParser()
    xml_output = fife_parser.getResults(21, 5, 2008)
    print(xml_output)

# TODO: The search results paginate at 50. That many applications on a
# single day is unlikely, so only the first page is read for now.

+ 75
- 0
trunk/python_scrapers/ForestOfDean.py Переглянути файл

@@ -0,0 +1,75 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%Y%m%d"

class ForestOfDeanParser:
    """Scraper for the Forest of Dean District Council planning search.

    Parsed applications are added to ``self._results``, which is created
    once in ``__init__``; repeated searches on the same instance therefore
    accumulate into the same result set.
    """

    def __init__(self, *args):
        """Set up authority names, the search URL and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Forest of Dean District Council"
        self.authority_short_name = "Forest of Dean"
        self.base_url = "http://www.fdean.gov.uk/content.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications listed for a single day.

        ``day``, ``month`` and ``year`` are integers; the resulting date is
        sent as both the start and the end date of the search.

        Returns ``self._results`` with one PlanningApplication added per
        row of the results table after the first (header) row. If the page
        contains no results table, nothing is added.
        """
        search_date = datetime.date(year, month, day)
        formatted_date = search_date.strftime(date_format)

        search_data = urllib.urlencode(
            [
                ("parent_directory_id", "200"),
                ("nav", "679"),
                ("id", "13266"),
                ("RecStart", "1"),
                ("RecCount", "100"),
                ("SDate", formatted_date),
                ("EDate", formatted_date),
                ]
            )

        search_url = self.base_url + "?" + search_data

        # Always release the connection, even if reading or parsing fails.
        response = urllib2.urlopen(search_url)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            response.close()

        results_table = soup.find("table", summary="List of planning applications that match your query")
        if results_table is None:
            # No results table on the page: there is nothing to parse.
            return self._results

        # Skip the first row, then read one application per row.
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()
            application.date_received = search_date
            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            # The info page is also used as the comment URL.
            application.comment_url = application.info_url

            # Collapse runs of whitespace in the address to single spaces.
            application.address = ' '.join(tds[1].string.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[2].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the day's results as the output of ``displayXML()``.

        ``day``, ``month`` and ``year`` may be anything ``int()`` accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check: scrape a fixed date and print the resulting XML.
    forest_parser = ForestOfDeanParser()
    xml_output = forest_parser.getResults(21, 5, 2008)
    print(xml_output)

# TODO: The site appears to paginate at 20 results; only the first page is parsed.

+ 2
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Переглянути файл

@@ -25,3 +25,5 @@
"Barnsley.py", "420"
"Shetland.py", "420"
"Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"

+ 2
- 1
trunk/python_scrapers/SitesToGenerate.csv Переглянути файл

@@ -225,4 +225,5 @@
"The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser"
"Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"

"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"

Завантаження…
Відмінити
Зберегти