Parcourir la source

Add Forest of Dean and Fife parsers.

master
duncan.parkes il y a 18 ans
Parent
révision
66bb2955a8
4 fichiers modifiés avec 167 ajouts et 1 suppressions
  1. +88
    -0
      python_scrapers/Fife.py
  2. +75
    -0
      python_scrapers/ForestOfDean.py
  3. +2
    -0
      python_scrapers/OtherFilesToCopy.csv
  4. +2
    -1
      python_scrapers/SitesToGenerate.csv

+ 88
- 0
python_scrapers/Fife.py Voir le fichier

@@ -0,0 +1,88 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class FifeParser:
    """Scraper for planning applications published by Fife Council."""

    def __init__(self, *args):
        self.authority_name = "Fife Council"
        self.authority_short_name = "Fife"
        self.base_url = "http://www.fifedirect.org.uk/topics/index.cfm"

        # Commenting happens on the ukplanning.com site; %s is filled in
        # with the application's council reference.
        self.comment_url = "http://www.ukplanning.com/ukp/showCaseFile.do?councilName=Fife+Council&appNumber=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every application received on the given date and return
        the accumulated PlanningAuthorityResults object."""
        received = datetime.date(year, month, day)

        query = urllib.urlencode([
            ("fuseaction", "planapps.list"),
            ("SUBJECTID", "104CC166-3ED1-4D22-B9F1E2FB8438478A"),
            ("src_fromdayRec", day),
            ("src_frommonthRec", month),
            ("src_fromyearRec", year),
            ("src_todayRec", day),
            ("src_tomonthRec", month),
            ("src_toyearRec", year),
            ("findroadworks", "GO"),
        ])

        page = urllib2.urlopen(self.base_url + "?" + query)
        soup = BeautifulSoup(page.read())

        results_table = soup.find("table", id="results")

        # Skip the header row.  After that, each application occupies two
        # consecutive rows: a summary row, then a details row.
        rows = results_table.findAll("tr")[1:]

        for summary_row, detail_row in zip(rows[0::2], rows[1::2]):
            application = PlanningApplication()
            application.date_received = received

            cells = summary_row.findAll("td")

            application.council_reference = cells[0].a.string.strip()
            application.comment_url = self.comment_url % (application.council_reference)
            application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
            application.address = ', '.join([x.strip() for x in cells[1].findAll(text=True)])
            application.postcode = getPostcodeFromText(application.address)

            # Drop the leading "Details: " from the description cell.
            application.description = detail_row.td.string.strip()[9:]

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """String-argument entry point: return the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: fetch one known day and dump the results XML.
    print(FifeParser().getResults(21, 5, 2008))

# TODO

# Paginates at 50. Unlikely on a single day, so we'll worry about it later.

+ 75
- 0
python_scrapers/ForestOfDean.py Voir le fichier

@@ -0,0 +1,75 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%Y%m%d"

class ForestOfDeanParser:
    """Scraper for planning applications published by Forest of Dean
    District Council."""

    def __init__(self, *args):
        self.authority_name = "Forest of Dean District Council"
        self.authority_short_name = "Forest of Dean"
        self.base_url = "http://www.fdean.gov.uk/content.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every application received on the given date and return
        the accumulated PlanningAuthorityResults object."""
        received = datetime.date(year, month, day)
        date_param = received.strftime(date_format)

        query = urllib.urlencode([
            ("parent_directory_id", "200"),
            ("nav", "679"),
            ("id", "13266"),
            ("RecStart", "1"),
            ("RecCount", "100"),
            ("SDate", date_param),
            ("EDate", date_param),
        ])

        page = urllib2.urlopen(self.base_url + "?" + query)
        soup = BeautifulSoup(page.read())

        results_table = soup.find("table", summary="List of planning applications that match your query")

        # The first row holds the column headers; one application per row after that.
        for row in results_table.findAll("tr")[1:]:
            cells = row.findAll("td")

            application = PlanningApplication()
            application.date_received = received
            application.council_reference = cells[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, cells[0].a['href'])
            # No separate comment page on this site; point at the info page.
            application.comment_url = application.info_url

            # Collapse internal runs of whitespace in the address to single spaces.
            application.address = ' '.join(cells[1].string.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = cells[2].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """String-argument entry point: return the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: fetch one known day and dump the results XML.
    print(ForestOfDeanParser().getResults(21, 5, 2008))

# TODO - looks like it paginates at 20

+ 2
- 0
python_scrapers/OtherFilesToCopy.csv Voir le fichier

@@ -25,3 +25,5 @@
"Barnsley.py", "420" "Barnsley.py", "420"
"Shetland.py", "420" "Shetland.py", "420"
"Kensington.py", "420" "Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"

+ 2
- 1
python_scrapers/SitesToGenerate.csv Voir le fichier

@@ -225,4 +225,5 @@
"The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser" "The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser"
"Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"

"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"

Chargement…
Annuler
Enregistrer