Browse Source

Add Forest of Dean and Fife parsers.

master
duncan.parkes 16 years ago
parent
commit
66bb2955a8
4 changed files with 167 additions and 1 deletions
  1. +88
    -0
      python_scrapers/Fife.py
  2. +75
    -0
      python_scrapers/ForestOfDean.py
  3. +2
    -0
      python_scrapers/OtherFilesToCopy.csv
  4. +2
    -1
      python_scrapers/SitesToGenerate.csv

+ 88
- 0
python_scrapers/Fife.py View File

@@ -0,0 +1,88 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class FifeParser:
    """Scraper for the Fife Council planning application search.

    Requests the search results page for a single day and converts each
    listed application into a PlanningApplication, which is collected in
    the PlanningAuthorityResults object held by this parser.
    """

    def __init__(self, *args):
        """Set the authority names and URLs and create the results holder.

        Positional arguments are accepted but not used.
        """
        self.authority_name = "Fife Council"
        self.authority_short_name = "Fife"
        self.base_url = "http://www.fifedirect.org.uk/topics/index.cfm"

        # Template for the comment page; %s is the council reference.
        self.comment_url = "http://www.ukplanning.com/ukp/showCaseFile.do?councilName=Fife+Council&appNumber=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications listed for one day.

        day, month and year are integers; the same date is sent as both
        the start and the end of the search range.

        Returns the PlanningAuthorityResults object held by this parser,
        with one application added per result found. If the page has no
        table with id "results", nothing is added.

        Raises ValueError (from datetime.date) for an invalid date, and
        whatever urllib2.urlopen raises if the request fails.
        """
        search_date = datetime.date(year, month, day)

        search_data = urllib.urlencode(
            [("fuseaction", "planapps.list"),
             ("SUBJECTID", "104CC166-3ED1-4D22-B9F1E2FB8438478A"),
             ("src_fromdayRec", day),
             ("src_frommonthRec", month),
             ("src_fromyearRec", year),
             ("src_todayRec", day),
             ("src_tomonthRec", month),
             ("src_toyearRec", year),
             ("findroadworks", "GO"),
             ]
            )
        search_url = self.base_url + "?" + search_data

        # Always release the connection, even if reading or parsing fails.
        response = urllib2.urlopen(search_url)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            response.close()

        results_table = soup.find("table", id="results")
        if results_table is None:
            # No results table on the page: nothing to add, and calling
            # findAll on None would raise AttributeError.
            return self._results

        # Apart from the first tr, which contains headers, the trs come
        # in pairs for each application: a summary row, then a details row.
        trs = results_table.findAll("tr")[1:]

        # zip drops a trailing summary row that has no details row; such
        # an application was never added to the results anyway.
        for summary_tr, details_tr in zip(trs[0::2], trs[1::2]):
            application = PlanningApplication()
            application.date_received = search_date
            tds = summary_tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.comment_url = self.comment_url % (application.council_reference)

            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            application.address = ', '.join([x.strip() for x in tds[1].findAll(text=True)])
            application.postcode = getPostcodeFromText(application.address)

            # Get rid of the "Details: " at the beginning.
            application.description = details_tr.td.string.strip()[9:]

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given date.

        day, month and year are converted with int() before use.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check: print what getResults returns for 21 May 2008.
    fife_parser = FifeParser()
    print(fife_parser.getResults(21, 5, 2008))

# TODO: The results page paginates at 50 entries. That many applications
# are unlikely on a single day, so pagination is not handled yet.

+ 75
- 0
python_scrapers/ForestOfDean.py View File

@@ -0,0 +1,75 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime pattern (YYYYMMDD) used for the SDate and EDate search parameters.
date_format = "%Y%m%d"

class ForestOfDeanParser:
    """Scraper for the Forest of Dean District Council planning search.

    Requests the search results page for a single day and converts each
    row of the results table into a PlanningApplication, which is
    collected in the PlanningAuthorityResults object held by this parser.
    """

    def __init__(self, *args):
        """Set the authority names and base URL and create the results holder.

        Positional arguments are accepted but not used.
        """
        self.authority_name = "Forest of Dean District Council"
        self.authority_short_name = "Forest of Dean"
        self.base_url = "http://www.fdean.gov.uk/content.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications listed for one day.

        day, month and year are integers; the same date is sent as both
        SDate and EDate.

        Returns the PlanningAuthorityResults object held by this parser,
        with one application added per table row after the header row.
        If the page has no matching results table, nothing is added.

        Raises ValueError (from datetime.date) for an invalid date, and
        whatever urllib2.urlopen raises if the request fails.
        """
        search_date = datetime.date(year, month, day)
        formatted_date = search_date.strftime(date_format)

        search_data = urllib.urlencode(
            [
                ("parent_directory_id", "200"),
                ("nav", "679"),
                ("id", "13266"),
                ("RecStart", "1"),
                ("RecCount", "100"),
                ("SDate", formatted_date),
                ("EDate", formatted_date),
                ]
            )

        search_url = self.base_url + "?" + search_data

        # Always release the connection, even if reading or parsing fails.
        response = urllib2.urlopen(search_url)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            response.close()

        results_table = soup.find("table", summary="List of planning applications that match your query")
        if results_table is None:
            # No results table on the page: nothing to add, and calling
            # findAll on None would raise AttributeError.
            return self._results

        # The first tr is skipped; each remaining tr is one application.
        for tr in results_table.findAll("tr")[1:]:
            application = PlanningApplication()
            application.date_received = search_date
            tds = tr.findAll("td")

            application.council_reference = tds[0].a.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            # The info page URL is reused as the comment URL.
            application.comment_url = application.info_url

            # Collapse every run of whitespace in the address to one space.
            application.address = ' '.join(tds[1].string.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = tds[2].string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given date.

        day, month and year are converted with int() before use.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual check: print what getResults returns for 21 May 2008.
    forest_parser = ForestOfDeanParser()
    print(forest_parser.getResults(21, 5, 2008))

# TODO - looks like it paginates at 20

+ 2
- 0
python_scrapers/OtherFilesToCopy.csv View File

@@ -25,3 +25,5 @@
"Barnsley.py", "420" "Barnsley.py", "420"
"Shetland.py", "420" "Shetland.py", "420"
"Kensington.py", "420" "Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"

+ 2
- 1
python_scrapers/SitesToGenerate.csv View File

@@ -225,4 +225,5 @@
"The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser" "The Royal Borough of Kensington and Chelsea", "Kensington and Chelsea", "", "Kensington", "KensingtonParser"
"Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Comhairle Nan Eilean Siar", "Comhairle Nan Eilean Siar", "http://planning.cne-siar.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"

"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"

Loading…
Cancel
Save