Browse Source

Add scraper for Calderdale.

master
duncan.parkes 16 years ago
parent
commit
81851a73b4
3 changed files with 81 additions and 0 deletions
  1. +79
    -0
      python_scrapers/Calderdale.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 79
- 0
python_scrapers/Calderdale.py View File

@@ -0,0 +1,79 @@
import urllib2
import urllib
import urlparse

import datetime

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"

class CalderdaleParser:
def __init__(self, *args):
self.authority_name = "Calderdale Council"
self.authority_short_name = "Calderdale"
self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"
self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

next_page_url = self.base_url %{"date": search_date.strftime(date_format)}

while next_page_url:
try:
response = urllib2.urlopen(next_page_url)
except urllib2.HTTPError:
# This is what seems to happen if there are no apps
break

soup = BeautifulSoup(response.read())

next = soup.find(text="Next")
if next:
next_page_url = urlparse.urljoin(self.base_url, next.parent['href'])
else:
next_page_url = None

# There is an <h3> for each app that we can use
for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
application = PlanningApplication()

application.date_received = search_date
application.council_reference = h3.string.split(": ")[1]
application.description = h3.findNext("div").find(text="Proposal:").parent.nextSibling.strip()

application.address = ', '.join(h3.findNext("div").find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
application.postcode = getPostcodeFromText(application.address)

application.comment_url = urlparse.urljoin(self.base_url, h3.findNext("div").find(text=re.compile("Comment on Application")).parent['href'])

application.info_url = self.info_url %(urllib.quote(application.council_reference))

application.osgb_x, application.osgb_y = h3.findNext("div").find(text="Grid Reference:").parent.nextSibling.strip().split()

self._results.addApplication(application)

return self._results


def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
parser = CalderdaleParser()
print parser.getResults(1,10,2008)

# TODO

# 1) Find a better way to deal with the no apps situation.

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv View File

@@ -64,3 +64,4 @@
"Lichfield.py", "420" "Lichfield.py", "420"
"Leicestershire.py", "420" "Leicestershire.py", "420"
"Cairngorms.py", "420" "Cairngorms.py", "420"
"Calderdale.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv View File

@@ -270,3 +270,4 @@
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser" "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser" "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser" "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"

Loading…
Cancel
Save