Преглед изворни кода

Add scraper for Calderdale.

master
duncan.parkes пре 17 година
родитељ
комит
81851a73b4
3 измењених фајлова са 81 додато и 0 уклоњено
  1. +79
    -0
      python_scrapers/Calderdale.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 79
- 0
python_scrapers/Calderdale.py Прегледај датотеку

@@ -0,0 +1,79 @@
import urllib2
import urllib
import urlparse

import datetime

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for the search form's date fields: dd/mm/yyyy with the
# slashes pre-encoded as %2F ("%%" escapes a literal percent for strftime).
date_format = "%d%%2F%m%%2F%Y"

class CalderdaleParser:
    """Scraper for planning applications on the Calderdale Council website.

    An instance accumulates applications into a PlanningAuthorityResults
    object; getResultsByDayMonthYear fetches every application received on a
    given day, following "Next" pagination links until none remain.
    """

    def __init__(self, *args):
        # *args is accepted (and ignored) so the generated site scripts can
        # construct every parser with the same positional arguments.
        self.authority_name = "Calderdale Council"
        self.authority_short_name = "Calderdale"

        # Search URL; the same date fills both ends of the date range.
        # %(date)s receives an already-URL-encoded date (see date_format).
        self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"
        # Per-application details page, keyed by the council reference.
        self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch all applications received on the given date.

        Returns the accumulated PlanningAuthorityResults object (results from
        earlier calls on the same instance are retained).
        """
        search_date = datetime.date(year, month, day)

        next_page_url = self.base_url % {"date": search_date.strftime(date_format)}

        while next_page_url:
            try:
                response = urllib2.urlopen(next_page_url)
            except urllib2.HTTPError:
                # This is what seems to happen if there are no apps.
                break

            soup = BeautifulSoup(response.read())

            # Work out the next page before parsing this one.
            # (Renamed from "next" so we don't shadow the builtin.)
            next_link = soup.find(text="Next")
            if next_link:
                next_page_url = urlparse.urljoin(self.base_url, next_link.parent['href'])
            else:
                next_page_url = None

            # There is an <h3> for each app that we can use
            for h3 in soup.findAll("h3", {"class": "resultsnavbar"}):
                application = PlanningApplication()

                application.date_received = search_date
                # The <h3> text looks like "...: <reference>".
                application.council_reference = h3.string.split(": ")[1]
                application.description = h3.findNext("div").find(text="Proposal:").parent.nextSibling.strip()

                # The address may span several lines separated by \r.
                application.address = ', '.join(h3.findNext("div").find(text="Address of proposal:").parent.nextSibling.strip().split("\r"))
                application.postcode = getPostcodeFromText(application.address)

                application.comment_url = urlparse.urljoin(self.base_url, h3.findNext("div").find(text=re.compile("Comment on Application")).parent['href'])

                # The reference goes into a query string, so quote it.
                application.info_url = self.info_url % (urllib.quote(application.council_reference))

                application.osgb_x, application.osgb_y = h3.findNext("div").find(text="Grid Reference:").parent.nextSibling.strip().split()

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the applications for the given date as an XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
parser = CalderdaleParser()
print parser.getResults(1,10,2008)

# TODO

# 1) Find a better way to deal with the no apps situation.

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Прегледај датотеку

@@ -64,3 +64,4 @@
"Lichfield.py", "420" "Lichfield.py", "420"
"Leicestershire.py", "420" "Leicestershire.py", "420"
"Cairngorms.py", "420" "Cairngorms.py", "420"
"Calderdale.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Прегледај датотеку

@@ -270,3 +270,4 @@
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser" "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser" "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser" "Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
"Calderdale Council", "Calderdale", "", "Calderdale", "CalderdaleParser"

Loading…
Откажи
Сачувај