浏览代码

Add scraper for Leicestershire County Council.

master
duncan.parkes 16 年前
父节点
当前提交
09ca7050c3
共有 3 个文件被更改,包括 78 次插入0 次删除
  1. +76
    -0
      python_scrapers/Leicestershire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 76
- 0
python_scrapers/Leicestershire.py 查看文件

@@ -0,0 +1,76 @@
import urllib2
import urllib
import urlparse

import datetime
import re

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

search_date_format = "%d%%2F%m%%2F%Y"

class LeicestershireParser:
def __init__(self, *args):

self.authority_name = "Leicestershire County Council"
self.authority_short_name = "Leicestershire"
self.base_url = "http://www.leics.gov.uk/index/environment/community_services_planning/planning_applications/index/environment/community_services_planning/planning_applications/eplanning_searchform/eplanning_resultpage.htm?sd=%(date)s&ed=%(date)s&kw=&map=f"
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format)})
soup = BeautifulSoup.BeautifulSoup(response.read())

if not soup.find(text=re.compile("No Results Found")):
trs = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:]

for tr in trs:
tds = tr.findAll("td")

application = PlanningApplication()

# We can fill in the date received without actually looking at the data
application.date_received = search_date

application.council_reference = tds[0].a.string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
application.address = ', '.join([x for x in tds[1].contents
if isinstance(x, BeautifulSoup.NavigableString)])
application.postcode = getPostcodeFromText(application.address)
application.description = tds[2].string.strip()

# To get the comment link we need to fetch the info page

info_response = urllib2.urlopen(application.info_url)
info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

base = info_soup.base['href']

application.comment_url = urlparse.urljoin(base,
info_soup.find("a", target="Planning Application Consultation Form")['href'])

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = LeicestershireParser()
print parser.getResults(1,9,2008)


# TODO

# I suppose we should think about pagination at some point,
# though I've not managed to find a day with more than 1 app yet...

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv 查看文件

@@ -62,3 +62,4 @@
"WestDorset.py", "420"
"Kirklees.py", "420"
"Lichfield.py", "420"
"Leicestershire.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv 查看文件

@@ -268,3 +268,4 @@
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"

正在加载...
取消
保存