Parcourir la source

Add scraper for Leicestershire County Council.

master
duncan.parkes il y a 16 ans
Parent
révision
09ca7050c3
3 fichiers modifiés avec 78 ajouts et 0 suppressions
  1. +76
    -0
      python_scrapers/Leicestershire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 76
- 0
python_scrapers/Leicestershire.py Voir le fichier

@@ -0,0 +1,76 @@
import urllib2
import urllib
import urlparse

import datetime
import re

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

search_date_format = "%d%%2F%m%%2F%Y"

class LeicestershireParser:
def __init__(self, *args):

self.authority_name = "Leicestershire County Council"
self.authority_short_name = "Leicestershire"
self.base_url = "http://www.leics.gov.uk/index/environment/community_services_planning/planning_applications/index/environment/community_services_planning/planning_applications/eplanning_searchform/eplanning_resultpage.htm?sd=%(date)s&ed=%(date)s&kw=&map=f"
self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

response = urllib2.urlopen(self.base_url %{"date": search_date.strftime(search_date_format)})
soup = BeautifulSoup.BeautifulSoup(response.read())

if not soup.find(text=re.compile("No Results Found")):
trs = soup.findAll("table", {"class": "dataTable"})[1].findAll("tr")[1:]

for tr in trs:
tds = tr.findAll("td")

application = PlanningApplication()

# We can fill in the date received without actually looking at the data
application.date_received = search_date

application.council_reference = tds[0].a.string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
application.address = ', '.join([x for x in tds[1].contents
if isinstance(x, BeautifulSoup.NavigableString)])
application.postcode = getPostcodeFromText(application.address)
application.description = tds[2].string.strip()

# To get the comment link we need to fetch the info page

info_response = urllib2.urlopen(application.info_url)
info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

base = info_soup.base['href']

application.comment_url = urlparse.urljoin(base,
info_soup.find("a", target="Planning Application Consultation Form")['href'])

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = LeicestershireParser()
print parser.getResults(1,9,2008)


# TODO

# I suppose we should think about pagination at some point,
# though I've not managed to find a day with more than 1 app yet...

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Voir le fichier

@@ -62,3 +62,4 @@
"WestDorset.py", "420"
"Kirklees.py", "420"
"Lichfield.py", "420"
"Leicestershire.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Voir le fichier

@@ -268,3 +268,4 @@
"West Dorset District Council", "West Dorset", "", "WestDorset", "WestDorsetParser"
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"

Chargement…
Annuler
Enregistrer