Parcourir la source

Add scraper for Herefordshire.

Alter PlanningUtils to CDATA everything, scrapping the xmlquote function.
master
duncan.parkes il y a 16 ans
Parent
révision
d92f3bb6fd
5 fichiers modifiés avec 97 ajouts et 17 suppressions
  1. +85
    -0
      python_scrapers/Herefordshire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +3
    -3
      python_scrapers/PlanningExplorer.py
  4. +7
    -14
      python_scrapers/PlanningUtils.py
  5. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 85
- 0
python_scrapers/Herefordshire.py Voir le fichier

@@ -0,0 +1,85 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format for dates in the council's search querystring (dd/mm/yyyy).
date_format = "%d/%m/%Y"

class HerefordshireParser:
    """Scraper for planning applications from Herefordshire County Council.

    Fetches the council's GIS planning-list search page for a single day,
    then follows each result through to its info page to collect the full
    description and the comment URL.
    """

    # NOTE(review): this looks like a copy-paste from the Barnsley scraper —
    # confirm the correct comments address for Herefordshire.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"

        # %(date)s is substituted twice (startdate and enddate) so the
        # search covers exactly one day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
        # As we are going to the info page anyway, we pick up each
        # application's comment url from there rather than building it
        # from a template here.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect all applications received on the given date.

        Returns the accumulated PlanningAuthorityResults object.
        """
        search_day = datetime.date(year, month, day)

        # Fetch the search results page for this single day.
        # (Was urllib.urlopen for the info pages and urllib2 here; use
        # urllib2 consistently — it also raises on HTTP errors instead of
        # silently returning the error page.)
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were applications for this date.
            # Skip the two header rows of the results table.
            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string

                tds = tr.findAll("td")

                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)

                # The results table only carries an initial segment of the
                # description, so download the info page to get the full
                # text and the comment link.
                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup(info_response.read())

                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: coerce arguments to int and return XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HerefordshireParser()
print parser.getResults(31,8,2008)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Voir le fichier

@@ -55,3 +55,4 @@
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"
"Herefordshire.py", "420"

+ 3
- 3
python_scrapers/PlanningExplorer.py Voir le fichier

@@ -631,7 +631,7 @@ if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(3, 7, 2008)
print parser.getResults(4, 9, 2008)

# To Do



+ 7
- 14
python_scrapers/PlanningUtils.py Voir le fichier

@@ -4,13 +4,6 @@ import re

date_format = "%d/%m/%Y"


def xmlQuote(text):
    """Escape ampersands so *text* can be embedded in XML.

    The scraped page's HTML-entity decoding had reduced the replacement to
    the no-op replace('&', '&'); the intended escape is '&' -> '&amp;'.
    (xml.sax.saxutils.escape is the standard-library function for this,
    and would additionally escape '<' and '>'.)
    """
    return text.replace('&', '&amp;')

def fixNewlines(text):
    """Normalise Windows-style CRLF line endings in *text* to plain LF."""
    return "\n".join(text.split("\r\n"))
@@ -112,13 +105,13 @@ class PlanningApplication:
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

contents = [
u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
u"<address>%s</address>" %xmlQuote(self.address),
u"<postcode>%s</postcode>" %self.postcode,
u"<description>%s</description>" %xmlQuote(self.description),
u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),
u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
u"<description><![CDATA[%s]]></description>" %(self.description),
u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
]
if self.osgb_x:
contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))


+ 1
- 0
python_scrapers/SitesToGenerate.csv Voir le fichier

@@ -259,3 +259,4 @@
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"

Chargement…
Annuler
Enregistrer