
Add scraper for Herefordshire.

Alter PlanningUtils to CDATA everything, scrapping the xmlQuote function.
import/raw
duncan.parkes 16 years ago
parent
commit
48ec82b485
5 changed files with 97 additions and 17 deletions
  1. +85  -0   trunk/python_scrapers/Herefordshire.py
  2. +1   -0   trunk/python_scrapers/OtherFilesToCopy.csv
  3. +3   -3   trunk/python_scrapers/PlanningExplorer.py
  4. +7   -14  trunk/python_scrapers/PlanningUtils.py
  5. +1   -0   trunk/python_scrapers/SitesToGenerate.csv

+85 -0  trunk/python_scrapers/Herefordshire.py

@@ -0,0 +1,85 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class HerefordshireParser:
    # NB: this address appears to have been carried over from the Barnsley scraper.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):

        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
        # As we are going to the info page, we may as well pick up the comment url from there.
        # self.comment_url = "http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s" # This needs the reference inserting

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # NB: post_data is built here but never actually used;
        # the search below is a plain GET request.
        post_data = urllib.urlencode(
            (("show", "0"),
             ("Go", "GO"),
             )
            )

        # Now get the search page
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})

        soup = BeautifulSoup(response.read())

        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were apps for this date

            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string
                # application.comment_url = self.comment_url % (application.council_reference)

                tds = tr.findAll("td")

                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)

                # This just gets us an initial segment of the description.
                # We are going to have to download the info page...
                #application.description = tds[2].string.strip()

                info_response = urllib.urlopen(application.info_url)

                info_soup = BeautifulSoup(info_response.read())

                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HerefordshireParser()
    print parser.getResults(31, 8, 2008)
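
One detail worth noting above: base_url uses the named placeholder %(date)s twice, so a single dictionary key fills both the startdate and enddate query parameters, restricting the search to one day. A minimal sketch of the expansion (the date value here is hypothetical):

    # Python %-formatting fills every occurrence of a named placeholder
    # from the same dict key.
    template = "planListResults.aspx?startdate=%(date)s&enddate=%(date)s"
    print template % {"date": "31/08/2008"}
    # prints: planListResults.aspx?startdate=31/08/2008&enddate=31/08/2008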


+1 -0  trunk/python_scrapers/OtherFilesToCopy.csv

@@ -55,3 +55,4 @@
 "Halton.py", "420"
 "Hampshire.py", "420"
 "Hastings.py", "420"
+"Herefordshire.py", "420"

+3 -3  trunk/python_scrapers/PlanningExplorer.py

@@ -631,7 +631,7 @@ if __name__ == '__main__':
     # NOTE - 04/11/2007 is a sunday
     # I'm using it to test that the scrapers behave on days with no apps.
-    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
+    # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
     # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
     # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
     # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
     # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
     # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
     # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
-    # parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
     # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
     # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
     # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
     # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
     # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
     # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
-    print parser.getResults(3, 7, 2008)
+    print parser.getResults(4, 9, 2008)

# To Do



+7 -14  trunk/python_scrapers/PlanningUtils.py

@@ -4,13 +4,6 @@ import re
 
 date_format = "%d/%m/%Y"
 
-
-def xmlQuote(text):
-    # Change &s to &amp;s
-    # I suspect there is probably some standard python
-    # function I should be using for this...
-    return text.replace('&', '&amp;')
-
 def fixNewlines(text):
     # This can be used to sort out windows newlines
     return text.replace("\r\n","\n")
@@ -112,13 +105,13 @@ class PlanningApplication:
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
 
         contents = [
-            u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
-            u"<address>%s</address>" %xmlQuote(self.address),
-            u"<postcode>%s</postcode>" %self.postcode,
-            u"<description>%s</description>" %xmlQuote(self.description),
-            u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
-            u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
-            u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
+            u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
+            u"<address><![CDATA[%s]]></address>" %(self.address),
+            u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
+            u"<description><![CDATA[%s]]></description>" %(self.description),
+            u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
+            u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
+            u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
         ]
         if self.osgb_x:
             contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
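
The change above drops xmlQuote, which only escaped '&' (so any '<' or '>' in an address or description would have leaked into the XML unescaped), in favour of wrapping every field in a CDATA section. A minimal sketch of the trade-off, using a hypothetical value that is not part of the commit:

    from xml.sax.saxutils import escape

    raw = "Demolition of barn & erection of <new> dwelling"

    # Entity escaping, as the old xmlQuote attempted (the stdlib escape
    # handles '&', '<' and '>' rather than just '&'):
    print u"<description>%s</description>" % escape(raw)

    # CDATA, as the commit now does: the text passes through verbatim.
    # Caveat: a CDATA section cannot itself contain the terminator "]]>".
    print u"<description><![CDATA[%s]]></description>" % raw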


+1 -0  trunk/python_scrapers/SitesToGenerate.csv

@@ -259,3 +259,4 @@
 "Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
 "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
 "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
+"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
