Преглед на файлове

Add scraper for Herefordshire.

Alter PlanningUtils to wrap all output in CDATA sections, scrapping the xmlQuote function.
master
duncan.parkes преди 16 години
родител
ревизия
d92f3bb6fd
променени са 5 файла, в които са добавени 97 реда и са изтрити 17 реда
  1. +85
    -0
      python_scrapers/Herefordshire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +3
    -3
      python_scrapers/PlanningExplorer.py
  4. +7
    -14
      python_scrapers/PlanningUtils.py
  5. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 85
- 0
python_scrapers/Herefordshire.py Целия файл

@@ -0,0 +1,85 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HerefordshireParser:
    """Scraper for planning applications published on the Herefordshire
    Council GIS site.

    One instance accumulates results in a PlanningAuthorityResults object;
    getResults(day, month, year) returns them serialized as XML.
    """

    # TODO(review): this looks copy-pasted from the Barnsley scraper -
    # confirm the correct comments address for Herefordshire.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        # *args is accepted (and ignored) so that generated site code can
        # construct every parser with a uniform signature.
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"

        # The search URL takes the same date as both startdate and enddate,
        # so a single GET returns all applications received on that day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"

        # As we are going to the info page anyway, we pick up the comment
        # url from there instead of templating it from the reference.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every application received on the given date and return
        the accumulated PlanningAuthorityResults.

        Downloads the search results page for the date; for each row it
        also downloads the per-application info page to get the full
        description and the comment url.
        """
        search_day = datetime.date(year, month, day)

        # All parameters go in the query string, so a plain GET suffices
        # (the previous version built urlencoded POST data it never sent).
        response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})

        soup = BeautifulSoup(response.read())

        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were apps for this date.

            # The first two rows of the results table are headers.
            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day

                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string

                tds = tr.findAll("td")

                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)

                # The results table only carries an initial segment of the
                # description, so we have to download the info page for the
                # full text (and grab the comment url while we are there).
                info_response = urllib2.urlopen(application.info_url)

                info_soup = BeautifulSoup(info_response.read())

                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the generated CGI wrapper: return the day's
        applications serialized as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HerefordshireParser()
print parser.getResults(31,8,2008)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Целия файл

@@ -55,3 +55,4 @@
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"
"Herefordshire.py", "420"

+ 3
- 3
python_scrapers/PlanningExplorer.py Целия файл

@@ -631,7 +631,7 @@ if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(3, 7, 2008)
print parser.getResults(4, 9, 2008)

# To Do



+ 7
- 14
python_scrapers/PlanningUtils.py Целия файл

@@ -4,13 +4,6 @@ import re

date_format = "%d/%m/%Y"


def xmlQuote(text):
# Change &s to &s
# I suspect there is probably some standard python
# function I should be using for this...
return text.replace('&', '&')

def fixNewlines(text):
    """Normalise Windows-style CRLF line endings to plain LF."""
    return "\n".join(text.split("\r\n"))
@@ -112,13 +105,13 @@ class PlanningApplication:
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

contents = [
u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
u"<address>%s</address>" %xmlQuote(self.address),
u"<postcode>%s</postcode>" %self.postcode,
u"<description>%s</description>" %xmlQuote(self.description),
u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),
u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
u"<description><![CDATA[%s]]></description>" %(self.description),
u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
]
if self.osgb_x:
contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))


+ 1
- 0
python_scrapers/SitesToGenerate.csv Целия файл

@@ -259,3 +259,4 @@
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"

Зареждане…
Отказ
Запис