Selaa lähdekoodia

Add scraper for Herefordshire.

Alter PlanningUtils to CDATA everything, scrapping the xmlquote function.
import/raw
duncan.parkes 16 vuotta sitten
vanhempi
commit
48ec82b485
5 muutettua tiedostoa jossa 97 lisäystä ja 17 poistoa
  1. +85
    -0
      trunk/python_scrapers/Herefordshire.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +3
    -3
      trunk/python_scrapers/PlanningExplorer.py
  4. +7
    -14
      trunk/python_scrapers/PlanningUtils.py
  5. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 85
- 0
trunk/python_scrapers/Herefordshire.py Näytä tiedosto

@@ -0,0 +1,85 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HerefordshireParser:
comments_email_address = "Developmentcontrol@barnsley.gov.uk"

def __init__(self, *args):

self.authority_name = "Herefordshire County Council"
self.authority_short_name = "Herefordshire"
self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
#As we are going to the info page, we may as well pick up the comment url from there.
# self.comment_url = "http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s" # This need the reference inserting

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

post_data = urllib.urlencode(
(("show", "0"),
("Go", "GO"),
)
)

# Now get the search page
response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})

soup = BeautifulSoup(response.read())

if not soup.find(text=re.compile("Sorry, no matches found")):
# There were apps for this date

trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

for tr in trs:
application = PlanningApplication()
application.date_received = search_day

application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
application.council_reference = tr.a.string
# application.comment_url = self.comment_url %(application.council_reference)

tds = tr.findAll("td")

application.address = tds[1].string
application.postcode = getPostcodeFromText(application.address)

# This just gets us an initial segment of the description.
# We are going to have to download the info page...
#application.description = tds[2].string.strip()

info_response = urllib.urlopen(application.info_url)

info_soup = BeautifulSoup(info_response.read())

application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HerefordshireParser()
print parser.getResults(31,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Näytä tiedosto

@@ -55,3 +55,4 @@
"Halton.py", "420"
"Hampshire.py", "420"
"Hastings.py", "420"
"Herefordshire.py", "420"

+ 3
- 3
trunk/python_scrapers/PlanningExplorer.py Näytä tiedosto

@@ -631,7 +631,7 @@ if __name__ == '__main__':
# NOTE - 04/11/2007 is a sunday
# I'm using it to test that the scrapers behave on days with no apps.
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(3, 7, 2008)
print parser.getResults(4, 9, 2008)

# To Do



+ 7
- 14
trunk/python_scrapers/PlanningUtils.py Näytä tiedosto

@@ -4,13 +4,6 @@ import re

date_format = "%d/%m/%Y"


def xmlQuote(text):
# Change &s to &s
# I suspect there is probably some standard python
# function I should be using for this...
return text.replace('&', '&')

def fixNewlines(text):
# This can be used to sort out windows newlines
return text.replace("\r\n","\n")
@@ -112,13 +105,13 @@ class PlanningApplication:
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

contents = [
u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
u"<address>%s</address>" %xmlQuote(self.address),
u"<postcode>%s</postcode>" %self.postcode,
u"<description>%s</description>" %xmlQuote(self.description),
u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
u"<address><![CDATA[%s]]></address>" %(self.address),
u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
u"<description><![CDATA[%s]]></description>" %(self.description),
u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
]
if self.osgb_x:
contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))


+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Näytä tiedosto

@@ -259,3 +259,4 @@
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"

Ladataan…
Peruuta
Tallenna