Explorar el Código

Add scraper for Flintshire. I've included the OSGB x,y coordinates in the hope that we can start using these where we don't have postcodes.
import/raw
duncan.parkes hace 16 años
padre
commit
b5dcd82e60
Se han modificado 4 ficheros con 115 adiciones y 10 borrados
  1. +93
    -0
      trunk/python_scrapers/Flintshire.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +20
    -10
      trunk/python_scrapers/PlanningUtils.py
  4. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 93
- 0
trunk/python_scrapers/Flintshire.py Ver fichero

@@ -0,0 +1,93 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class FlintshireParser:
    """Scraper for planning applications listed by Flintshire County Council.

    The council's search page is queried for a two-day date window and each
    result table is turned into a PlanningApplication. The per-application
    info page is fetched as well, to pick up the OSGB grid reference and the
    "Date Valid" field.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the
        results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # Some extra query variables have been removed from this URL; the
        # site seems to be happy without them, and the results no longer
        # need to be paginated.
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetchSoup(self, request):
        """Fetch a URL (or urllib2.Request) and return it parsed as a BeautifulSoup.

        The HTTP response is always closed, even if reading it fails.
        """
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            response.close()
        return BeautifulSoup(page)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications for the given date.

        day, month and year are integers. Every application found is added
        to the results object created in __init__, which is then returned.
        """
        search_date = datetime.date(year, month, day)

        # The start date is set one day earlier in order to catch the first
        # result on every day at some point - see TODO list.
        start_date = search_date - datetime.timedelta(1)
        search_url = self.base_url % {
            "start_date": start_date.strftime(date_format),
            "end_date": search_date.strftime(date_format),
        }
        soup = self._fetchSoup(search_url)

        # Each application is stored in its own table.
        result_tables = soup.findAll("table", border="1")

        # Loop-invariant patterns, compiled once.
        reference_re = re.compile("Reference")
        description_re = re.compile("Description of Proposal")

        # For the moment, the first result has to be ignored (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear why this isn't simply the string of the next
            # sibling. Taking the first child of the sibling works though.
            application.council_reference = table.find(text=reference_re).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Go to the info page to get the OSGB coordinates and the date
            # received.
            info_request = urllib2.Request(application.info_url)

            # The language header is needed in order to get UK style dates.
            info_request.add_header("Accept-Language", "en-gb,en")
            info_soup = self._fetchSoup(info_request)

            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font
            application.osgb_x = x_element.string.strip()
            # The y value is two siblings along from the x element.
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()

            # PlanningApplication documents date_received as a datetime.date,
            # so only the year, month and day fields are used.
            application.date_received = datetime.date(*(time.strptime(date_string, date_format)[0:3]))

            application.description = table.find(text=description_re).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, but it could not
            # be followed, so the info page itself is used.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the XML rendering of the results for the given date.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date and dump the resulting XML.
    flintshire_parser = FlintshireParser()
    results_xml = flintshire_parser.getResults(22, 5, 2008)
    print(results_xml)

# TODO

# 1) Email the council about the broken first result.
# It is always slightly broken (two </td>s for one of the <td>s),
# which upsets BeautifulSoup.

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Ver fichero

@@ -27,3 +27,4 @@
"Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"
"Flintshire.py", "420"

+ 20
- 10
trunk/python_scrapers/PlanningUtils.py Ver fichero

@@ -87,6 +87,11 @@ class PlanningApplication:
# expecting this as a datetime.date object
self.date_received = None

# If we can get them, we may as well include OSGB.
# These will be the entirely numeric version.
self.osgb_x = None
self.osgb_y = None

def __repr__(self):
    """Represent the application by its XML rendering from displayXML()."""
    xml_text = self.displayXML()
    return xml_text

@@ -105,14 +110,19 @@ class PlanningApplication:
def displayXML(self):
    """Return this application as an <application> XML fragment.

    The fragment always contains the council reference, address, postcode,
    description, info URL, comment URL and date received, one element per
    line. The <osgb_x> and <osgb_y> elements are appended only when the
    corresponding attribute is set to a truthy value.
    """
    contents = [
        u"<council_reference>%s</council_reference>" % xmlQuote(self.council_reference),
        u"<address>%s</address>" % xmlQuote(self.address),
        u"<postcode>%s</postcode>" % self.postcode,
        u"<description>%s</description>" % xmlQuote(self.description),
        u"<info_url>%s</info_url>" % xmlQuote(self.info_url),
        u"<comment_url>%s</comment_url>" % xmlQuote(self.comment_url),
        u"<date_received>%s</date_received>" % self.date_received.strftime(date_format),
    ]

    # The OSGB coordinates are optional: not every scraper can supply them.
    if self.osgb_x:
        contents.append(u"<osgb_x>%s</osgb_x>" % (self.osgb_x))
    if self.osgb_y:
        contents.append(u"<osgb_y>%s</osgb_y>" % (self.osgb_y))

    return u"<application>\n%s\n</application>" % ('\n'.join(contents))

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Ver fichero

@@ -227,3 +227,4 @@
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
"Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"

Cargando…
Cancelar
Guardar