Browse Source

Add scraper for Flintshire. I've included the OSGB x,y coordinates in the hope that we can start using these where

we don't have postcodes.
master
duncan.parkes 16 years ago
parent
commit
c5690efa2d
4 changed files with 115 additions and 10 deletions
  1. +93
    -0
      python_scrapers/Flintshire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +20
    -10
      python_scrapers/PlanningUtils.py
  4. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 93
- 0
python_scrapers/Flintshire.py View File

@@ -0,0 +1,93 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# Day/month/year format: used to build the date bounds of the search query
# and to parse the "Date Valid" field scraped from each application's info page.
date_format = "%d/%m/%Y"

class FlintshireParser:
    """Scraper for planning applications listed by Flintshire County Council.

    Applications found by a search are accumulated in a
    PlanningAuthorityResults object held on the instance, so repeated
    searches on the same parser keep adding to the same result set.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the results.

        Any positional arguments are accepted but not used.
        """
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this, it seems to be happy
        # without them, and now doesn't need to paginate...
        # The doubled %% are literal percent signs (URL escapes); only
        # %(start_date)s and %(end_date)s are substituted.
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications for the given date and return the results.

        The search covers the day before the given date up to the given date.
        For every application after the first one on the results page, the
        application's info page is also fetched to read its OSGB grid
        reference and its "Date Valid".

        day, month and year are integers. Returns the instance's
        PlanningAuthorityResults with the scraped applications added.
        Network errors from urllib2 and parsing errors (e.g. AttributeError
        when an expected element is missing) are not caught here.
        """
        search_date = datetime.date(year, month, day)

        # We'll set the start date to be one day earlier in order to catch
        # the first result on every day at some point - see TODO list
        search_url = self.base_url % {
            "end_date": search_date.strftime(date_format),
            "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
        }

        response = urllib2.urlopen(search_url)
        try:
            soup = BeautifulSoup(response.read())
        finally:
            # Always release the connection, even if reading/parsing fails.
            response.close()

        # Each app is stored in its own table
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear to me why this next one isn't the string of the
            # next sibling. This works though!
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Let's go to the info_page and get the OSGB and the date_received
            info_request = urllib2.Request(application.info_url)

            # We need to add the language header in order to get UK style dates
            info_request.add_header("Accept-Language", "en-gb,en")
            info_response = urllib2.urlopen(info_request)
            try:
                info_soup = BeautifulSoup(info_response.read())
            finally:
                # One info page is fetched per application, so close each
                # response rather than leaving the sockets to the GC.
                info_response.close()

            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font
            application.osgb_x = x_element.string.strip()
            # The y value is read from the node two siblings after the x element.
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, though I can't click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as XML.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (22 May 2008) and dump the XML.
    flintshire_parser = FlintshireParser()
    print(flintshire_parser.getResults(22, 5, 2008))

# TODO

# 1) Email the council about the broken first result.
# It is always slightly broken (it has two </td>s for one of the <td>s),
# which upsets BeautifulSoup.

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv View File

@@ -27,3 +27,4 @@
"Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"
"Flintshire.py", "420"

+ 20
- 10
python_scrapers/PlanningUtils.py View File

@@ -87,6 +87,11 @@ class PlanningApplication:
# expecting this as a datetime.date object
self.date_received = None

# If we can get them, we may as well include OSGB.
# These will be the entirely numeric version.
self.osgb_x = None
self.osgb_y = None

def __repr__(self):
    """Represent the application by its XML form (delegates to displayXML)."""
    return self.displayXML()

@@ -105,14 +110,19 @@ class PlanningApplication:
def displayXML(self):
    """Return this application serialised as an <application> XML fragment.

    The council reference, address, description and both URLs are passed
    through xmlQuote; the postcode and the formatted date_received are
    inserted as they are. The OSGB coordinates are optional and are only
    emitted when they have been set to a truthy value.

    date_received must support strftime (it is None until a scraper sets
    it, in which case this raises AttributeError).
    """
    # The previous single-expression return was removed: it came first and
    # made the list-based code below (including the OSGB elements) unreachable.
    contents = [
        u"<council_reference>%s</council_reference>" % xmlQuote(self.council_reference),
        u"<address>%s</address>" % xmlQuote(self.address),
        u"<postcode>%s</postcode>" % self.postcode,
        u"<description>%s</description>" % xmlQuote(self.description),
        u"<info_url>%s</info_url>" % xmlQuote(self.info_url),
        u"<comment_url>%s</comment_url>" % xmlQuote(self.comment_url),
        u"<date_received>%s</date_received>" % self.date_received.strftime(date_format),
    ]
    if self.osgb_x:
        contents.append(u"<osgb_x>%s</osgb_x>" % (self.osgb_x))
    if self.osgb_y:
        contents.append(u"<osgb_y>%s</osgb_y>" % (self.osgb_y))

    return u"<application>\n%s\n</application>" % ('\n'.join(contents))

+ 1
- 0
python_scrapers/SitesToGenerate.csv View File

@@ -227,3 +227,4 @@
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
"Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"

Loading…
Cancel
Save