Преглед на файлове

Add scraper for Flintshire. I've included the OSGB x,y coordinates in the hope that we can start using these where

we don't have postcodes.
master
duncan.parkes преди 16 години
родител
ревизия
c5690efa2d
променени са 4 файла, в които са добавени 115 реда и са изтрити 10 реда
  1. +93
    -0
      python_scrapers/Flintshire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +20
    -10
      python_scrapers/PlanningUtils.py
  4. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 93
- 0
python_scrapers/Flintshire.py Целия файл

@@ -0,0 +1,93 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class FlintshireParser:
    """Scraper for planning applications listed by Flintshire County Council.

    The council's search page is queried for a two-day window ending on the
    requested date.  Each result table is turned into a PlanningApplication,
    and the application's own info page is fetched to pick up the OSGB grid
    reference and the "Date Valid" value.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this, it seems to be happy without them, and now doesn't need to paginate...
        # Literal percent signs are doubled because this string is filled in
        # with the % operator (start_date / end_date).
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        # NOTE(review): this object is created once per parser instance and
        # every call to getResultsByDayMonthYear adds to it - confirm that
        # callers only search once per instance.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetch_soup(self, url_or_request):
        """Open a URL (or urllib2.Request), parse the body and close the response.

        Returns a BeautifulSoup object built from the complete response body.
        The response is closed even if reading or parsing raises.
        """
        response = urllib2.urlopen(url_or_request)
        try:
            return BeautifulSoup(response.read())
        finally:
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications for the given date and return the results object.

        day, month and year are integers.  Every application found is added to
        self._results, which is also the return value.
        """
        search_date = datetime.date(year, month, day)

        # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
        search_url = self.base_url % {
            "end_date": search_date.strftime(date_format),
            "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
        }
        soup = self._fetch_soup(search_url)

        # Each app is stored in its own table
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear to me why this next one isn't the string of the next sibling. This works though!
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Let's go to the info_page and get the OSGB and the date_received
            info_request = urllib2.Request(application.info_url)

            # We need to add the language header in order to get UK style dates
            info_request.add_header("Accept-Language", "en-gb,en")
            info_soup = self._fetch_soup(info_request)

            # The x value is in the <font> element of the grid reference cell;
            # the y value is two siblings further along.
            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font
            application.osgb_x = x_element.string.strip()
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()

            # PlanningApplication expects date_received as a datetime.date,
            # so only the year, month and day fields are used.
            application.date_received = datetime.date(*(time.strptime(date_string, date_format)[0:3]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, though I can't click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the scraped results for the given date as XML.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke run: scrape one fixed date and dump the resulting XML.
    flintshire_parser = FlintshireParser()
    print(flintshire_parser.getResults(22, 5, 2008))

# TODO

# 1) Email the council about the broken first result.
# It is always slightly broken (two </td>s for one of the <td>s), which
# upsets BeautifulSoup.

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Целия файл

@@ -27,3 +27,4 @@
"Kensington.py", "420"
"Fife.py", "420"
"ForestOfDean.py", "420"
"Flintshire.py", "420"

+ 20
- 10
python_scrapers/PlanningUtils.py Целия файл

@@ -87,6 +87,11 @@ class PlanningApplication:
# expecting this as a datetime.date object
self.date_received = None

# If we can get them, we may as well include OSGB.
# These will be the entirely numeric version.
self.osgb_x = None
self.osgb_y = None

def __repr__(self):
    """Represent the application by its XML form (whatever displayXML returns)."""
    xml_text = self.displayXML()
    return xml_text

@@ -105,14 +110,19 @@ class PlanningApplication:
def displayXML(self):
    """Return this application as an <application> XML fragment (unicode).

    The council reference, address, description and both URLs are passed
    through xmlQuote; the postcode and the formatted date are inserted as
    they are.  <osgb_x> and <osgb_y> elements are added only when the
    corresponding attribute is set to a truthy value.
    """
    #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

    # One entry per child element; joined with newlines at the end.
    contents = [
        u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
        u"<address>%s</address>" %xmlQuote(self.address),
        u"<postcode>%s</postcode>" %self.postcode,
        u"<description>%s</description>" %xmlQuote(self.description),
        u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
        u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
        u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
        ]

    # The OSGB coordinates are optional, so they are left out when unset.
    if self.osgb_x:
        contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
    if self.osgb_y:
        contents.append(u"<osgb_y>%s</osgb_y>" %(self.osgb_y))

    return u"<application>\n%s\n</application>" %('\n'.join(contents))

+ 1
- 0
python_scrapers/SitesToGenerate.csv Целия файл

@@ -227,3 +227,4 @@
"East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
"Fife Council", "Fife", "", "Fife", "FifeParser"
"Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
"Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"

Зареждане…
Отказ
Запис