Add scraper for Flintshire. I've included the OSGB x,y coordinates in the hope that we can start using these where we don't have postcodes.
16年前 · c5690efa2d
--- a/python_scrapers/Flintshire.py
+++ b/python_scrapers/Flintshire.py
@@ -0,0 +1,93 @@
 import urllib2
 import urllib
 import urlparse

 import datetime, time
 import cgi

 import re

 from BeautifulSoup import BeautifulSoup

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 date_format = "%d/%m/%Y"

 class FlintshireParser:
    """Scraper for Flintshire County Council planning applications.

    Queries the council's search page for a two-day window ending on the
    requested date, then visits each application's info page to pick up
    the OSGB grid reference and the "Date Valid" value.
    """

    def __init__(self, *args):
        """Set up authority names, the search URL template and the results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this, it seems to be happy without them, and now doesn't need to paginate...
        # Literal percent signs are doubled because the template is filled in
        # with the % operator (start_date / end_date).
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetch_soup(self, url_or_request):
        """Open a URL (or urllib2.Request), parse the body and close the response."""
        response = urllib2.urlopen(url_or_request)
        try:
            return BeautifulSoup(response.read())
        finally:
            # Always release the connection, even if reading or parsing fails.
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications for the given date and return the results object.

        day, month and year are integers. Each application found is added to
        self._results, which is created once in __init__, so repeated calls
        on the same instance accumulate applications. Returns self._results.
        """
        search_date = datetime.date(year, month, day)

        # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
        search_url = self.base_url % {
            "end_date": search_date.strftime(date_format),
            "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
        }
        soup = self._fetch_soup(search_url)

        # Each app is stored in its own table
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear to me why this next one isn't the string of the next sibling. This works though!
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Let's go to the info_page and get the OSGB and the date_received
            info_request = urllib2.Request(application.info_url)

            # We need to add the language header in order to get UK style dates
            info_request.add_header("Accept-Language", "en-gb,en")
            info_soup = self._fetch_soup(info_request)

            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font

            # The y value is read from the node two siblings after the x <font> element.
            application.osgb_x = x_element.string.strip()
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()

            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, though I can't click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as XML.

        day, month and year may be strings or numbers; they are converted
        with int() before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

 if __name__ == '__main__':
    # Manual smoke run: scrape a fixed date (22/5/2008) and dump the XML.
    scraper = FlintshireParser()
    xml_output = scraper.getResults(22, 5, 2008)
    print(xml_output)

 # TODO

 # 1) Email the council about the broken first result.
 # It is always
 # slightly broken (two </td>s for one of the <td>s), which upsets Beautiful
 # Soup.
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -27,3 +27,4 @@
 "Kensington.py", "420"
 "Fife.py", "420"
 "ForestOfDean.py", "420"
 "Flintshire.py", "420"
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -87,6 +87,11 @@ class PlanningApplication:
        # expecting this as a datetime.date object
 	self.date_received = None

        # If we can get them, we may as well include OSGB.
        # These will be the entirely numeric version.
        self.osgb_x = None
        self.osgb_y = None

    def __repr__(self):
        """Return the XML form of this application, as built by displayXML()."""
        return self.displayXML()

@@ -105,14 +110,19 @@ class PlanningApplication:
        
    def displayXML(self):
        """Return this application as an <application> XML fragment (unicode).

        council_reference, address, description, info_url and comment_url
        are passed through xmlQuote; postcode is inserted as-is and
        date_received is formatted with strftime(date_format).
        <osgb_x> and <osgb_y> elements are appended only when the
        corresponding attribute is truthy.
        """
        contents = [
            u"<council_reference>%s</council_reference>" % xmlQuote(self.council_reference),
            u"<address>%s</address>" % xmlQuote(self.address),
            u"<postcode>%s</postcode>" % self.postcode,
            u"<description>%s</description>" % xmlQuote(self.description),
            u"<info_url>%s</info_url>" % xmlQuote(self.info_url),
            u"<comment_url>%s</comment_url>" % xmlQuote(self.comment_url),
            u"<date_received>%s</date_received>" % self.date_received.strftime(date_format),
            ]

        # OSGB coordinates are optional: most scrapers leave them as None.
        if self.osgb_x:
            contents.append(u"<osgb_x>%s</osgb_x>" % (self.osgb_x))
        if self.osgb_y:
            contents.append(u"<osgb_y>%s</osgb_y>" % (self.osgb_y))

        return u"<application>\n%s\n</application>" % ('\n'.join(contents))
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -227,3 +227,4 @@
 "East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser"
 "Fife Council", "Fife", "", "Fife", "FifeParser"
 "Forest of Dean District Council", "Forest of Dean", "", "ForestOfDean", "ForestOfDeanParser"
 "Flintshire County Council", "Flintshire", "", "Flintshire", "FlintshireParser"