Add scraper for Herefordshire.

Alter PlanningUtils to wrap every output field in CDATA, scrapping the xmlQuote function.
17 年之前 · d92f3bb6fd
--- a/python_scrapers/Herefordshire.py
+++ b/python_scrapers/Herefordshire.py
@@ -0,0 +1,85 @@

 import urllib2
 import urllib
 import urlparse

 import datetime, time
 import cgi
 import re

 from BeautifulSoup import BeautifulSoup

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 # strftime pattern (day/month/year) for the dates substituted into the search URL.
 date_format = "%d/%m/%Y"

 class HerefordshireParser:
    """Scraper for the planning application search on www.herefordshire.gov.uk.

    One search request is made for a single date; every application listed
    for that date then has its info page downloaded to obtain the full
    description and the link to the comment page.
    """

    # NOTE(review): this is a Barnsley address, which looks like a copy/paste
    # slip from another scraper - confirm the correct Herefordshire contact.
    # Nothing in this class reads the attribute.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"

        # %(date)s is filled in twice (start and end date) so that the search
        # covers exactly one day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"

        # As we are going to the info page anyway, we pick up the comment url
        # from there rather than building it here. The pattern would be
        # (this needs the reference inserting):
        # "http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s"

        # The same results object is reused by every call on this instance.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications listed for one date.

        day, month, year -- integers identifying the date to search.

        Returns the PlanningAuthorityResults object held on this instance,
        after adding one PlanningApplication per result row found.
        """
        search_day = datetime.date(year, month, day)
        search_url = self.base_url % {"date": search_day.strftime(date_format)}

        # Get the search page for this date.
        response = urllib2.urlopen(search_url)
        soup = BeautifulSoup(response.read())

        if soup.find(text=re.compile("Sorry, no matches found")):
            # There were no apps for this date.
            return self._results

        # The first two rows of the results table are skipped (presumably
        # header rows - verify against the live page).
        rows = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

        for tr in rows:
            application = PlanningApplication()
            application.date_received = search_day

            # The first link in the row points at the info page and its text
            # is the council reference.
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
            application.council_reference = tr.a.string

            tds = tr.findAll("td")

            application.address = tds[1].string
            application.postcode = getPostcodeFromText(application.address)

            # The results table only gives us an initial segment of the
            # description, so we have to download the info page to get all
            # of it.
            info_response = urllib.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

            comment_link = info_soup.find("a", title="Link to Planning Application Comment page")
            application.comment_url = urlparse.urljoin(self.base_url, comment_link['href'])

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the output of displayXML() for the applications found on the given date.

        day, month and year are passed through int(), so numeric strings are
        accepted as well as integers.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

 if __name__ == '__main__':
    parser = HerefordshireParser()
    print parser.getResults(31,8,2008)

--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -55,3 +55,4 @@
 "Halton.py", "420"
 "Hampshire.py", "420"
 "Hastings.py", "420"
 "Herefordshire.py", "420"
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -631,7 +631,7 @@ if __name__ == '__main__':
    # NOTE - 04/11/2007 is a sunday
    # I'm using it to test that the scrapers behave on days with no apps.
    
    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
 #    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
 #    parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
 #    parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
 #    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
@@ -639,7 +639,7 @@ if __name__ == '__main__':
 #    parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
 #    parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
 #    parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
 #    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
 #    parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
 #    parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
 #    parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
@@ -655,7 +655,7 @@ if __name__ == '__main__':
 #    parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
 #    parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
 #    parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
    print parser.getResults(3, 7, 2008)
    print parser.getResults(4, 9, 2008)

 # To Do

--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -4,13 +4,6 @@ import re

 # strftime pattern (day/month/year) used to format date_received in the XML output.
 date_format = "%d/%m/%Y"


 def xmlQuote(text):
    """Escape text so it can be placed inside an XML element.

    '&', '<' and '>' are replaced by '&amp;', '&lt;' and '&gt;'. Previously
    only '&' was handled, so text containing '<' or '>' passed through
    unescaped.

    text -- the string to escape.

    Returns the escaped string.
    """
    # This is the standard library function for the job; imported locally so
    # the function does not depend on the module's import block.
    from xml.sax.saxutils import escape

    return escape(text)

 def fixNewlines(text):
    """Return text with every Windows line ending (CR LF) turned into a bare LF."""
    # Cutting the text at each CR LF pair and gluing the pieces back together
    # with LF leaves lone CR or LF characters untouched.
    pieces = text.split("\r\n")
    return "\n".join(pieces)
@@ -112,13 +105,13 @@ class PlanningApplication:
        #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received

 	contents = [
            u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference),
            u"<address>%s</address>" %xmlQuote(self.address),
            u"<postcode>%s</postcode>" %self.postcode,
            u"<description>%s</description>" %xmlQuote(self.description),
            u"<info_url>%s</info_url>" %xmlQuote(self.info_url),
            u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url),
            u"<date_received>%s</date_received>" %self.date_received.strftime(date_format),
            u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
            u"<address><![CDATA[%s]]></address>" %(self.address),
            u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
            u"<description><![CDATA[%s]]></description>" %(self.description),
            u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
            u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
            u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
            ]
        if self.osgb_x:
            contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -259,3 +259,4 @@
 "Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
 "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
 "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
 "Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"