A few bits of superfluous code in PlanningExplorer.py - now removed

16年前 · a0e67de742
--- a/python_scrapers/PlanningExplorer.py
+++ b/python_scrapers/PlanningExplorer.py
@@ -4,7 +4,7 @@ import urlparse
 import cgi
 import re
 import datetime
 import BeautifulSoup


 import cookielib

@@ -13,130 +13,9 @@ cookie_jar = cookielib.CookieJar()

 from BeautifulSoup import BeautifulSoup

 __auth__ = None

 import re

 date_format = "%d/%m/%Y"

 def fixNewlines(text):
    """Return text with every Windows line ending (CRLF) turned into
    a plain newline (LF)."""
    # Cutting the text at each "\r\n" and re-joining the pieces with "\n"
    # swaps the line endings without touching anything else.
    return "\n".join(text.split("\r\n"))

 # So what can a postcode look like then?
 # This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
 #AN NAA         M1 1AA
 #ANN NAA        M60 1NW
 #AAN NAA        CR2 6XH
 #AANN NAA       DN55 1PT
 #ANA NAA        W1A 1HP
 #AANA NAA       EC1A 1BB

 # Matches the outward code (one or two letters, a digit, then an optional
 # digit or letter), an optional single space, and the inward code (a digit
 # followed by two letters).
 # Written as a raw string so that \d reaches the regex engine untouched
 # instead of being treated as a (invalid) string escape sequence.
 postcode_regex = re.compile(r"[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")

 def getPostcodeFromText(text, default_postcode="No Postcode"):
    """This function takes a piece of text and returns the first
    bit of it that looks like a postcode.

    text is the string to search; None or an empty string is treated
    as containing no postcode.
    default_postcode is the value handed back when nothing in text
    matches postcode_regex.

    Only upper case postcodes are recognised, since the pattern is
    compiled without the IGNORECASE flag."""

    # There is nothing to search in a missing or empty text, and
    # postcode_regex.search(None) would raise a TypeError.
    if not text:
        return default_postcode

    postcode_match = postcode_regex.search(text)

    if postcode_match is None:
        return default_postcode

    return postcode_match.group()


 class PlanningAuthorityResults:
    """Holds the outcome of one planning search for a single authority.

       It stores the authority's full and short names together with the
       applications added so far, and can render all of it as one XML
       document in the planningalerts format.
       """

    def __init__(self, authority_name, authority_short_name):
        # Filled in later, one entry at a time, through addApplication().
        # Each entry must provide a displayXML() method.
        self.planning_applications = []

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name

    def addApplication(self, application):
        """Append application to the end of the list of results."""
        self.planning_applications.append(application)

    def __repr__(self):
        # The XML document doubles as the printable form of the object.
        return self.displayXML()

    def displayXML(self):
        """Return the whole result set as an XML document in the
           planningalerts format, i.e. in the same format as this one:
           http://www.planningalerts.com/lambeth.xml
           """

        # The applications render themselves; their XML is simply run
        # together, in the order in which they were added.
        rendered_applications = "".join(
            [application.displayXML() for application in self.planning_applications]
            )

        xml_declaration = u"""<?xml version="1.0" encoding="UTF-8"?>\n"""
        name_element = u"<authority_name>%s</authority_name>\n" % self.authority_name
        short_name_element = u"<authority_short_name>%s</authority_short_name>\n" % self.authority_short_name

        document = xml_declaration + u"<planning>\n"
        document = document + name_element + short_name_element
        document = document + u"<applications>\n" + rendered_applications
        document = document + u"</applications>\n" + u"</planning>\n"

        return document



 class PlanningApplication:
    """A single planning application, which can render itself as an
    <application> XML element."""

    def __init__(self):
        # Every field starts out unset and is filled in by the scraper.
        self.council_reference = None
        self.description = None

        self.address = None
        self.postcode = None

        self.info_url = None
        self.comment_url = None

        # expecting this as a datetime.date object
        self.date_received = None

        # Optional OSGB coordinates, in their entirely numeric form.
        # They are only written to the XML when they are set.
        self.osgb_x = None
        self.osgb_y = None

    def __repr__(self):
        # The XML element doubles as the printable form of the object.
        return self.displayXML()

    def is_ready(self):
        """Tell whether all the required fields have been filled in.

        The result is the first required field that is empty, or the
        last required field (date_received) when none is empty, so it
        is meant to be used in a boolean context.

        The postcode is deliberately not checked: displayXML falls back
        to a default for it, so make sure it is filled in when you do
        the address."""
        required_fields = (
            self.council_reference,
            self.address,
            self.description,
            self.info_url,
            self.comment_url,
            self.date_received,
            )

        for field_value in required_fields:
            if not field_value:
                # Stop at the first gap and hand back the empty value.
                return field_value

        # Nothing is missing: field_value is now date_received.
        return field_value

    def displayXML(self):
        """Return this application as an <application> XML element.

        If no postcode has been set yet, one is looked for in the
        address first and stored on the object. date_received has to
        be set, because it is formatted with strftime."""

        if not self.postcode:
            self.postcode = getPostcodeFromText(self.address)

        xml_lines = []
        add_line = xml_lines.append

        # Text fields are wrapped in CDATA sections.
        add_line(u"<council_reference><![CDATA[%s]]></council_reference>" % self.council_reference)
        add_line(u"<address><![CDATA[%s]]></address>" % self.address)
        add_line(u"<postcode><![CDATA[%s]]></postcode>" % self.postcode)
        add_line(u"<description><![CDATA[%s]]></description>" % self.description)
        add_line(u"<info_url><![CDATA[%s]]></info_url>" % self.info_url)
        add_line(u"<comment_url><![CDATA[%s]]></comment_url>" % self.comment_url)
        add_line(u"<date_received><![CDATA[%s]]></date_received>" % self.date_received.strftime(date_format))

        # The coordinates are optional and are left out when unset.
        if self.osgb_x:
            add_line(u"<osgb_x>%s</osgb_x>" % self.osgb_x)
        if self.osgb_y:
            add_line(u"<osgb_y>%s</osgb_y>" % self.osgb_y)

        return u"<application>\n%s\n</application>" % '\n'.join(xml_lines)

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 # Date format to enter into search boxes
 date_format = "%d/%m/%Y"
@@ -159,7 +38,7 @@ class PlanningExplorerParser:
    # authority, then they can be overridden in a subclass.
    info_url_path = "MVM/Online/Generic/"
    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

   
    # This is the most common place for comments urls to live
    # The %s will be filled in with an application code
    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
@@ -226,7 +105,7 @@ class PlanningExplorerParser:
        override this method returning a dictionary of header key to
        header value."""
        headers = {}

       
        if self.use_firefox_user_agent:
            headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"

@@ -259,7 +138,7 @@ class PlanningExplorerParser:
                ("csbtnSearch", "Search"),
                ("cboNumRecs", "99999"),
                ))

       
        return post_data


@@ -271,7 +150,7 @@ class PlanningExplorerParser:
            address = address_td.div.string
        else:
            address = address_td.string

           
        return address


@@ -283,10 +162,10 @@ class PlanningExplorerParser:
        one that parses the info page."""

        return getPostcodeFromText(self._current_application.address)

       
    def _getDescription(self, tds, info_soup):
        description_td = tds[self.description_td_no]

       
        if description_td.div is not None:
            # Mostly this is in a div
            # Use the empty string if the description is missing
@@ -311,7 +190,7 @@ class PlanningExplorerParser:

        self.search_url = urlparse.urljoin(base_url, self.search_url_path)
        self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)

   
        self.debug = debug

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
@@ -323,6 +202,7 @@ class PlanningExplorerParser:
        get_request = urllib2.Request(self.search_url)
        get_response = urllib2.urlopen(get_request)


        cookie_jar.extract_cookies(get_response, get_request)

        html = get_response.read()
@@ -338,7 +218,7 @@ class PlanningExplorerParser:
        # The post data needs to be different for different councils
        # so we have a method on each council's scraper to make it.
        post_data = self._getPostData(asp_args, search_date)

       
        headers = self._getHeaders()

        request = urllib2.Request(self.search_url, post_data, headers)
@@ -371,7 +251,7 @@ class PlanningExplorerParser:
                self._current_application = PlanningApplication()

                # There is no need to search for the date_received, it's what
                # we searched for
                # we searched for            
                self._current_application.date_received = search_date

                tds = tr.findAll("td")
@@ -386,7 +266,7 @@ class PlanningExplorerParser:
                if self.fetch_info_page:
                    # We need to quote the spaces in the info url
                    info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))

                   
                    info_soup = BeautifulSoup(urllib2.urlopen(info_request))
                else:
                    info_soup = None
@@ -493,7 +373,7 @@ class CreweParser(PlanningExplorerParser):

    info_url_path = "Northgate/PlanningExplorer/Generic/"
    search_url_path = "northgate/planningexplorer/generalsearch.aspx"

   
    results_table_attrs = {"class": "display_table"}

    def _getPostData(self, asp_args, search_date):
@@ -554,13 +434,13 @@ class HackneyParser(PlanningExplorerParser):
        real_url_tuple = urlparse.urlsplit(response.geturl())

        query_string = real_url_tuple[3]

       
        # Get the query as a list of key, value pairs
        parsed_query_list = list(cgi.parse_qsl(query_string))

        # Go through the query string replacing any PS parameters
        # with PS=99999

       
        for i in range(len(parsed_query_list)):
            key, value = parsed_query_list[i]

@@ -569,10 +449,10 @@ class HackneyParser(PlanningExplorerParser):
                parsed_query_list[i] = (key, value)

        new_query_string = urllib.urlencode(parsed_query_list)

       
        new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]

        new_url = urlparse.urlunsplit(new_url_tuple)
       
        new_url = urlparse.urlunsplit(new_url_tuple)        
        new_request = urllib2.Request(new_url, None, self._getHeaders())
        new_response = urllib2.urlopen(new_request)

@@ -607,13 +487,13 @@ class HackneyParser(PlanningExplorerParser):

 class KennetParser(BroadlandLike, PlanningExplorerParser):
    comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"

   
 class LincolnParser(PlanningExplorerParser):
    use_firefox_user_agent = True
    use_referer = True

    results_table_attrs = {"class": "display_table"}

   
    search_url_path = "northgate/planningexplorer/generalsearch.aspx"
    info_url_path = "Northgate/PlanningExplorer/Generic/"

@@ -751,7 +631,7 @@ class SouthShropshireParser(PlanningExplorerParser):
                ("cboNumRecs", "99999"),
                ("csbtnSearch", "Search"),
                ))

       
        return post_data

 class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
@@ -759,6 +639,7 @@ class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
    pass



 class StockportParser(PlanningExplorerParser):
    comments_email_address = "admin.dc@stockport.gov.uk"
    info_url_path = "MVM/Online/PL/"
@@ -868,11 +749,11 @@ class MendipParser(BroadlandLike, PlanningExplorerParser):
 if __name__ == '__main__':
    # NOTE - 04/11/2007 is a sunday
    # I'm using it to test that the scrapers behave on days with no apps.

   
 #    parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
 #    parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
 #    parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
 #    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
 #    parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
 #    parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
 #    parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
@@ -895,7 +776,8 @@ if __name__ == '__main__':
 #    parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
 #    parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/")
    parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/")
    print parser.getResults(27, 4, 2010)

    print parser.getResults(12, 6, 2009)

 # To Do