- """
- This is the screenscraper for Westminster City Council.
-
- I have just noticed that there is a PublicAccess underneath all this, but
- it only has the apps in for which they are accepting comments, so I think
- we may as well use this url and get the lot...
-
- This is the PublicAccess url:
- http://publicaccess.westminster.gov.uk/publicaccess/
- """
-
- import urllib
- import urlparse
-
- import pycurl
- import StringIO
-
- import datetime
-
- import sys
-
- from BeautifulSoup import BeautifulSoup
-
- from PlanningUtils import PlanningApplication, \
- PlanningAuthorityResults, \
- getPostcodeFromText
-
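- # The doubled %% survives strftime as a literal %2F (an escaped "/"), so the
- # formatted date arrives pre-encoded for the query string, e.g. "01%2F08%2F2008".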
- date_format = "%d%%2F%m%%2F%Y"
-
- class WestminsterParser:
- def __init__(self, *args):
-
- self.authority_name = "City of Westminster"
- self.authority_short_name = "Westminster"
- self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
-
- self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
-
-
- def getResultsByDayMonthYear(self, day, month, year):
- search_day = datetime.date(year, month, day)
-
- # post_data = [
-         #     ("REFNO", ""),
- # ("STName", ""),
- # ("STNUMB", ""),
- # ("ADRSNO", ""),
- # ("WARD", "AllWards"),
- # ("AGT", ""),
- # ("ATCDE", "AllApps"),
- # ("DECDE", "AllDecs"),
- # ("DTErec", search_day.strftime(date_format)),
- # ("DTErecTo", search_day.strftime(date_format)),
- # ("DTEvalid", ""),
- # ("DTEvalidTo", ""),
- # ("APDECDE", "AllAppDecs"),
- # ("submit", "Start+Search"),
- # ]
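-         # The raw string below is used rather than urllib.urlencode on the
-         # list above, presumably because the date is already percent-encoded
-         # and urlencode would escape it a second time.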
- post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
-
- while post_data:
-
-
- # Now get the search page
-
- sys.stderr.write("Fetching: %s\n" %self.base_url)
- sys.stderr.write("post data: %s\n" %post_data)
-
-
-             # A StringIO buffer to collect the response body via pycurl's
-             # write callback.
- fakefile = StringIO.StringIO()
-
- curlobj = pycurl.Curl()
- curlobj.setopt(pycurl.URL, self.base_url)
- curlobj.setopt(pycurl.POST, True)
- curlobj.setopt(pycurl.POSTFIELDS, post_data)
- curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
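-             # Follow any redirects the server sends, capped at 10.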
- curlobj.setopt(pycurl.FOLLOWLOCATION, True)
- curlobj.setopt(pycurl.MAXREDIRS, 10)
-
- curlobj.perform()
-
- sys.stderr.write("Got it\n")
- soup = BeautifulSoup(fakefile.getvalue())
-
- # We may as well free up the memory used by fakefile
- fakefile.close()
-
- sys.stderr.write("Created soup\n")
-
- results_form = soup.find("form", {"name": "currentsearchresultsNext"})
-
- # Sort out the post_data for the next page, if there is one
- # If there is no next page then there will be no inputs in the form.
- # In this case, post_data will be '', which is false.
-
- sys.stderr.write("Found form containing results\n")
-
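-             # An input without a value attribute would otherwise raise a
-             # KeyError, so default missing values to ''.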
-             post_data = urllib.urlencode([(x['name'], x.get('value', '')) for x in results_form.findAll("input")])
-
- sys.stderr.write("Got post data\n")
-
- # Each result has one link, and they are the only links in the form
-
- links = results_form.findAll("a")
-
- sys.stderr.write("Got list of links\n")
-
- for link in links:
-
- sys.stderr.write("Working on link: %s\n" %link['href'])
-
- application = PlanningApplication()
-
- application.date_received = search_day
- application.info_url = urlparse.urljoin(self.base_url, link['href'])
- application.council_reference = link.string.strip()
-
- application.address = link.findNext("td").string.strip()
- application.postcode = getPostcodeFromText(application.address)
-
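-                 # The description lives in the last cell of the next table row.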
- application.description = link.findNext("tr").findAll("td")[-1].string.strip()
-
- # To get the comment url, we're going to have to go to each info url :-(
-
- sys.stderr.write("Fetching: %s\n" %application.info_url)
-
-
- fakefile = StringIO.StringIO()
-
-
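-                 # Reuse the same curl handle, switching it from POST back to
-                 # GET for the info page.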
- curlobj.setopt(pycurl.HTTPGET, True)
- curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
-
-                 # pycurl needs a plain byte string, so encode the unicode url as ascii
- curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))
-
- curlobj.perform()
-
- sys.stderr.write("Got it\n")
-
- info_soup = BeautifulSoup(fakefile.getvalue())
-
- fakefile.close()
-
-                 # A comment url looks like:
-                 # http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
-                 comment_nav_string = info_soup.find(text="Comment on this case")
-                 if comment_nav_string:
-                     application.comment_url = comment_nav_string.parent['href']
-                 else:
-                     application.comment_url = "No Comments"
-
- self._results.addApplication(application)
-
- sys.stderr.write("Finished that link\n")
-
-
- sys.stderr.write("Finished while loop, returning stuff.\n")
-
- return self._results
-
- def getResults(self, day, month, year):
- return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
-
- if __name__ == '__main__':
- parser = WestminsterParser()
-     print parser.getResults(1, 8, 2008)
-