Add scraper for Hounslow.

18 年之前 · 98f49f172f
--- a/python_scrapers/Hounslow.py
+++ b/python_scrapers/Hounslow.py
@@ -0,0 +1,72 @@
 import urllib2
 import urllib
 import urlparse

 import datetime, time
 import cgi

 from BeautifulSoup import BeautifulSoup

 from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

 # strftime pattern (day/month/year) used to format the search date that is
 # substituted into the parser's search URL.
 date_format = "%d/%m/%Y"

 class HounslowParser:
    """Scraper for planning applications listed by the London Borough of Hounslow.

    The council's search page is queried for a single received date and each
    result table on that page is turned into a PlanningApplication, which is
    added to the PlanningAuthorityResults held on the instance.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the result container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"
        # URL template; the %(date)s placeholders take the search date
        # formatted with date_format (same date for both ends of the range).
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
        # Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _parseResultTable(self, result_table, search_day):
        """Build a PlanningApplication from one result table of the search page.

        result_table is the table element holding a single application;
        search_day is the datetime.date that was searched for and is stored
        as the application's received date.

        Raises IndexError if the table holds fewer than three links.
        """
        application = PlanningApplication()
        application.date_received = search_day

        links = result_table.findAll("a")
        map_link, info_link, comment_link = links[0], links[1], links[2]

        # We can get OSGB coordinates from the link to streetmap
        map_qs_dict = cgi.parse_qs(urlparse.urlsplit(map_link['href'])[3])
        x_values = map_qs_dict.get("x")
        y_values = map_qs_dict.get("y")

        # Only set the coordinates when the map link really carries both;
        # otherwise the attributes keep whatever PlanningApplication()
        # initialised them to.
        # NOTE(review): assumes PlanningApplication copes with coordinates
        # that were never assigned here - confirm against PlanningUtils.
        if x_values and y_values:
            application.osgb_x = x_values[0]
            application.osgb_y = y_values[0]

        application.council_reference = info_link.string.strip()
        application.info_url = urlparse.urljoin(self.base_url, info_link['href'])
        application.comment_url = urlparse.urljoin(self.base_url, comment_link['href'])

        # The node just before the map link is used as the address; runs of
        # whitespace in it are collapsed to single spaces.
        application.address = ' '.join(map_link.previous.strip().split())
        application.postcode = getPostcodeFromText(application.address)

        # The node just before the comment link is used as the description.
        application.description = comment_link.previous.strip()

        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the applications received on the given date.

        day, month and year are integers. The search page for that date is
        downloaded and every result table is parsed into a
        PlanningApplication.

        Returns the PlanningAuthorityResults created in __init__. Because
        that object lives on the instance, applications from repeated calls
        on the same parser accumulate in it.

        Raises ValueError (from datetime.date) for an invalid date.
        """
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        try:
            page = response.read()
        finally:
            # Always release the connection, even if reading fails.
            response.close()

        soup = BeautifulSoup(page)

        # Results are shown in a table each. The tables don't have any nice
        # attributes, but they do all contain a NavString "Application",
        # and nothing else does...
        for nav_string in soup.findAll(text="Application"):
            result_table = nav_string.findPrevious("table")
            self._results.addApplication(self._parseResultTable(result_table, search_day))

        return self._results

    def getResults(self, day, month, year):
        """Return the displayXML() output of the results for the given date.

        day, month and year may be anything int() accepts (e.g. strings);
        they are converted before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

 if __name__ == '__main__':
    # Manual smoke test: scrape the applications received on 1 August 2008
    # and write the resulting output to stdout.
    hounslow = HounslowParser()
    print(hounslow.getResults(1, 8, 2008))

--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -49,3 +49,4 @@
 "Berwick.py", "420"
 "Birmingham.py", "420"
 "KingstonUponThames.py", "420"
 "Hounslow.py", "420"
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -253,3 +253,4 @@
 "Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
 "Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
 "Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
 "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"