Browse Source

Add scraper for Hounslow.

import/raw
duncan.parkes 16 years ago
parent
commit
da2be2c394
3 changed files with 74 additions and 0 deletions
  1. +72
    -0
      trunk/python_scrapers/Hounslow.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 72
- 0
trunk/python_scrapers/Hounslow.py View File

@@ -0,0 +1,72 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HounslowParser:
def __init__(self, *args):

self.authority_name = "London Borough of Hounslow"
self.authority_short_name = "Hounslow"
self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
# Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty.

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# Now get the search page
response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
soup = BeautifulSoup(response.read())

# Results are shown in a table each. The tables don't have any nice
# attributes, but they do all contain a NavString "Application",
# and nothing else does...
nav_strings = soup.findAll(text="Application")

for nav_string in nav_strings:
result_table = nav_string.findPrevious("table")

application = PlanningApplication()
application.date_received = search_day

links = result_table.findAll("a")

# We can get OSGB coordinates from the link to streetmap
map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
application.osgb_x = map_qs_dict.get("x")[0]
application.osgb_y = map_qs_dict.get("y")[0]

application.council_reference = links[1].string.strip()
application.info_url = urlparse.urljoin(self.base_url, links[1]['href'])
application.comment_url = urlparse.urljoin(self.base_url, links[2]['href'])

application.address = ' '.join(links[0].previous.strip().split())
application.postcode = getPostcodeFromText(application.address)

application.description = links[2].previous.strip()

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HounslowParser()
print parser.getResults(1,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -49,3 +49,4 @@
"Berwick.py", "420"
"Birmingham.py", "420"
"KingstonUponThames.py", "420"
"Hounslow.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -253,3 +253,4 @@
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"

Loading…
Cancel
Save