Bladeren bron

Add scraper for Hounslow.

import/raw
duncan.parkes 16 jaren geleden
bovenliggende
commit
da2be2c394
3 gewijzigde bestanden met toevoegingen van 74 en 0 verwijderingen
  1. +72
    -0
      trunk/python_scrapers/Hounslow.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 72
- 0
trunk/python_scrapers/Hounslow.py Bestand weergeven

@@ -0,0 +1,72 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class HounslowParser:
def __init__(self, *args):

self.authority_name = "London Borough of Hounslow"
self.authority_short_name = "Hounslow"
self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
# Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty.

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# Now get the search page
response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
soup = BeautifulSoup(response.read())

# Results are shown in a table each. The tables don't have any nice
# attributes, but they do all contain a NavString "Application",
# and nothing else does...
nav_strings = soup.findAll(text="Application")

for nav_string in nav_strings:
result_table = nav_string.findPrevious("table")

application = PlanningApplication()
application.date_received = search_day

links = result_table.findAll("a")

# We can get OSGB coordinates from the link to streetmap
map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
application.osgb_x = map_qs_dict.get("x")[0]
application.osgb_y = map_qs_dict.get("y")[0]

application.council_reference = links[1].string.strip()
application.info_url = urlparse.urljoin(self.base_url, links[1]['href'])
application.comment_url = urlparse.urljoin(self.base_url, links[2]['href'])

application.address = ' '.join(links[0].previous.strip().split())
application.postcode = getPostcodeFromText(application.address)

application.description = links[2].previous.strip()

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = HounslowParser()
print parser.getResults(1,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Bestand weergeven

@@ -49,3 +49,4 @@
"Berwick.py", "420"
"Birmingham.py", "420"
"KingstonUponThames.py", "420"
"Hounslow.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Bestand weergeven

@@ -253,3 +253,4 @@
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"

Laden…
Annuleren
Opslaan