Ver a proveniência

Add scraper for Hounslow.

import/raw
duncan.parkes há 16 anos
ascendente
cometimento
da2be2c394
3 ficheiros alterados com 74 adições e 0 eliminações
  1. +72
    -0
      trunk/python_scrapers/Hounslow.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 72
- 0
trunk/python_scrapers/Hounslow.py Ver ficheiro

@@ -0,0 +1,72 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime pattern used to render the search day into the strRecTo /
# strRecFrom parameters of the Hounslow search URL (day/month/year).
date_format = "%d/%m/%Y"

class HounslowParser:
    """Scraper for planning applications listed by the London Borough of Hounslow.

    Fetches the council's search results page for a single received date
    and converts every result table on it into a PlanningApplication.
    """

    def __init__(self, *args):
        """Set up the authority names, search URL template and results holder.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"

        # Limited to 500 cases - putting 1000 causes a default value of 50
        # to be used. 500 should be plenty.
        # The doubled %% are literal percent signs: the template is filled
        # in with the % operator, which would otherwise eat them.
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications received on the given date.

        day, month and year are integers passed straight to datetime.date,
        so an invalid date raises ValueError. Network failures from
        urllib2.urlopen propagate to the caller.

        Returns self._results. That object is created once in __init__, so
        calling this method repeatedly on one instance keeps adding to the
        same results object.
        """
        search_day = datetime.date(year, month, day)

        # Now get the search page. The response is closed explicitly so the
        # underlying connection is released even if reading fails.
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        try:
            page = response.read()
        finally:
            response.close()

        soup = BeautifulSoup(page)

        # Results are shown in a table each. The tables don't have any nice
        # attributes, but they do all contain a NavString "Application",
        # and nothing else does...
        nav_strings = soup.findAll(text="Application")

        for nav_string in nav_strings:
            result_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            # The code relies on link order within the table:
            # links[0] is the map link, links[1] the application itself,
            # links[2] the comment link.
            links = result_table.findAll("a")

            # We can get OSGB coordinates from the link to streetmap
            map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
            application.osgb_x = map_qs_dict.get("x")[0]
            application.osgb_y = map_qs_dict.get("y")[0]

            application.council_reference = links[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, links[1]['href'])
            application.comment_url = urlparse.urljoin(self.base_url, links[2]['href'])

            # The node just before the map link is used as the address;
            # split/join collapses any run of whitespace to single spaces.
            application.address = ' '.join(links[0].previous.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            # Likewise the node just before the comment link is used as the
            # description.
            application.description = links[2].previous.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the displayXML() output for the given date.

        day, month and year may be anything int() accepts (e.g. strings);
        they are converted before being handed to getResultsByDayMonthYear.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke run: fetch one fixed day and dump the resulting XML.
    hounslow_parser = HounslowParser()
    print(hounslow_parser.getResults(1, 8, 2008))


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Ver ficheiro

@@ -49,3 +49,4 @@
"Berwick.py", "420"
"Birmingham.py", "420"
"KingstonUponThames.py", "420"
"Hounslow.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Ver ficheiro

@@ -253,3 +253,4 @@
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"

Carregando…
Cancelar
Guardar