
Add scraper for Hounslow.

master
duncan.parkes 16 years ago
parent
commit
98f49f172f
3 changed files with 74 additions and 0 deletions
  1. +72 -0 python_scrapers/Hounslow.py
  2. +1 -0 python_scrapers/OtherFilesToCopy.csv
  3. +1 -0 python_scrapers/SitesToGenerate.csv

+72 -0 python_scrapers/Hounslow.py View File

@@ -0,0 +1,72 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class HounslowParser:
    def __init__(self, *args):

        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
        # Limited to 500 cases - putting 1000 causes a default value of 50 to be used. 500 should be plenty.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Now get the search page
        response = urllib2.urlopen(self.base_url %{"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Results are shown in a table each. The tables don't have any nice
        # attributes, but they do all contain a NavString "Application",
        # and nothing else does...
        nav_strings = soup.findAll(text="Application")

        for nav_string in nav_strings:
            result_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            links = result_table.findAll("a")

            # We can get OSGB coordinates from the link to streetmap
            map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
            application.osgb_x = map_qs_dict.get("x")[0]
            application.osgb_y = map_qs_dict.get("y")[0]

            application.council_reference = links[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, links[1]['href'])
            application.comment_url = urlparse.urljoin(self.base_url, links[2]['href'])

            application.address = ' '.join(links[0].previous.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = links[2].previous.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HounslowParser()
    print parser.getResults(1,8,2008)
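
The two fragile points in this scraper are locating each per-application table (the markup has no useful attributes, so the code anchors on the bare "Application" string) and recovering OSGB coordinates from the streetmap link's query string. Below is a minimal, self-contained sketch of both techniques, not part of the commit; the HTML is an invented stand-in for the real Hounslow markup.

import cgi
import urlparse

from BeautifulSoup import BeautifulSoup

# Invented stand-in for one result table; the real page wraps each
# application in a table like this with no distinguishing attributes.
html = """
<table>
  <tr><td>Application</td></tr>
  <tr><td>1 High Street TW3 1AB<a href="http://www.streetmap.co.uk/map.srf?x=512345&y=176543">Map</a></td></tr>
</table>
"""

soup = BeautifulSoup(html)

# Each NavigableString "Application" sits inside exactly one result table,
# so findPrevious("table") walks back to the enclosing table element.
for nav_string in soup.findAll(text="Application"):
    result_table = nav_string.findPrevious("table")
    map_link = result_table.findAll("a")[0]

    # urlsplit()[3] is the query string; parse_qs maps each key to a list.
    qs_dict = cgi.parse_qs(urlparse.urlsplit(map_link['href'])[3])
    print qs_dict["x"][0], qs_dict["y"][0]  # -> 512345 176543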


+1 -0 python_scrapers/OtherFilesToCopy.csv View File

@@ -49,3 +49,4 @@
"Berwick.py", "420"
"Birmingham.py", "420"
"KingstonUponThames.py", "420"
"Hounslow.py", "420"

+1 -0 python_scrapers/SitesToGenerate.csv View File

@@ -253,3 +253,4 @@
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
