
Add scraper for Hounslow.

master
duncan.parkes 16 years ago
parent
commit
98f49f172f
3 changed files with 74 additions and 0 deletions
  1. python_scrapers/Hounslow.py (+72, -0)
  2. python_scrapers/OtherFilesToCopy.csv (+1, -0)
  3. python_scrapers/SitesToGenerate.csv (+1, -0)

python_scrapers/Hounslow.py (+72, -0)

@@ -0,0 +1,72 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class HounslowParser:
    def __init__(self, *args):
        self.authority_name = "London Borough of Hounslow"
        self.authority_short_name = "Hounslow"
        self.base_url = "http://planning.hounslow.gov.uk/planningv2/planning_summary.aspx?strWeekListType=SRCH&strRecTo=%(date)s&strRecFrom=%(date)s&strWard=ALL&strAppTyp=ALL&strWardTxt=All%%20Wards&strAppTypTxt=All%%20Application%%20Types&strArea=ALL&strAreaTxt=All%%20Areas&strStreet=ALL&strStreetTxt=All%%20Streets&strPC=&strLimit=500"
        # Limited to 500 cases - asking for 1000 causes a default value of 50 to be used. 500 should be plenty.

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # Fetch the search page for that day.
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        # Each result is shown in its own table. The tables have no
        # distinguishing attributes, but each contains the NavigableString
        # "Application", and nothing else on the page does...
        nav_strings = soup.findAll(text="Application")

        for nav_string in nav_strings:
            result_table = nav_string.findPrevious("table")

            application = PlanningApplication()
            application.date_received = search_day

            links = result_table.findAll("a")

            # We can get OSGB coordinates from the query string of the streetmap link.
            map_qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
            application.osgb_x = map_qs_dict.get("x")[0]
            application.osgb_y = map_qs_dict.get("y")[0]

            application.council_reference = links[1].string.strip()
            application.info_url = urlparse.urljoin(self.base_url, links[1]['href'])
            application.comment_url = urlparse.urljoin(self.base_url, links[2]['href'])

            application.address = ' '.join(links[0].previous.strip().split())
            application.postcode = getPostcodeFromText(application.address)

            application.description = links[2].previous.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = HounslowParser()
    print parser.getResults(1, 8, 2008)
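
Two details of the parse above are worth illustrating: the double-percent escaping inside base_url, and the NavigableString-to-table walk used to locate each result. What follows is an illustrative sketch only, not part of the commit; it runs under Python 2 with BeautifulSoup 3, and the HTML snippet and streetmap href are invented stand-ins for what the Hounslow pages actually return.

# Illustrative only -- not part of the commit.
import cgi
import urlparse
from BeautifulSoup import BeautifulSoup

# The %% escapes in base_url survive dict substitution as literal %20s,
# while %(date)s is filled in:
template = "planning_summary.aspx?strRecTo=%(date)s&strWardTxt=All%%20Wards"
print template % {"date": "01/08/2008"}
# -> planning_summary.aspx?strRecTo=01/08/2008&strWardTxt=All%20Wards

# An invented result table of the rough shape the scraper expects.
html = """
<table>
  <tr><td><a href="http://www.streetmap.co.uk/map.srf?x=517000&y=176000">Map</a></td>
      <td>An address, Hounslow TW3 1DN</td></tr>
  <tr><td>Application</td>
      <td><a href="planning_detail.aspx?id=1">P/2008/1234</a></td></tr>
  <tr><td><a href="comment.aspx?id=1">Comment</a></td></tr>
</table>
"""
soup = BeautifulSoup(html)

# Find each result via the NavigableString "Application", then walk back
# to the enclosing <table>, exactly as getResultsByDayMonthYear does.
for nav_string in soup.findAll(text="Application"):
    result_table = nav_string.findPrevious("table")
    links = result_table.findAll("a")

    # The streetmap link carries OSGB coordinates in its query string;
    # urlsplit()[3] is the query component.
    qs_dict = cgi.parse_qs(urlparse.urlsplit(links[0]['href'])[3])
    print qs_dict["x"][0], qs_dict["y"][0]   # -> 517000 176000

Note how brittle the heuristic is by design: it depends on "Application" appearing in every result table and nowhere else on the page, so a site redesign breaks the scraper loudly rather than silently.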


python_scrapers/OtherFilesToCopy.csv (+1, -0)

@@ -49,3 +49,4 @@
"Berwick.py", "420"
"Birmingham.py", "420"
"KingstonUponThames.py", "420"
"Hounslow.py", "420"

python_scrapers/SitesToGenerate.csv (+1, -0)

@@ -253,3 +253,4 @@
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
"Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
"London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
