Просмотр исходного кода

Add scraper for Birmingham.

import/raw
duncan.parkes 18 лет назад
Родитель
Сommit
08e63c7566
3 измененных файлов: 125 добавлений и 1 удалений
  1. +122
    -0
      trunk/python_scrapers/Birmingham.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +2
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 122
- 0
trunk/python_scrapers/Birmingham.py Просмотреть файл

@@ -0,0 +1,122 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class BirminghamParser:
comments_email_address = "planning.enquiries@birmingham.gov.uk"

def __init__(self, *args):

self.authority_name = "Birmingham City Council"
self.authority_short_name = "Birmingham"

self.get_url = "http://www.birmingham.gov.uk/GenerateContent?CONTENT_ITEM_ID=67548&CONTENT_ITEM_TYPE=0&MENU_ID=12189"
# What a lovely intuitive URL it is.
self.for_cookie_url = "http://www.birmingham.gov.uk/PSR/control/main"
self.post_url = "http://www.birmingham.gov.uk/PSR/control/searchresults"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# We seem to need to get this page in order to get a cookie
for_cookie_request = urllib2.Request(self.for_cookie_url)
for_cookie_response = urllib2.urlopen(for_cookie_request)
cookie_jar.extract_cookies(for_cookie_response, for_cookie_request)

post_data = [
("JAVASCRIPT_ENABLED", "FALSE"),
("txt_PSR_CurrentSearchPage", "0"),
("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
("txt_PSR_Application_ApplicationNumber", ""),
("txt_PSR_Application_Status", "awaitingDecision"),
("txt_PSR_Application_TypeOfApplication", ""),
("txt_PSR_Application_DecisionType", ""),
("txt_PSR_Application_District", ""),
("txt_PSR_Application_Ward", ""),
("txt_PSR_Application_Location", ""),
("txt_PSR_Application_Applicant", ""),
("txt_PSR_Application_Agent", ""),
("txt_PSR_Application_SearchDay", day),
("txt_PSR_Application_SearchMonth", month-1), # Months are counted from zero...
("txt_PSR_Application_SearchYear", year),
("txt_PSR_Application_SearchToDay", day),
("txt_PSR_Application_SearchToMonth", month-1), # Months are counted from zero...
("txt_PSR_Application_SearchToYear", year),
("txt_PSR_Application_SearchSortOrder", "LatestFirst"),
("txt_PSR_Application_ResultsSkipRows", "0"),
("txt_PSR_Application_ResultsPerPage", "1000"), # That should be enough to keep things on one page
("btn_PSR_Application_ApplicationSearch", "Search"),
("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Appeals_Form"),
("txt_PSR_Appeals_ApplicationNumber", ""),
("txt_PSR_Appeals_Status", "awaitingDecision"),
("txt_PSR_Appeals_TypeOfAppeal", ""),
("txt_PSR_Appeals_DecisionType", ""),
("txt_PSR_Appeals_District", ""),
("txt_PSR_Appeals_Ward", ""),
("txt_PSR_Appeals_Location", ""),
("txt_PSR_Appeals_Applicant", ""),
("txt_PSR_Appeals_Agent", ""),
("txt_PSR_Appeals_SearchDay", ""),
("txt_PSR_Appeals_SearchMonth", ""),
("txt_PSR_Appeals_SearchYear", ""),
("txt_PSR_Appeals_SearchToDay", ""),
("txt_PSR_Appeals_SearchToMonth", ""),
("txt_PSR_Appeals_SearchToYear", ""),
("txt_PSR_Appeals_SearchSortOrder", "LatestFirst"),
("txt_PSR_Appeals_ResultsSkipRows", "0"),
("txt_PSR_Appeals_ResultsPerPage", "10"),
]


post_request = urllib2.Request(self.post_url, urllib.urlencode(post_data))
cookie_jar.add_cookie_header(post_request)

post_response = urllib2.urlopen(post_request)

soup = BeautifulSoup(post_response.read())

result_tables = soup.findAll("table", summary=re.compile("Summary of planning application"))

for result_table in result_tables:
application = PlanningApplication()
application.info_url = urlparse.urljoin(self.post_url, result_table.find(text="Application number").findNext("a")['href'])
application.council_reference = result_table.find(text="Application number").findNext("a").string
application.date_received = search_day
application.address = result_table.find(text="Location").findNext("td").p.string
application.postcode = getPostcodeFromText(application.address)
application.description = result_table.find(text="Proposal").findNext("td").p.string.replace(" ", " ").strip()
# Comment link gives an Access Denied, so we'll have to use the email
application.comment_url = self.comments_email_address

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = BirminghamParser()
print parser.getResults(1,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Просмотреть файл

@@ -47,3 +47,4 @@
"Brent.py", "420" "Brent.py", "420"
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420" "Berwick.py", "420"
"Birmingham.py", "420"

+ 2
- 1
trunk/python_scrapers/SitesToGenerate.csv Просмотреть файл

@@ -250,4 +250,5 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser" "London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"

Загрузка…
Отмена
Сохранить