Browse Source

Add scraper for Birmingham.

import/raw
duncan.parkes 16 years ago
parent
commit
08e63c7566
3 changed files with 125 additions and 1 deletions
  1. +122
    -0
      trunk/python_scrapers/Birmingham.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +2
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 122
- 0
trunk/python_scrapers/Birmingham.py View File

@@ -0,0 +1,122 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime/strptime-style pattern for day/month/year dates (e.g. 01/08/2008).
# NOTE(review): nothing in the code visible in this file references it.
date_format = "%d/%m/%Y"

class BirminghamParser:
    """Scraper for planning applications published by Birmingham City Council.

    The council's search form is POSTed with a single-day date range and the
    resulting HTML is parsed for one summary table per application.
    """

    # The site's own comment link gives an Access Denied, so applications
    # carry this email address as their comment "URL" instead.
    comments_email_address = "planning.enquiries@birmingham.gov.uk"

    def __init__(self, *args):
        """Set up the authority names, the site URLs and an empty result set.

        Positional arguments are accepted and ignored.
        """
        self.authority_name = "Birmingham City Council"
        self.authority_short_name = "Birmingham"

        self.get_url = "http://www.birmingham.gov.uk/GenerateContent?CONTENT_ITEM_ID=67548&CONTENT_ITEM_TYPE=0&MENU_ID=12189"
        # What a lovely intuitive URL it is.
        self.for_cookie_url = "http://www.birmingham.gov.uk/PSR/control/main"
        self.post_url = "http://www.birmingham.gov.uk/PSR/control/searchresults"

        # Created once per instance: every search on this instance appends
        # to the same result set.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _build_post_data(self, day, month, year):
        """Return the search form fields, in form order, for a one-day search.

        The list holds (name, value) pairs rather than a dict because the
        form repeats the PSR_CURRENT_FORM field and the order is preserved.
        """
        # Months are counted from zero by the search form.
        form_month = month - 1

        return [
            ("JAVASCRIPT_ENABLED", "FALSE"),
            ("txt_PSR_CurrentSearchPage", "0"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
            ("txt_PSR_Application_ApplicationNumber", ""),
            ("txt_PSR_Application_Status", "awaitingDecision"),
            ("txt_PSR_Application_TypeOfApplication", ""),
            ("txt_PSR_Application_DecisionType", ""),
            ("txt_PSR_Application_District", ""),
            ("txt_PSR_Application_Ward", ""),
            ("txt_PSR_Application_Location", ""),
            ("txt_PSR_Application_Applicant", ""),
            ("txt_PSR_Application_Agent", ""),
            ("txt_PSR_Application_SearchDay", day),
            ("txt_PSR_Application_SearchMonth", form_month),
            ("txt_PSR_Application_SearchYear", year),
            ("txt_PSR_Application_SearchToDay", day),
            ("txt_PSR_Application_SearchToMonth", form_month),
            ("txt_PSR_Application_SearchToYear", year),
            ("txt_PSR_Application_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Application_ResultsSkipRows", "0"),
            ("txt_PSR_Application_ResultsPerPage", "1000"),  # That should be enough to keep things on one page
            ("btn_PSR_Application_ApplicationSearch", "Search"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Appeals_Form"),
            ("txt_PSR_Appeals_ApplicationNumber", ""),
            ("txt_PSR_Appeals_Status", "awaitingDecision"),
            ("txt_PSR_Appeals_TypeOfAppeal", ""),
            ("txt_PSR_Appeals_DecisionType", ""),
            ("txt_PSR_Appeals_District", ""),
            ("txt_PSR_Appeals_Ward", ""),
            ("txt_PSR_Appeals_Location", ""),
            ("txt_PSR_Appeals_Applicant", ""),
            ("txt_PSR_Appeals_Agent", ""),
            ("txt_PSR_Appeals_SearchDay", ""),
            ("txt_PSR_Appeals_SearchMonth", ""),
            ("txt_PSR_Appeals_SearchYear", ""),
            ("txt_PSR_Appeals_SearchToDay", ""),
            ("txt_PSR_Appeals_SearchToMonth", ""),
            ("txt_PSR_Appeals_SearchToYear", ""),
            ("txt_PSR_Appeals_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Appeals_ResultsSkipRows", "0"),
            ("txt_PSR_Appeals_ResultsPerPage", "10"),
        ]

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications on one day and add them to the results.

        day, month and year are integers (month is 1-based; it is converted
        to the form's 0-based month internally). Raises ValueError from
        datetime.date for an impossible date, before any request is made.

        Returns the instance's accumulated PlanningAuthorityResults object.
        """
        search_day = datetime.date(year, month, day)

        # We seem to need to get this page in order to get a cookie
        for_cookie_request = urllib2.Request(self.for_cookie_url)
        for_cookie_response = urllib2.urlopen(for_cookie_request)
        try:
            cookie_jar.extract_cookies(for_cookie_response, for_cookie_request)
        finally:
            for_cookie_response.close()

        post_data = self._build_post_data(day, month, year)
        post_request = urllib2.Request(self.post_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)

        post_response = urllib2.urlopen(post_request)
        try:
            soup = BeautifulSoup(post_response.read())
        finally:
            post_response.close()

        result_tables = soup.findAll("table", summary=re.compile("Summary of planning application"))

        for result_table in result_tables:
            application = PlanningApplication()

            # The link after the "Application number" label supplies both
            # the detail page URL and the council's reference.
            reference_link = result_table.find(text="Application number").findNext("a")
            application.info_url = urlparse.urljoin(self.post_url, reference_link['href'])
            application.council_reference = reference_link.string

            # The search covers a single day, so that day is the received date.
            application.date_received = search_day
            application.address = result_table.find(text="Location").findNext("td").p.string
            application.postcode = getPostcodeFromText(application.address)

            # NOTE(review): the replaced literal appears to have been an HTML
            # non-breaking space; both the raw entity and the decoded
            # character are normalised - confirm against the live markup.
            proposal = result_table.find(text="Proposal").findNext("td").p.string
            application.description = proposal.replace("&nbsp;", " ").replace(u"\xa0", " ").strip()

            # Comment link gives an Access Denied, so we'll have to use the email
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Run the search for the given date and return displayXML() of the results.

        day, month and year may be ints or numeric strings; each is passed
        through int() first.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

# Manual smoke test: run the scraper for 1 August 2008 and print the output.
if __name__ == '__main__':
    parser = BirminghamParser()
    # Parenthesised single argument prints identically under the Python 2
    # print statement and also parses as a function call.
    print(parser.getResults(1, 8, 2008))


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -47,3 +47,4 @@
"Brent.py", "420" "Brent.py", "420"
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420" "Berwick.py", "420"
"Birmingham.py", "420"

+ 2
- 1
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -250,4 +250,5 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser" "London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"

Loading…
Cancel
Save