Browse source

Add scraper for Birmingham.

master
duncan.parkes 16 years ago
parent
commit
bcf26d0e43
3 changed files with 125 additions and 1 deletions
  1. python_scrapers/Birmingham.py (+122, -0)
  2. python_scrapers/OtherFilesToCopy.csv (+1, -0)
  3. python_scrapers/SitesToGenerate.csv (+2, -1)

python_scrapers/Birmingham.py (+122, -0)

@@ -0,0 +1,122 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class BirminghamParser:
    comments_email_address = "planning.enquiries@birmingham.gov.uk"

    def __init__(self, *args):

        self.authority_name = "Birmingham City Council"
        self.authority_short_name = "Birmingham"

        self.get_url = "http://www.birmingham.gov.uk/GenerateContent?CONTENT_ITEM_ID=67548&CONTENT_ITEM_TYPE=0&MENU_ID=12189"
        # What a lovely intuitive URL it is.
        self.for_cookie_url = "http://www.birmingham.gov.uk/PSR/control/main"
        self.post_url = "http://www.birmingham.gov.uk/PSR/control/searchresults"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # We seem to need to get this page in order to get a cookie
        for_cookie_request = urllib2.Request(self.for_cookie_url)
        for_cookie_response = urllib2.urlopen(for_cookie_request)
        cookie_jar.extract_cookies(for_cookie_response, for_cookie_request)

        post_data = [
            ("JAVASCRIPT_ENABLED", "FALSE"),
            ("txt_PSR_CurrentSearchPage", "0"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
            ("txt_PSR_Application_ApplicationNumber", ""),
            ("txt_PSR_Application_Status", "awaitingDecision"),
            ("txt_PSR_Application_TypeOfApplication", ""),
            ("txt_PSR_Application_DecisionType", ""),
            ("txt_PSR_Application_District", ""),
            ("txt_PSR_Application_Ward", ""),
            ("txt_PSR_Application_Location", ""),
            ("txt_PSR_Application_Applicant", ""),
            ("txt_PSR_Application_Agent", ""),
            ("txt_PSR_Application_SearchDay", day),
            ("txt_PSR_Application_SearchMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchYear", year),
            ("txt_PSR_Application_SearchToDay", day),
            ("txt_PSR_Application_SearchToMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchToYear", year),
            ("txt_PSR_Application_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Application_ResultsSkipRows", "0"),
            ("txt_PSR_Application_ResultsPerPage", "1000"), # That should be enough to keep things on one page
            ("btn_PSR_Application_ApplicationSearch", "Search"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Appeals_Form"),
            ("txt_PSR_Appeals_ApplicationNumber", ""),
            ("txt_PSR_Appeals_Status", "awaitingDecision"),
            ("txt_PSR_Appeals_TypeOfAppeal", ""),
            ("txt_PSR_Appeals_DecisionType", ""),
            ("txt_PSR_Appeals_District", ""),
            ("txt_PSR_Appeals_Ward", ""),
            ("txt_PSR_Appeals_Location", ""),
            ("txt_PSR_Appeals_Applicant", ""),
            ("txt_PSR_Appeals_Agent", ""),
            ("txt_PSR_Appeals_SearchDay", ""),
            ("txt_PSR_Appeals_SearchMonth", ""),
            ("txt_PSR_Appeals_SearchYear", ""),
            ("txt_PSR_Appeals_SearchToDay", ""),
            ("txt_PSR_Appeals_SearchToMonth", ""),
            ("txt_PSR_Appeals_SearchToYear", ""),
            ("txt_PSR_Appeals_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Appeals_ResultsSkipRows", "0"),
            ("txt_PSR_Appeals_ResultsPerPage", "10"),
            ]

        post_request = urllib2.Request(self.post_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)

        post_response = urllib2.urlopen(post_request)

        soup = BeautifulSoup(post_response.read())

        result_tables = soup.findAll("table", summary=re.compile("Summary of planning application"))

        for result_table in result_tables:
            application = PlanningApplication()
            application.info_url = urlparse.urljoin(self.post_url, result_table.find(text="Application number").findNext("a")['href'])
            application.council_reference = result_table.find(text="Application number").findNext("a").string
            application.date_received = search_day
            application.address = result_table.find(text="Location").findNext("td").p.string
            application.postcode = getPostcodeFromText(application.address)
            application.description = result_table.find(text="Proposal").findNext("td").p.string.replace("&nbsp;", " ").strip()
            # Comment link gives an Access Denied, so we'll have to use the email
            application.comment_url = self.comments_email_address

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = BirminghamParser()
    print parser.getResults(1,8,2008)
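
For anyone revisiting this cookie-then-POST flow with current tooling, it maps directly onto a requests session. The sketch below is not part of the original commit: it assumes the requests and beautifulsoup4 packages, and it assumes the 2008-era endpoints and form fields still behave as the scraper above expects, which is unlikely today.

# A minimal Python 3 sketch of the same search flow. Assumes requests and
# beautifulsoup4; the URLs and field names are taken from the scraper above
# and are untested against the current Birmingham site.
import re
import requests
from bs4 import BeautifulSoup

FOR_COOKIE_URL = "http://www.birmingham.gov.uk/PSR/control/main"
POST_URL = "http://www.birmingham.gov.uk/PSR/control/searchresults"

def search_applications(day, month, year):
    session = requests.Session()
    # Hitting this page first sets the session cookie, as in the original.
    session.get(FOR_COOKIE_URL)
    # The same zero-based month quirk carries over from the original form.
    data = [
        ("JAVASCRIPT_ENABLED", "FALSE"),
        ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
        ("txt_PSR_Application_Status", "awaitingDecision"),
        ("txt_PSR_Application_SearchDay", str(day)),
        ("txt_PSR_Application_SearchMonth", str(month - 1)),
        ("txt_PSR_Application_SearchYear", str(year)),
        ("txt_PSR_Application_SearchToDay", str(day)),
        ("txt_PSR_Application_SearchToMonth", str(month - 1)),
        ("txt_PSR_Application_SearchToYear", str(year)),
        ("txt_PSR_Application_ResultsPerPage", "1000"),
        ("btn_PSR_Application_ApplicationSearch", "Search"),
    ]
    response = session.post(POST_URL, data=data)
    soup = BeautifulSoup(response.text, "html.parser")
    # One result table per application, matched on its summary attribute.
    return soup.find_all("table", summary=re.compile("Summary of planning application"))

The Session object keeps the cookie jar between the two requests, which replaces the manual cookielib handling above.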


python_scrapers/OtherFilesToCopy.csv (+1, -0)

@@ -47,3 +47,4 @@
"Brent.py", "420" "Brent.py", "420"
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420" "Berwick.py", "420"
"Birmingham.py", "420"

python_scrapers/SitesToGenerate.csv (+2, -1)

@@ -250,4 +250,5 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser" "London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"
