Ver código fonte

Add scraper for Birmingham.

master
duncan.parkes 17 anos atrás
pai
commit
bcf26d0e43
3 arquivos alterados com 125 adições e 1 exclusões
  1. +122
    -0
      python_scrapers/Birmingham.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +2
    -1
      python_scrapers/SitesToGenerate.csv

+ 122
- 0
python_scrapers/Birmingham.py Ver arquivo

@@ -0,0 +1,122 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

import cookielib
# Module-level cookie jar shared by every request made from this module:
# getResultsByDayMonthYear extracts the site's cookies into it and then uses
# it to add the Cookie header to the search POST.
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime/strptime pattern for day/month/year dates.
# NOTE(review): nothing in the visible part of this module references it --
# confirm whether it is still needed.
date_format = "%d/%m/%Y"

class BirminghamParser:
    """Scraper for the Birmingham City Council planning application search.

    The search is driven by POSTing a form to ``post_url`` after first
    visiting ``for_cookie_url`` to obtain a cookie.  Every result table found
    in the response is turned into a ``PlanningApplication`` and added to the
    ``PlanningAuthorityResults`` object created in ``__init__``.
    """

    # The site's own comment link gives "Access Denied", so this address is
    # published as the comment URL of every application instead.
    comments_email_address = "planning.enquiries@birmingham.gov.uk"

    def __init__(self, *args):
        """Set up authority names, site URLs and the results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Birmingham City Council"
        self.authority_short_name = "Birmingham"

        self.get_url = "http://www.birmingham.gov.uk/GenerateContent?CONTENT_ITEM_ID=67548&CONTENT_ITEM_TYPE=0&MENU_ID=12189"
        # What a lovely intuitive URL it is.
        self.for_cookie_url = "http://www.birmingham.gov.uk/PSR/control/main"
        self.post_url = "http://www.birmingham.gov.uk/PSR/control/searchresults"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _build_post_data(self, day, month, year):
        """Return the search form fields for a single-day search.

        The result is a list of ``(name, value)`` pairs rather than a dict
        because ``PSR_CURRENT_FORM`` has to be sent twice (once for the
        applications form and once for the appeals form).
        """
        return [
            ("JAVASCRIPT_ENABLED", "FALSE"),
            ("txt_PSR_CurrentSearchPage", "0"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
            ("txt_PSR_Application_ApplicationNumber", ""),
            ("txt_PSR_Application_Status", "awaitingDecision"),
            ("txt_PSR_Application_TypeOfApplication", ""),
            ("txt_PSR_Application_DecisionType", ""),
            ("txt_PSR_Application_District", ""),
            ("txt_PSR_Application_Ward", ""),
            ("txt_PSR_Application_Location", ""),
            ("txt_PSR_Application_Applicant", ""),
            ("txt_PSR_Application_Agent", ""),
            ("txt_PSR_Application_SearchDay", day),
            ("txt_PSR_Application_SearchMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchYear", year),
            ("txt_PSR_Application_SearchToDay", day),
            ("txt_PSR_Application_SearchToMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchToYear", year),
            ("txt_PSR_Application_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Application_ResultsSkipRows", "0"),
            ("txt_PSR_Application_ResultsPerPage", "1000"), # That should be enough to keep things on one page
            ("btn_PSR_Application_ApplicationSearch", "Search"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Appeals_Form"),
            ("txt_PSR_Appeals_ApplicationNumber", ""),
            ("txt_PSR_Appeals_Status", "awaitingDecision"),
            ("txt_PSR_Appeals_TypeOfAppeal", ""),
            ("txt_PSR_Appeals_DecisionType", ""),
            ("txt_PSR_Appeals_District", ""),
            ("txt_PSR_Appeals_Ward", ""),
            ("txt_PSR_Appeals_Location", ""),
            ("txt_PSR_Appeals_Applicant", ""),
            ("txt_PSR_Appeals_Agent", ""),
            ("txt_PSR_Appeals_SearchDay", ""),
            ("txt_PSR_Appeals_SearchMonth", ""),
            ("txt_PSR_Appeals_SearchYear", ""),
            ("txt_PSR_Appeals_SearchToDay", ""),
            ("txt_PSR_Appeals_SearchToMonth", ""),
            ("txt_PSR_Appeals_SearchToYear", ""),
            ("txt_PSR_Appeals_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Appeals_ResultsSkipRows", "0"),
            ("txt_PSR_Appeals_ResultsPerPage", "10"),
        ]

    def _parse_result_table(self, result_table, search_day):
        """Build a ``PlanningApplication`` from one result table.

        ``result_table`` is a BeautifulSoup ``<table>`` element from the
        search results page; ``search_day`` is the ``datetime.date`` that was
        searched for and is recorded as the date received.
        """
        application = PlanningApplication()

        # Look the reference link up once; it supplies both the detail page
        # URL and the council reference.
        reference_link = result_table.find(text="Application number").findNext("a")
        application.info_url = urlparse.urljoin(self.post_url, reference_link['href'])
        application.council_reference = reference_link.string

        application.date_received = search_day
        application.address = result_table.find(text="Location").findNext("td").p.string
        application.postcode = getPostcodeFromText(application.address)

        # Turn non-breaking spaces into ordinary ones.  Both spellings are
        # handled explicitly: the raw U+00A0 character and a literal
        # "&nbsp;" entity that the HTML parser may have left undecoded.
        description = result_table.find(text="Proposal").findNext("td").p.string
        application.description = description.replace(u"\xa0", u" ").replace(u"&nbsp;", u" ").strip()

        # Comment link gives an Access Denied, so we'll have to use the email
        application.comment_url = self.comments_email_address

        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications for one day.

        ``day``, ``month`` and ``year`` are integers (``month`` is 1-based;
        the zero-based value the site expects is derived internally).
        Returns the ``PlanningAuthorityResults`` object created in
        ``__init__`` after adding one application per result table found.
        Raises ``ValueError`` if the three values do not form a valid date,
        and lets any ``urllib2`` error from the two HTTP requests propagate.
        """
        search_day = datetime.date(year, month, day)

        # We seem to need to get this page in order to get a cookie
        for_cookie_request = urllib2.Request(self.for_cookie_url)
        for_cookie_response = urllib2.urlopen(for_cookie_request)
        try:
            cookie_jar.extract_cookies(for_cookie_response, for_cookie_request)
        finally:
            # Only the headers are needed, so release the connection now.
            for_cookie_response.close()

        post_data = self._build_post_data(day, month, year)
        post_request = urllib2.Request(self.post_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)

        post_response = urllib2.urlopen(post_request)
        try:
            soup = BeautifulSoup(post_response.read())
        finally:
            post_response.close()

        result_tables = soup.findAll("table", summary=re.compile("Summary of planning application"))

        for result_table in result_tables:
            self._results.addApplication(self._parse_result_table(result_table, search_day))

        return self._results

    def getResults(self, day, month, year):
        """Return the day's results rendered by the results object's ``displayXML()``.

        ``day``, ``month`` and ``year`` may be ints or numeric strings; they
        are converted with ``int()`` before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape 1 August 2008 and dump the XML to stdout.
    birmingham_parser = BirminghamParser()
    xml_output = birmingham_parser.getResults(1, 8, 2008)
    print(xml_output)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Ver arquivo

@@ -47,3 +47,4 @@
"Brent.py", "420" "Brent.py", "420"
"Carmarthenshire.py", "420" "Carmarthenshire.py", "420"
"Berwick.py", "420" "Berwick.py", "420"
"Birmingham.py", "420"

+ 2
- 1
python_scrapers/SitesToGenerate.csv Ver arquivo

@@ -250,4 +250,5 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser" "London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser" "Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"

Carregando…
Cancelar
Salvar