Przeglądaj źródła

Add scraper for Birmingham.

import/raw
duncan.parkes 16 lat temu
rodzic
commit
08e63c7566
3 zmienionych plików z 125 dodań i 1 usunięć
  1. +122
    -0
      trunk/python_scrapers/Birmingham.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +2
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 122
- 0
trunk/python_scrapers/Birmingham.py Wyświetl plik

@@ -0,0 +1,122 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

import cookielib
# Module-level cookie jar shared by all parser instances in this module.
# getResultsByDayMonthYear fills it from a preliminary request and then uses
# it to attach the Cookie header to the search POST.
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# Day/month/year pattern in strftime/strptime syntax.
# NOTE(review): not referenced anywhere in the code visible here -- confirm
# whether anything else relies on it before removing.
date_format = "%d/%m/%Y"

class BirminghamParser:
    """Scraper for the Birmingham City Council planning application search.

    The search is driven by POSTing a form to ``post_url``.  A cookie is
    obtained first from ``for_cookie_url`` and sent along with the POST.
    Every application found is appended to ``self._results``, which is
    created once in ``__init__`` -- so repeated searches on the same
    instance accumulate into the same results object.
    """

    comments_email_address = "planning.enquiries@birmingham.gov.uk"

    def __init__(self, *args):
        """Set up the authority names, the site URLs and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Birmingham City Council"
        self.authority_short_name = "Birmingham"

        self.get_url = "http://www.birmingham.gov.uk/GenerateContent?CONTENT_ITEM_ID=67548&CONTENT_ITEM_TYPE=0&MENU_ID=12189"
        # What a lovely intuitive URL it is.
        self.for_cookie_url = "http://www.birmingham.gov.uk/PSR/control/main"
        self.post_url = "http://www.birmingham.gov.uk/PSR/control/searchresults"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _build_post_data(self, day, month, year):
        """Return the search form fields, in submission order, for one day.

        The result is a list of (name, value) pairs rather than a dict
        because the form repeats the ``PSR_CURRENT_FORM`` field and the
        original field order is preserved.
        """
        return [
            ("JAVASCRIPT_ENABLED", "FALSE"),
            ("txt_PSR_CurrentSearchPage", "0"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Application_Form"),
            ("txt_PSR_Application_ApplicationNumber", ""),
            ("txt_PSR_Application_Status", "awaitingDecision"),
            ("txt_PSR_Application_TypeOfApplication", ""),
            ("txt_PSR_Application_DecisionType", ""),
            ("txt_PSR_Application_District", ""),
            ("txt_PSR_Application_Ward", ""),
            ("txt_PSR_Application_Location", ""),
            ("txt_PSR_Application_Applicant", ""),
            ("txt_PSR_Application_Agent", ""),
            ("txt_PSR_Application_SearchDay", day),
            ("txt_PSR_Application_SearchMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchYear", year),
            ("txt_PSR_Application_SearchToDay", day),
            ("txt_PSR_Application_SearchToMonth", month-1), # Months are counted from zero...
            ("txt_PSR_Application_SearchToYear", year),
            ("txt_PSR_Application_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Application_ResultsSkipRows", "0"),
            ("txt_PSR_Application_ResultsPerPage", "1000"), # That should be enough to keep things on one page
            ("btn_PSR_Application_ApplicationSearch", "Search"),
            ("PSR_CURRENT_FORM", "psr_Application_PSRSearch_Appeals_Form"),
            ("txt_PSR_Appeals_ApplicationNumber", ""),
            ("txt_PSR_Appeals_Status", "awaitingDecision"),
            ("txt_PSR_Appeals_TypeOfAppeal", ""),
            ("txt_PSR_Appeals_DecisionType", ""),
            ("txt_PSR_Appeals_District", ""),
            ("txt_PSR_Appeals_Ward", ""),
            ("txt_PSR_Appeals_Location", ""),
            ("txt_PSR_Appeals_Applicant", ""),
            ("txt_PSR_Appeals_Agent", ""),
            ("txt_PSR_Appeals_SearchDay", ""),
            ("txt_PSR_Appeals_SearchMonth", ""),
            ("txt_PSR_Appeals_SearchYear", ""),
            ("txt_PSR_Appeals_SearchToDay", ""),
            ("txt_PSR_Appeals_SearchToMonth", ""),
            ("txt_PSR_Appeals_SearchToYear", ""),
            ("txt_PSR_Appeals_SearchSortOrder", "LatestFirst"),
            ("txt_PSR_Appeals_ResultsSkipRows", "0"),
            ("txt_PSR_Appeals_ResultsPerPage", "10"),
        ]

    def _fetch_results_page(self, post_data):
        """POST the search form and return the raw body of the results page.

        Both HTTP responses are closed explicitly (try/finally, since
        urllib2 responses are not context managers in Python 2) so the
        underlying connections are not left open.
        """
        # We seem to need to get this page in order to get a cookie
        for_cookie_request = urllib2.Request(self.for_cookie_url)
        for_cookie_response = urllib2.urlopen(for_cookie_request)
        try:
            cookie_jar.extract_cookies(for_cookie_response, for_cookie_request)
        finally:
            for_cookie_response.close()

        post_request = urllib2.Request(self.post_url, urllib.urlencode(post_data))
        cookie_jar.add_cookie_header(post_request)

        post_response = urllib2.urlopen(post_request)
        try:
            return post_response.read()
        finally:
            post_response.close()

    def _parse_application(self, result_table, search_day):
        """Build a PlanningApplication from one result summary table."""
        # The link after the "Application number" label carries both the
        # detail page URL and the council reference; look it up once.
        reference_link = result_table.find(text="Application number").findNext("a")

        application = PlanningApplication()
        application.info_url = urlparse.urljoin(self.post_url, reference_link['href'])
        application.council_reference = reference_link.string
        application.date_received = search_day
        application.address = result_table.find(text="Location").findNext("td").p.string
        application.postcode = getPostcodeFromText(application.address)

        description = result_table.find(text="Proposal").findNext("td").p.string
        # NOTE(review): the previous code replaced a space with a space,
        # which does nothing; it looks like a non-breaking-space clean-up
        # whose literal got flattened.  Normalise both the unconverted
        # entity and the U+00A0 character -- confirm against the live page.
        application.description = description.replace("&nbsp;", " ").replace(u"\xa0", " ").strip()

        # Comment link gives an Access Denied, so we'll have to use the email
        application.comment_url = self.comments_email_address
        return application

    def getResultsByDayMonthYear(self, day, month, year):
        """Search for applications on the given date and collect them.

        day, month and year are integers; month is 1-based here and is
        converted to the site's zero-based month when the form is built.
        The date is used as both ends of the search range and is recorded
        as date_received on every application found.

        Returns self._results with the newly found applications appended.
        Raises ValueError (from datetime.date) if the date is invalid.
        """
        search_day = datetime.date(year, month, day)

        page = self._fetch_results_page(self._build_post_data(day, month, year))
        soup = BeautifulSoup(page)

        result_tables = soup.findAll("table", summary=re.compile("Summary of planning application"))

        for result_table in result_tables:
            self._results.addApplication(self._parse_application(result_table, search_day))

        return self._results

    def getResults(self, day, month, year):
        """Run the search for one date and return displayXML() of the results.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = BirminghamParser()
print parser.getResults(1,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Wyświetl plik

@@ -47,3 +47,4 @@
"Brent.py", "420"
"Carmarthenshire.py", "420"
"Berwick.py", "420"
"Birmingham.py", "420"

+ 2
- 1
trunk/python_scrapers/SitesToGenerate.csv Wyświetl plik

@@ -250,4 +250,5 @@
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Berwick-upon-Tweed Borough Council", "Berwick", "", "Berwick", "BerwickParser"
"Birmingham City Council", "Birmingham", "", "Birmingham", "BirminghamParser"

Ładowanie…
Anuluj
Zapisz