Procházet zdrojové kódy

Add scraper for Brent.

import/raw
duncan.parkes před 16 roky
rodič
revize
af50d991f3
3 změnil soubory, kde provedl 127 přidání a 0 odebrání
  1. +125
    -0
      trunk/python_scrapers/Brent.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 125
- 0
trunk/python_scrapers/Brent.py Zobrazit soubor

@@ -0,0 +1,125 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class BrentParser:
def __init__(self, *args):

self.authority_name = "London Borough of Brent"
self.authority_short_name = "Brent"
# self.base_url = "http://www.brent.gov.uk/servlet/ep.ext?extId=101149&byPeriod=Y&st=PL&periodUnits=day&periodMultiples=14"
self.base_url = "http://www.brent.gov.uk/servlet/ep.ext"

self._current_application = None

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

post_data = [
("from", search_day.strftime(date_format)),
("until", search_day.strftime(date_format)),
("EXECUTEQUERY", "Query"),
# ("auth", "402"),
("st", "PL"),
("periodUnits", "day"),
("periodMultiples", "14"),
("title", "Search+by+Application+Date"),
("instructions", "Enter+a+date+range+to+search+for+existing+applications+by+the+date+of+application.%0D%0A%3Cbr%3E%3Cbr%3E%0D%0A%3Cstrong%3ENote%3A%3C%2Fstrong%3E+Where+%27%28Applicant%27s+Description%29%27+appears+in+the+proposal%2C+the+text+may+subsequently+be+amended+when+the+application+is+checked."),
("byFormat", "N"),
("byOther1", "N"),
("byOther2", "N"),
("byOther3", "N"),
("byOther4", "N"),
("byOther5", "N"),
("byPostcode", "N"),
("byStreet", "N"),
("byHouseNumber", "N"),
("byAddress", "N"),
("byPeriod", "Y"),
("extId", "101149"), # I wonder what this is...
("queried", "Y"),
("other1Label", "Other1"),
("other2Label", "Other2"),
("other3Label", "Other3"),
("other4Label", "Other4"),
("other5Label", "Other5"),
("other1List", ""),
("other2List", ""),
("other3List", ""),
("other4List", ""),
("other5List", ""),
("periodLabel", "From"),
("addressLabel", "Select+Address"),
("print", "")
]

# Now get the search page
response = urllib2.urlopen(self.base_url, urllib.urlencode(post_data))

soup = BeautifulSoup(response.read())

trs = soup.find(text="Search Results").findNext("table").findAll("tr")[:-1]

# There are six trs per application, ish

# The first contains the case no and the application date.
# The second contains the address
# The third contains the description
# The fourth contains the info page link
# The fifth contains the comment link (or a note that comments are currently not being accepted
# The sixth is a spacer.

count = 0
for tr in trs:
count +=1

ref = tr.find(text=re.compile("Case No:"))
if ref:
self._current_application = PlanningApplication()
count = 1

self._current_application.council_reference = ref.split(":")[1].strip()
self._current_application.date_received = search_day

if count % 6 == 2:
self._current_application.address = tr.td.string.strip()
self._current_application.postcode = getPostcodeFromText(self._current_application.address)
if count % 6 == 3:
self._current_application.description = tr.td.string.strip()
if count % 6 == 4:
self._current_application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
if count % 6 == 5:
try:
self._current_application.comment_url = urlparse.urljoin(self.base_url, tr.a['href'])
except:
# Comments are not currently being accepted. We'll leave this app for the moment - we'll pick it up later if they start accepting comments
continue
if count % 6 == 0 and self._current_application.is_ready():
self._results.addApplication(self._current_application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = BrentParser()
print parser.getResults(6,8,2008)


+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -44,3 +44,4 @@
"Redbridge.cgi", "493"
"AmberValley.py", "420"
"Aberdeenshire.py", "420"
"Brent.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -248,3 +248,4 @@
"London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser"

Načítá se…
Zrušit
Uložit