Ver código fonte

Add scraper for Brent.

master
duncan.parkes 17 anos atrás
pai
commit
193f2a6cac
3 arquivos alterados com 127 adições e 0 exclusões
  1. +125
    -0
      python_scrapers/Brent.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 125
- 0
python_scrapers/Brent.py Ver arquivo

@@ -0,0 +1,125 @@

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class BrentParser:
def __init__(self, *args):

self.authority_name = "London Borough of Brent"
self.authority_short_name = "Brent"
# self.base_url = "http://www.brent.gov.uk/servlet/ep.ext?extId=101149&byPeriod=Y&st=PL&periodUnits=day&periodMultiples=14"
self.base_url = "http://www.brent.gov.uk/servlet/ep.ext"

self._current_application = None

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

post_data = [
("from", search_day.strftime(date_format)),
("until", search_day.strftime(date_format)),
("EXECUTEQUERY", "Query"),
# ("auth", "402"),
("st", "PL"),
("periodUnits", "day"),
("periodMultiples", "14"),
("title", "Search+by+Application+Date"),
("instructions", "Enter+a+date+range+to+search+for+existing+applications+by+the+date+of+application.%0D%0A%3Cbr%3E%3Cbr%3E%0D%0A%3Cstrong%3ENote%3A%3C%2Fstrong%3E+Where+%27%28Applicant%27s+Description%29%27+appears+in+the+proposal%2C+the+text+may+subsequently+be+amended+when+the+application+is+checked."),
("byFormat", "N"),
("byOther1", "N"),
("byOther2", "N"),
("byOther3", "N"),
("byOther4", "N"),
("byOther5", "N"),
("byPostcode", "N"),
("byStreet", "N"),
("byHouseNumber", "N"),
("byAddress", "N"),
("byPeriod", "Y"),
("extId", "101149"), # I wonder what this is...
("queried", "Y"),
("other1Label", "Other1"),
("other2Label", "Other2"),
("other3Label", "Other3"),
("other4Label", "Other4"),
("other5Label", "Other5"),
("other1List", ""),
("other2List", ""),
("other3List", ""),
("other4List", ""),
("other5List", ""),
("periodLabel", "From"),
("addressLabel", "Select+Address"),
("print", "")
]

# Now get the search page
response = urllib2.urlopen(self.base_url, urllib.urlencode(post_data))

soup = BeautifulSoup(response.read())

trs = soup.find(text="Search Results").findNext("table").findAll("tr")[:-1]

# There are six trs per application, ish

# The first contains the case no and the application date.
# The second contains the address
# The third contains the description
# The fourth contains the info page link
# The fifth contains the comment link (or a note that comments are currently not being accepted
# The sixth is a spacer.

count = 0
for tr in trs:
count +=1

ref = tr.find(text=re.compile("Case No:"))
if ref:
self._current_application = PlanningApplication()
count = 1

self._current_application.council_reference = ref.split(":")[1].strip()
self._current_application.date_received = search_day

if count % 6 == 2:
self._current_application.address = tr.td.string.strip()
self._current_application.postcode = getPostcodeFromText(self._current_application.address)
if count % 6 == 3:
self._current_application.description = tr.td.string.strip()
if count % 6 == 4:
self._current_application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
if count % 6 == 5:
try:
self._current_application.comment_url = urlparse.urljoin(self.base_url, tr.a['href'])
except:
# Comments are not currently being accepted. We'll leave this app for the moment - we'll pick it up later if they start accepting comments
continue
if count % 6 == 0 and self._current_application.is_ready():
self._results.addApplication(self._current_application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = BrentParser()
print parser.getResults(6,8,2008)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Ver arquivo

@@ -44,3 +44,4 @@
"Redbridge.cgi", "493" "Redbridge.cgi", "493"
"AmberValley.py", "420" "AmberValley.py", "420"
"Aberdeenshire.py", "420" "Aberdeenshire.py", "420"
"Brent.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Ver arquivo

@@ -248,3 +248,4 @@
"London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser" "London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser" "Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser" "Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser"

Carregando…
Cancelar
Salvar