Procházet zdrojové kódy

Add Thomas' scraper for Solihull.

import/raw
duncan.parkes před 16 roky
rodič
revize
f37d02d3de
3 změnil soubory, kde provedl 83 přidání a 0 odebrání
  1. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  2. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv
  3. +81
    -0
      trunk/python_scrapers/Solihull.py

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -68,3 +68,4 @@
"Broxtowe.py", "420"
"Mendip.py", "420"
"Weymouth.py", "420"
"Solihull.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -274,3 +274,4 @@
"Broxtowe Borough Council", "Broxtowe", "", "Broxtowe", "BroxtoweParser"
"Mendip District Council", "Mendip", "", "Mendip", "MendipParser"
"Weymouth and Portland Borough Council", "Weymouth and Portland", "", "Weymouth", "WeymouthParser"
"Solihull Metropolitan Borough Council", "Solihull", "", "Solihull", "SolihullParser"

+ 81
- 0
trunk/python_scrapers/Solihull.py Zobrazit soubor

@@ -0,0 +1,81 @@
"""
This is the screenscraper for planning apps for
Solihull Metropolitan Borough Council.

The apps for Solihull are displayed in html pages one per week, starting on Monday.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class SolihullParser:

def __init__(self, *args):

self.authority_name = "Solihull Metropolitan Borough Council"
self.authority_short_name = "Solihull"
self.base_url = "http://www.solihull.gov.uk/planning/dc/weeklist.asp?SD=%s&ward=ALL"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# What we actually need is the monday before the date searched for:
monday_before = search_day - datetime.timedelta(search_day.weekday())

# Now get the search page
response = urllib2.urlopen(self.base_url %(monday_before.strftime(date_format)))
soup = BeautifulSoup(response.read())

result_tables = soup.findAll("table", width="98%", cellpadding="2")

for table in result_tables:
application = PlanningApplication()

trs = table.findAll("tr")
application.council_reference = trs[0].strong.string.strip()
relative_info_url = trs[0].a['href']
application.info_url = urlparse.urljoin(self.base_url, relative_info_url)

application.address = trs[1].findAll("td")[1].string.strip()
application.postcode = getPostcodeFromText(application.address)
application.description = trs[2].findAll("td")[1].string.strip()

#There's probably a prettier way to get the date, but with Python, it's easier for me to reinvent the wheel than to find an existing wheel!
raw_date_recv = trs[3].findAll("td")[3].string.strip().split("/")
#Check whether the application is on the target day. If not, discard it and move on.
if int(raw_date_recv[0]) != day:
continue
application.date_received = datetime.date(int(raw_date_recv[2]), int(raw_date_recv[1]), int(raw_date_recv[0]))

try:
relative_comment_url = trs[5].findAll("td")[1].a['href']
application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url)
except:
application.comment_url = "No Comment URL."

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = SolihullParser()
#Put this in with constant numbers, copying the Barnsley example. Works for testing, but should it use the arguments for a real run?
print parser.getResults(27,10,2008)

Načítá se…
Zrušit
Uložit