Kaynağa Gözat

Add scraper for Exmoor. Fix name of Herefordshire.

master
duncan.parkes 16 yıl önce
ebeveyn
işleme
2f155cbc6f
4 değiştirilmiş dosya ile 74 ekleme ve 2 silme
  1. +70
    -0
      python_scrapers/Exmoor.py
  2. +1
    -1
      python_scrapers/Herefordshire.py
  3. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  4. +2
    -1
      python_scrapers/SitesToGenerate.csv

+ 70
- 0
python_scrapers/Exmoor.py Dosyayı Görüntüle

@@ -0,0 +1,70 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

search_date_format = "%d+%b+%Y"
received_date_format = "%d %b %Y"

class ExmoorParser:
def __init__(self, *args):

self.authority_name = "Exmoor National Park"
self.authority_short_name = "Exmoor"
self.base_url = "http://www.exmoor-nationalpark.gov.uk/planning_weekly_list.htm?weeklylist=%s"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

response = urllib2.urlopen(self.base_url %(search_day.strftime(search_date_format)))
soup = BeautifulSoup(response.read())

# The first <tr> contains headers
trs = soup.table.findAll("tr")[1:]

for tr in trs:
application = PlanningApplication()

tds = tr.findAll("td")

application.date_received = datetime.datetime.strptime(tds[0].string, received_date_format).date()

application.info_url = urllib.unquote(urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href'])))
application.council_reference = tds[1].a.string.strip()
application.address = tds[2].a.string.strip()
application.postcode = getPostcodeFromText(application.address)

# Now fetch the info url

info_response = urllib.urlopen(application.info_url)
info_soup = BeautifulSoup(info_response.read())

application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

try:
application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text="Comment").parent['href'])
except:
application.comment_url = "No Comments"

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = ExmoorParser()
print parser.getResults(1,8,2008)


+ 1
- 1
python_scrapers/Herefordshire.py Dosyayı Görüntüle

@@ -20,7 +20,7 @@ class HerefordshireParser:

def __init__(self, *args):

self.authority_name = "Herefordshire County Council"
self.authority_name = "Herefordshire Council"
self.authority_short_name = "Herefordshire"
self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
#As we are going to the info page, we may as well pick up the comment url from there.


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Dosyayı Görüntüle

@@ -56,3 +56,4 @@
"Hampshire.py", "420"
"Hastings.py", "420"
"Herefordshire.py", "420"
"Exmoor.py", "420"

+ 2
- 1
python_scrapers/SitesToGenerate.csv Dosyayı Görüntüle

@@ -259,4 +259,5 @@
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser"
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser"
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"

Yükleniyor…
İptal
Kaydet