Pārlūkot izejas kodu

Add Carmarthenshire scraper.

master
duncan.parkes pirms 16 gadiem
vecāks
revīzija
fb7ba977ae
3 mainītis faili ar 80 papildinājumiem un 0 dzēšanām
  1. +78
    -0
      python_scrapers/Carmarthenshire.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 78
- 0
python_scrapers/Carmarthenshire.py Parādīt failu

@@ -0,0 +1,78 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class CarmarthenshireParser:
def __init__(self, *args):
self.comments_email_address = "planning@carmarthenshire.gov.uk"

self.authority_name = "Carmarthenshire County Council"
self.authority_short_name = "Carmarthenshire"
self.base_url = "http://www.carmarthenshire.gov.uk/CCC_APPS/eng/plannaps/CCC_PlanningApplicationsResults.asp?datemode=range&in_lo_date=%(day)s%%2F%(month)s%%2F%(year)s&in_hi_date=%(day)s%%2F%(month)s%%2F%(year)s&SUBMIT=Search"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# Now get the search page
response = urllib2.urlopen(self.base_url %{"day": day,
"month": month,
"year": year,
})
soup = BeautifulSoup(response.read())

trs = soup.findAll("tr", valign="middle")

count = 0
for tr in trs:
# The odd trs are just spacers
if count % 2 == 0:
application = PlanningApplication()

tds = tr.findAll("td")
application.date_received = search_day
application.council_reference = tds[1].a.string
application.address = tds[3].a.string
application.postcode = getPostcodeFromText(application.address)
# All the links in this <tr> go to the same place...
application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

# Still looking for description and comment url
# For the description, we'll need the info page
info_soup = BeautifulSoup(urllib2.urlopen(application.info_url).read())

application.description = info_soup.find(text="Description").findNext("td").findNext("td").font.string

# While we're here, lets get the OSGB grid ref
application.osgb_x, application.osgb_y = info_soup.find(text="Grid Reference").findNext("td").font.string.split("-")

# We'll have to use an email address for comments
application.comment_url = self.comments_email_address

self._results.addApplication(application)

count += 1

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = CarmarthenshireParser()
print parser.getResults(8,8,2008)


+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Parādīt failu

@@ -45,3 +45,4 @@
"AmberValley.py", "420"
"Aberdeenshire.py", "420"
"Brent.py", "420"
"Carmarthenshire.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Parādīt failu

@@ -249,3 +249,4 @@
"Amber Valley Borough Council", "Amber Valley", "", "AmberValley", "AmberValleyParser"
"Aberdeenshire Council", "Aberdeenshire", "", "Aberdeenshire", "AberdeenshireParser"
"London Borough of Brent", "Brent", "", "Brent", "BrentParser"
"Carmarthenshire County Council", "Carmarthenshire", "", "Carmarthenshire", "CarmarthenshireParser"

Notiek ielāde…
Atcelt
Saglabāt