Adrian Short 11 år sedan
incheckning
8e277871bc
1 ändrade filer med 64 tillägg och 0 borttagningar
  1. +64
    -0
      scrape.rb

+ 64
- 0
scrape.rb Visa fil

@@ -0,0 +1,64 @@
# Scrape webpage into a podcast RSS feed
# https://www.sutton.gov.uk/index.aspx?articleid=4332

require 'nokogiri'
require 'open-uri'
require 'time'
require 'pp'

# Feed-level metadata used when the RSS document is built below.
# The strings are frozen so the constants cannot be mutated by accident.

# Text of the channel <title> element.
FEED_TITLE = "Cheam North and Worcester Park Local Committee".freeze
# href of the channel <itunes:image> element.
FEED_IMAGE = "https://dl.dropbox.com/u/300783/logo.png".freeze
# Text of <itunes:author>, emitted for the channel and for every item.
FEED_AUTHOR = "London Borough of Sutton".freeze
# Text of the channel <link> element (the page the feed is scraped from).
FEED_LINK = "https://www.sutton.gov.uk/index.aspx?articleid=4332".freeze

# Location of the saved copy of the web page to scrape.
url = "cnwp.html"

# The block form of open closes the handle as soon as the content is read.
doc = Nokogiri.parse(open(url) { |io| io.read })

# Everything of interest lives under the element with id "bodytext".
body = doc.at("#bodytext")
raise "No #bodytext element found in #{url}" unless body

# Text of the most recent node that contained a date (a meeting heading).
meeting = ''
# Time parsed from that heading; nil until a parseable date has been seen.
meeting_time = nil
# One hash per MP3 link found: :d (Time), :href (String), :title (String).
items = []

body.children.each do |node|
  # A node whose text contains a date (eg 10 December 2012) starts a new
  # meeting; every MP3 link from here on is dated with it.
  date_match = node.inner_text.match(/\d{1,2}\s+\w+\s+\d{4}/)
  if date_match
    meeting = node.inner_text.strip
    # Parse only the matched date, once per heading, so surrounding words
    # cannot confuse Time.parse and a bad date is reported where it occurs.
    meeting_time = begin
      Time.parse(date_match[0])
    rescue ArgumentError
      warn "Cannot parse meeting date #{date_match[0].inspect}"
      nil
    end
  end

  # Only direct children of the node are inspected for links.
  node.children.each do |subnode|
    next unless subnode.name == 'a'

    # An <a> without an href (eg a named anchor) yields nil; to_s makes it "".
    # Strip before matching so surrounding whitespace does not hide a link.
    href = subnode['href'].to_s.strip
    next unless href =~ /\.mp3\z/i

    # Without a meeting date there is nothing valid to use as the pubDate.
    if meeting_time.nil?
      warn "Skipping #{href}: no meeting date found before this link"
      next
    end

    items << {
      :d => meeting_time,
      :href => href,
      :title => subnode.inner_text.strip
    }
  end
end

# Assemble the podcast feed: a single RSS 2.0 channel carrying the feed-level
# metadata, followed by one <item> per scraped MP3 link, in scrape order.
#
# NOTE(review): no channel <description> and no enclosure `length` attribute
# are emitted here, although the RSS 2.0 specification lists both as
# required -- confirm whether the feed's consumers need them.
builder = Nokogiri::XML::Builder.new do |xml|
  # The itunes namespace is declared on the root so the prefixed elements
  # further down resolve against it.
  xml.rss('xmlns:itunes' => "http://www.itunes.com/dtds/podcast-1.0.dtd",
          :version => "2.0") do
    xml.channel do
      # Channel-level metadata.
      xml.title FEED_TITLE
      xml.link FEED_LINK
      xml['itunes'].image(:href => FEED_IMAGE)
      xml['itunes'].author FEED_AUTHOR

      # One episode per scraped link.
      items.each do |episode|
        audio_url = episode[:href]

        xml.item do
          xml.title episode[:title]
          xml['itunes'].author FEED_AUTHOR
          xml.enclosure(:url => audio_url, :type => "audio/mpeg")
          # The audio URL doubles as the item's unique identifier.
          xml.guid audio_url
          # pubDate is the meeting date in RFC 822 format.
          xml.pubDate episode[:d].rfc822
        end
      end
    end
  end
end

# Write the finished feed to standard output.
puts builder.to_xml

Laddar…
Avbryt
Spara