# Scrape webpage into a podcast RSS feed
# https://www.sutton.gov.uk/index.aspx?articleid=4332
#
# Recovered from the patch that created scrape.rb
# (commit 8e277871bc381f4d8c45041e5aa1dd17e8f49414, Adrian Short,
# Thu, 3 Jan 2013 16:49:21 +0000).
#
# Reads the saved page "cnwp.html", collects every MP3 link found under the
# #bodytext element, and prints an RSS 2.0 feed (with iTunes tags) to
# standard output.

require 'nokogiri'
require 'open-uri'
require 'time'
require 'pp'

FEED_TITLE = "Cheam North and Worcester Park Local Committee"
FEED_IMAGE = "https://dl.dropbox.com/u/300783/logo.png"
FEED_AUTHOR = "London Borough of Sutton"
FEED_LINK = "https://www.sutton.gov.uk/index.aspx?articleid=4332"

# Matches a meeting date, eg 10 December 2012
MEETING_DATE = /\d{1,2}\s+\w+\s+\d{4}/

# Matches an (already stripped) href that points at an MP3 file.
MP3_HREF = /\.mp3\z/i

# Returns the markup found at +source+ as a String.
#
# source - a local file path, or an http(s) URL.
#
# Kernel#open is deliberately avoided: it leaves the handle open until GC and
# no longer fetches URLs on current Ruby versions.
def read_source(source)
  if source =~ %r{\Ahttps?://}i
    URI.parse(source).read # open-uri's URI#read fetches the body as a String
  else
    File.read(source)
  end
end

# Parses the date of a meeting heading.
#
# text - the stripped text of a node that matched MEETING_DATE.
#
# Tries the whole heading first, then only the fragment that matched
# MEETING_DATE. Returns a Time, or nil when neither can be parsed.
def parse_meeting_time(text)
  [text, text[MEETING_DATE]].compact.each do |candidate|
    begin
      return Time.parse(candidate)
    rescue ArgumentError
      next
    end
  end
  nil
end

# Collects the MP3 links below the #bodytext element of +doc+.
#
# doc - a parsed Nokogiri document.
#
# Each child of #bodytext whose text contains a date becomes the current
# meeting; every MP3 link that follows is stamped with that meeting's time.
# Only the direct children of each #bodytext child are inspected for links.
#
# Returns an Array of Hashes with the keys :d (Time, or nil when no usable
# meeting date preceded the link), :href and :title.
# Raises RuntimeError when the document has no #bodytext element.
def extract_items(doc)
  body = doc.at("#bodytext")
  raise "no #bodytext element found in document" unless body

  meeting_time = nil
  items = []

  body.children.each do |node|
    text = node.inner_text
    # Parse once per heading rather than once per link.
    meeting_time = parse_meeting_time(text.strip) if text =~ MEETING_DATE

    node.children.each do |subnode|
      next unless subnode.name == 'a'

      # Anchors without an href (eg named anchors) yield nil here.
      href = subnode['href'].to_s.strip
      next unless href =~ MP3_HREF

      items << {
        :d => meeting_time,
        :href => href,
        :title => subnode.inner_text.strip
      }
    end
  end

  items
end

# Builds the podcast feed for +items+ (as returned by extract_items).
#
# Returns the Nokogiri::XML::Builder; call #to_xml on it for the markup.
#
# NOTE(review): RSS 2.0 also requires a channel <description>; this script
# defines no text for one, so it is still missing from the output.
def build_feed(items)
  Nokogiri::XML::Builder.new do |xml|
    xml.rss('xmlns:itunes' => "http://www.itunes.com/dtds/podcast-1.0.dtd",
            :version => "2.0") do
      xml.channel do
        xml.title FEED_TITLE
        xml.link FEED_LINK
        xml['itunes'].image(:href => FEED_IMAGE)
        xml['itunes'].author FEED_AUTHOR

        items.each do |entry|
          xml.item do
            xml.title entry[:title]
            xml['itunes'].author FEED_AUTHOR
            xml.enclosure(
              :url => entry[:href],
              # length is a required attribute; 0 signals "size unknown".
              :length => 0,
              :type => "audio/mpeg"
            )
            xml.guid entry[:href]
            # Omitted when no meeting date could be determined for the link.
            xml.pubDate entry[:d].rfc822 if entry[:d]
          end
        end
      end
    end
  end
end

url = "cnwp.html"

doc = Nokogiri.parse(read_source(url))
items = extract_items(doc)
builder = build_feed(items)

puts builder.to_xml