# Scrape webpage into a podcast RSS feed
# https://www.sutton.gov.uk/index.aspx?articleid=4332
#
# Reads the HTML named by `url`, walks the children of the element with id
# "bodytext", and collects every direct-grandchild <a> whose href ends in
# ".mp3". Each link is attributed to the most recent node whose text contained
# something that looks like a date (e.g. "10 December 2012"). The collected
# links are then written to stdout as an RSS 2.0 document with iTunes tags.

require 'nokogiri'
require 'open-uri'
require 'time'
require 'pp'

FEED_TITLE  = "Cheam North and Worcester Park Local Committee"
FEED_IMAGE  = "https://dl.dropbox.com/u/300783/logo.png"
FEED_AUTHOR = "London Borough of Sutton"
FEED_LINK   = "https://www.sutton.gov.uk/index.aspx?articleid=4332"

url = "cnwp.html"

# Block form of open so the handle is closed as soon as the content is read.
doc = Nokogiri.parse(open(url) { |io| io.read })

body = doc.at("#bodytext")
# Fail with a readable message instead of a NoMethodError on nil.
abort "No #bodytext element found in #{url}" if body.nil?

meeting = nil            # text of the most recent date-bearing node, nil until one is seen
items = []               # one hash per MP3 link: :d (Time), :href, :title
items_this_meeting = 0   # count of MP3 links found since the last date-bearing node

body.children.each do |node|
  if node.inner_text.match(/\d{1,2}\s+\w+\s+\d{4}/) # eg 10 December 2012
    meeting = node.inner_text.strip
    items_this_meeting = 0
  end

  node.children.each do |subnode|
    next unless subnode.name == 'a'

    # <a> elements without an href (e.g. named anchors) yield nil; to_s makes
    # them a harmless empty string. Strip before matching so trailing
    # whitespace does not hide a valid ".mp3" link.
    href = subnode['href'].to_s.strip
    next unless href =~ /\.mp3$/i

    if meeting.nil?
      # No date has been seen yet, so there is nothing to derive pubDate from.
      warn "Skipping #{href}: no meeting date found before this link"
      next
    end

    items_this_meeting += 1
    items << {
      # Each successive link within a meeting is placed 30 minutes after the
      # previous one (presumably so recordings from one meeting get distinct,
      # ordered publication times -- confirm).
      :d     => Time.parse(meeting) + ((items_this_meeting - 1) * 30 * 60),
      :href  => href,
      :title => subnode.inner_text.strip
    }
  end
end

builder = Nokogiri::XML::Builder.new do |xml|
  xml.rss('xmlns:itunes' => "http://www.itunes.com/dtds/podcast-1.0.dtd", :version => "2.0") {
    xml.channel {
      xml.title FEED_TITLE
      xml.link FEED_LINK
      xml['itunes'].image(:href => FEED_IMAGE)
      xml['itunes'].author FEED_AUTHOR

      items.each do |i|
        xml.item {
          xml.title i[:title]
          xml['itunes'].author FEED_AUTHOR
          xml.enclosure(
            :url  => i[:href],
            :type => "audio/mpeg"
          )
          # The MP3 URL doubles as the item's unique identifier.
          xml.guid i[:href]
          xml.pubDate i[:d].rfc822
        }
      end
    }
  }
end

puts builder.to_xml