From 8e277871bc381f4d8c45041e5aa1dd17e8f49414 Mon Sep 17 00:00:00 2001
From: Adrian Short <adrian.short@gmail.com>
Date: Thu, 3 Jan 2013 16:49:21 +0000
Subject: [PATCH]

---
 scrape.rb | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 scrape.rb

diff --git a/scrape.rb b/scrape.rb
new file mode 100644
index 0000000..847d0fc
--- /dev/null
+++ b/scrape.rb
@@ -0,0 +1,64 @@
+# Scrape webpage into a podcast RSS feed
+# https://www.sutton.gov.uk/index.aspx?articleid=4332
+
+require 'nokogiri'
+require 'open-uri'
+require 'time'
+require 'pp'
+
+FEED_TITLE = "Cheam North and Worcester Park Local Committee".freeze # channel <title>
+FEED_IMAGE = "https://dl.dropbox.com/u/300783/logo.png".freeze # href of the channel's itunes:image
+FEED_AUTHOR = "London Borough of Sutton".freeze # itunes:author of the channel and of every item
+FEED_LINK = "https://www.sutton.gov.uk/index.aspx?articleid=4332".freeze # channel <link>
+
+url = "cnwp.html" # no URL scheme, so open reads this as a file path
+# Block form: the handle is closed as soon as the markup has been read.
+doc = open(url) { |page| Nokogiri.parse(page.read) }
+
+meeting = '' # text of the latest date-bearing node; '' until one is seen
+items = []
+# Text containing a date sets the current meeting; MP3 links found from then on use it.
+doc.at("#bodytext").children.each do |node|
+  if node.inner_text.match(/\d{1,2}\s+\w+\s+\d{4}/) # eg 10 December 2012
+    meeting = node.inner_text.strip
+  end
+  node.children.each do |subnode|
+    next unless subnode.name == 'a'
+    href = subnode['href'].to_s.strip # an <a> with no href attribute gives nil
+    next unless href.match(/\.mp3$/i)
+    if meeting.empty? # Time.parse('') raises, so a link with no date is skipped
+      warn "skipping #{href}: no meeting date found before it"
+      next
+    end
+    items << { :d => Time.parse(meeting), :href => href, :title => subnode.inner_text.strip }
+  end
+end
+
+# Render the scraped items as an RSS 2.0 feed with iTunes podcast extensions.
+builder = Nokogiri::XML::Builder.new do |xml|
+  xml.rss('xmlns:itunes' => "http://www.itunes.com/dtds/podcast-1.0.dtd",
+          :version => "2.0") {
+    xml.channel {
+      xml.title FEED_TITLE
+      xml.link FEED_LINK
+      xml.description FEED_TITLE # RSS 2.0 requires a channel description
+      xml['itunes'].image(:href => FEED_IMAGE)
+      xml['itunes'].author FEED_AUTHOR
+
+      items.each do |i|
+        xml.item {
+          xml.title i[:title]
+          xml['itunes'].author FEED_AUTHOR
+          xml.enclosure(
+            :url => i[:href],
+            :length => "0", # required attribute; 0 because the file size is not known
+            :type => "audio/mpeg"
+          )
+          xml.guid i[:href] # the MP3 URL doubles as the item's unique id
+          xml.pubDate i[:d].rfc822
+        }
+      end
+    }
+  }
+end
+puts builder.to_xml # write the finished feed to stdout