#!/usr/bin/ruby
# frozen_string_literal: true

# Command-line backup of Twitter search results.
#
# Usage: SCRIPT QUERY [SINCE_ID]
#
# Tweets are written to stdout as tab-separated rows; progress and error
# messages go to stderr so the data stream can be redirected on its own.

require 'rubygems'
require 'json'
require 'httpclient'
require 'uri'
require 'time'

# Endpoint requested by #backup.
TWITTER_SEARCH_URL = 'http://search.twitter.com/search.json'

# Seconds to pause between consecutive requests.
REQUEST_DELAY = 10

# Number of consecutive failed requests tolerated before giving up.
MAX_CONSECUTIVE_FAILURES = 10

# Builds the query string (including the leading "?") for the first page.
# Values are form-encoded so queries containing spaces, "#", "&" etc. survive.
def initial_search_query(query, since_id)
  params = [['q', query], ['rpp', 100], ['page', 1], ['result_type', 'recent']]
  params << ['since_id', since_id] unless since_id.nil?
  "?#{URI.encode_www_form(params)}"
end

# Formats one tweet hash as a single tab-separated output row.
def tweet_row(tweet)
  [
    Time.parse(tweet['created_at']).to_s,
    tweet['id_str'],
    tweet['from_user'],
    # Newlines and tabs inside the text would break the row/column layout.
    tweet['text'].tr("\n\t", ' '),
    tweet['source'],
    tweet['from_user_id_str'],
    tweet['to_user'],
    tweet['to_user_id_str']
  ].join("\t")
end

# Parses +body+ as JSON; returns nil when the body is not valid JSON
# (the caller reports that case instead of crashing).
def parse_json_body(body)
  JSON.parse(body.to_s)
rescue JSON::ParserError
  nil
end

# Logs a failed request: the HTTP status plus any entries found under the
# payload's "errors" key.
def report_search_failure(status_code, payload)
  $stderr.puts "HTTP #{status_code} status code"
  $stderr.puts 'Response body was not a JSON object' unless payload.is_a?(Hash)

  errors = payload.is_a?(Hash) ? payload['errors'] : nil
  return unless errors.is_a?(Array)

  errors.each do |error|
    $stderr.puts "ERROR code #{error['code']}: #{error['message']}"
  end
end

# Pages through the search results for +query+, printing one tab-separated row
# per tweet to stdout and progress to stderr.
#
# query    - search terms (String).
# since_id - optional tweet id; when given it is sent as the since_id parameter.
#
# Stops when a page comes back empty, when the response carries no "next_page"
# value, on HTTP 403, or after MAX_CONSECUTIVE_FAILURES failed requests in a
# row. Returns nil.
def backup(query, since_id = nil)
  client = HTTPClient.new
  tweets = 0
  page = 1
  failures = 0
  next_page = initial_search_query(query, since_id)

  loop do
    $stderr.puts "Trying page #{page}"
    url = "#{TWITTER_SEARCH_URL}#{next_page}"
    $stderr.puts url

    response = client.get(url)
    payload = parse_json_body(response.body)

    if response.status_code == 200 && payload.is_a?(Hash)
      failures = 0
      $stderr.puts "Got page #{page} OK"

      results = payload['results'] || []
      results.each { |tweet| puts tweet_row(tweet) }
      $stderr.puts "#{results.size} tweets processed"

      tweets += results.size
      page += 1
      next_page = payload['next_page']

      # Without a next_page value the following URL would have no query
      # string at all, so treat its absence as the end of the results.
      break if results.empty? || next_page.nil?
    else
      # Some kind of error; next_page is left untouched so the same page is
      # retried after the delay.
      failures += 1
      report_search_failure(response.status_code, payload)
      break if response.status_code == 403

      if failures >= MAX_CONSECUTIVE_FAILURES
        $stderr.puts "Giving up after #{failures} consecutive failed requests"
        break
      end
    end

    sleep(REQUEST_DELAY)
  end

  $stderr.puts "#{tweets} tweets collected"
end

query = ARGV.shift
abort "Usage: #{$PROGRAM_NAME} QUERY [SINCE_ID]" if query.nil? || query.empty?
backup(query, ARGV.shift)