| @@ -0,0 +1,79 @@ | |||
| #!/usr/bin/ruby | |||
| require 'rubygems' | |||
| require 'json' | |||
| require 'httpclient' | |||
| require 'uri' | |||
| require 'time' | |||
# Downloads every available page of Twitter search results for +query+ and
# prints one tab-separated line per tweet on standard output. Progress and
# error messages go to standard error.
#
# Columns, in order: created_at (re-formatted via Time.parse), id_str,
# from_user, text (newlines, carriage returns and tabs replaced by spaces),
# source, from_user_id_str, to_user, to_user_id_str.
#
# Paging follows the 'next_page' value returned by the service and stops when
# a page is empty, when no further page is advertised, on HTTP 403, or after
# several consecutive failed requests.
#
# @param query [String] search terms; characters that are not legal in a URL
#   query are percent-encoded, existing %XX escapes are left untouched
# @param since_id [String, nil] when given, sent as the since_id parameter of
#   the first request
# @return [nil]
def backup(query, since_id = nil)
  endpoint = 'http://search.twitter.com/search.json'
  pause_seconds = 10
  max_failures = 5 # consecutive failed requests tolerated before giving up

  client = HTTPClient.new
  total = 0
  page = 1
  failures = 0
  next_page = backup_first_page_query(query, since_id)

  loop do
    $stderr.puts "Trying page #{page}"
    url = "#{endpoint}#{next_page}"
    $stderr.puts url
    response = client.get(url)
    json = backup_parse_body(response.body)

    if response.status_code == 200 && json
      failures = 0
      $stderr.puts "Got page #{page} OK"
      results = json['results'] || []
      results.each { |tweet| puts backup_tweet_row(tweet) }
      $stderr.puts "#{results.size} tweets processed"
      total += results.size
      page += 1
      next_page = json['next_page']
      # Without a 'next_page' value the next request would have no query
      # string at all, so the last page ends the run.
      break if results.empty? || next_page.nil?
    else
      # Some kind of error: the same page is retried after the pause, but
      # only a bounded number of times so a permanent failure cannot spin
      # forever.
      $stderr.puts "HTTP #{response.status_code} status code"
      backup_log_errors(json)
      break if response.status_code == 403

      failures += 1
      if failures >= max_failures
        $stderr.puts "Giving up after #{failures} consecutive failed requests"
        break
      end
    end

    # Only reached when another request follows.
    sleep(pause_seconds)
  end

  $stderr.puts "#{total} tweets collected"
end

# Builds the query string used for the first page of a search.
def backup_first_page_query(query, since_id)
  params = "?q=#{backup_escape_query_value(query)}&rpp=100&page=1&result_type=recent"
  params += "&since_id=#{backup_escape_query_value(since_id)}" unless since_id.nil?
  params
end

# Percent-encodes every byte that may not appear literally in a URL query
# (spaces, '#', quotes, non-ASCII, ...). '%' itself and the reserved
# characters are left alone so that a value which was already escaped by the
# caller produces exactly the same URL as before.
def backup_escape_query_value(value)
  value.to_s.gsub(/[^A-Za-z0-9\-._~!*'();:@&=+$,\/?%]/) do |char|
    char.unpack('C*').map { |byte| format('%%%02X', byte) }.join
  end
end

# Parses a response body, returning the decoded Hash, or nil when the body is
# not valid JSON or does not decode to a Hash (e.g. an HTML error page).
def backup_parse_body(body)
  parsed = JSON.parse(body.to_s)
  parsed.is_a?(Hash) ? parsed : nil
rescue JSON::ParserError
  nil
end

# Reports the entries of the 'errors' array of a decoded error response, if
# there is one, on standard error.
def backup_log_errors(json)
  if json.nil?
    $stderr.puts 'Response body was not a JSON object'
    return
  end

  errors = json['errors']
  return unless errors.is_a?(Array)

  errors.each do |error|
    next unless error.is_a?(Hash)

    $stderr.puts "ERROR code #{error['code']}: #{error['message']}"
  end
end

# Formats one tweet Hash as a tab-separated output line. Missing fields become
# empty columns; line breaks and tabs inside the text are replaced by spaces
# so that each tweet stays on a single line.
def backup_tweet_row(tweet)
  [
    Time.parse(tweet['created_at']).to_s,
    tweet['id_str'],
    tweet['from_user'],
    tweet['text'].to_s.tr("\n\t\r", ' '),
    tweet['source'],
    tweet['from_user_id_str'],
    tweet['to_user'],
    tweet['to_user_id_str']
  ].join("\t")
end
# Command-line entry point: the first argument is the search query, the
# optional second argument is passed on as since_id. Running the script
# without a query would otherwise send an empty search, so that case prints a
# usage message and exits with a non-zero status instead.
query = ARGV.shift
abort "Usage: #{File.basename($0)} QUERY [SINCE_ID]" if query.nil? || query.strip.empty?
backup(query, ARGV.shift)