#!/usr/bin/ruby

require 'rubygems'
require 'json'
require 'httpclient'
require 'uri'
require 'time'

# Percent-encodes characters that would break a URL query value.
#
# Unreserved characters plus "%" and "+" are left alone, so a value that the
# caller already URL-encoded (e.g. "%23ruby+lang") is passed through
# unchanged, while raw spaces, "#", "&" and non-ASCII bytes are encoded.
#
# @param value [#to_s] raw or pre-encoded query value
# @return [String] value that is safe to interpolate after "?q=" / "&since_id="
def backup_escape(value)
  value.to_s.gsub(/[^A-Za-z0-9\-._~%+]/) do |char|
    char.bytes.map { |byte| format('%%%02X', byte) }.join
  end
end

# Parses a response body as a JSON object.
#
# @param body [String, nil] raw HTTP response body
# @return [Hash, nil] the parsed object, or nil when the body is missing,
#   is not valid JSON, or is valid JSON but not an object
def backup_parse_json(body)
  parsed = JSON.parse(body)
  parsed.is_a?(Hash) ? parsed : nil
rescue JSON::ParserError, TypeError
  nil
end

# Formats one tweet hash as a tab-separated row.
#
# Newlines and tabs inside the tweet text are replaced by spaces so they
# cannot break the one-row-per-tweet, tab-delimited layout. Missing (nil)
# fields are emitted as empty columns.
#
# @param tweet [Hash] one entry of the "results" array
# @return [String] the row, without a trailing newline
def backup_tweet_row(tweet)
  [
    Time.parse(tweet['created_at']).to_s,
    tweet['id_str'],
    tweet['from_user'],
    tweet['text'].to_s.tr("\n\t", ' '),
    tweet['source'],
    tweet['from_user_id_str'],
    tweet['to_user'],
    tweet['to_user_id_str']
  ].join("\t")
end

# Logs a failed request to $stderr without assuming the body's shape.
#
# @param response [#status_code, #body] the HTTP response
# @param payload [Hash, nil] parsed body, or nil when it was not a JSON object
# @return [void]
def backup_report_failure(response, payload)
  $stderr.puts "HTTP #{response.status_code} status code"

  errors = payload && payload['errors']
  if errors.is_a?(Array)
    errors.each do |error|
      $stderr.puts "ERROR code #{error['code']}: #{error['message']}"
    end
  else
    # No "errors" array to iterate: show the start of the raw body instead
    # of raising on nil.
    $stderr.puts "ERROR body: #{response.body.to_s[0, 200]}"
  end
end

# Pages through search results for +query+ and prints every tweet to
# $stdout as a tab-separated row; progress and errors go to $stderr.
#
# Paging follows the "next_page" query string returned with each page and
# stops when a page is empty or no "next_page" is given. An HTTP 403 stops
# the run; any other failed request is retried after a 10 second pause
# (unbounded, as before, so the script can wait out a temporary failure).
#
# @param query [String] search query; raw or already URL-encoded
# @param since_id [String, nil] when given, sent as the since_id parameter
# @return [void]
def backup(query, since_id = nil)
  base_url = 'http://search.twitter.com'
  client = HTTPClient.new

  # Plain locals: instance variables here would live on the top-level
  # object and leak state from one call into the next.
  collected = 0
  page = 1
  next_page = "?q=#{backup_escape(query)}&rpp=100&page=#{page}&result_type=recent"
  next_page += "&since_id=#{backup_escape(since_id)}" unless since_id.nil?

  loop do
    $stderr.puts "Trying page #{page}"

    url = "#{base_url}/search.json#{next_page}"
    $stderr.puts url
    response = client.get(url)

    # Parsed defensively: an error page may not be JSON at all.
    payload = backup_parse_json(response.body)

    if response.status_code == 200 && payload
      $stderr.puts "Got page #{page} OK"

      results = payload['results'] || []
      results.each { |tweet| puts backup_tweet_row(tweet) }

      $stderr.puts "#{results.size} tweets processed"
      collected += results.size
      page += 1
      next_page = payload['next_page']

      # Done when the page was empty or there is nowhere further to go;
      # checked before sleeping so the final page does not cost a pause.
      break if results.empty? || next_page.nil?
    else
      backup_report_failure(response, payload)
      break if response.status_code == 403
    end

    sleep(10)
  end

  $stderr.puts "#{collected} tweets collected"
end
# Script entry point: the first command-line argument is the query, the optional second is since_id.
# Run the backup only when this file is executed directly, so that requiring
# it from another script does not start a network crawl as a side effect.
if __FILE__ == $PROGRAM_NAME
  query = ARGV.shift
  since_id = ARGV.shift

  # Without a query there is nothing to search for; say so instead of
  # sending an empty request.
  abort "Usage: #{File.basename($PROGRAM_NAME)} QUERY [SINCE_ID]" if query.nil? || query.empty?

  backup(query, since_id)
end