Download Twitter searches to TSV
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
1.7 KiB

  1. #!/usr/bin/ruby
  2. require 'rubygems'
  3. require 'json'
  4. require 'httpclient'
  5. require 'uri'
  6. require 'time'
  # Downloads every page of Twitter search results for +query+ and prints one
  # tab-separated line per tweet to stdout. Progress and errors go to stderr.
  #
  # Columns, in order: created_at, id_str, from_user, text, source,
  # from_user_id_str, to_user, to_user_id_str.
  #
  # Paging follows the +next_page+ query string returned with each page and
  # stops when a page is empty, when the response offers no +next_page+, or
  # when the server answers HTTP 403. Any other failed request is retried
  # after a 10 second pause, with no retry limit.
  #
  # NOTE(review): search.twitter.com/search.json looks like the legacy v1
  # Search API -- confirm the endpoint is still served before relying on this.
  #
  # @param query [String] search terms, either raw or already percent-encoded
  # @param since_id [String, nil] when given, sent as the since_id parameter
  # @return [nil]
  def backup(query, since_id = nil)
    client = HTTPClient.new
    collected = 0
    page = 1
    next_page = "?q=#{escape_query_value(query)}&rpp=100&page=#{page}&result_type=recent"
    next_page += "&since_id=#{escape_query_value(since_id)}" unless since_id.nil?

    loop do
      $stderr.puts "Trying page #{page}"
      url = "http://search.twitter.com/search.json#{next_page}"
      $stderr.puts url
      response = client.get(url)
      # Error pages are not guaranteed to be JSON, so parsing must not raise.
      payload = parse_json_body(response.body)

      if response.status_code == 200 && payload.is_a?(Hash)
        $stderr.puts "Got page #{page} OK"
        results = Array(payload['results'])
        results.each { |tweet| puts tweet_to_tsv(tweet) }
        $stderr.puts "#{results.size} tweets processed"
        collected += results.size
        page += 1
        next_page = payload['next_page']
        # Without a next_page there is nothing valid left to request.
        break if results.empty? || next_page.nil?
      else
        $stderr.puts "HTTP #{response.status_code} status code"
        log_api_errors(payload)
        break if response.status_code == 403
      end

      # Pause between requests; after a failure the same page is retried.
      sleep(10)
    end

    $stderr.puts "#{collected} tweets collected"
  end

  # Escapes +value+ for use inside a URL query string. Valid %XX escapes and
  # '+' are left untouched so already-encoded input is not encoded twice.
  def escape_query_value(value)
    value.to_s.gsub(/%(?![0-9A-Fa-f]{2})|[^A-Za-z0-9*\-._%+]/) do |char|
      URI.encode_www_form_component(char)
    end
  end

  # Parses +body+ as JSON, returning nil instead of raising on invalid input.
  def parse_json_body(body)
    JSON.parse(body.to_s)
  rescue JSON::ParserError
    nil
  end

  # Builds the TSV line for one tweet hash; newlines and tabs inside the
  # tweet text become spaces so each tweet stays on a single line.
  def tweet_to_tsv(tweet)
    [
      Time.parse(tweet['created_at']).to_s,
      tweet['id_str'],
      tweet['from_user'],
      tweet['text'].to_s.tr("\n\t", ' '),
      tweet['source'],
      tweet['from_user_id_str'],
      tweet['to_user'],
      tweet['to_user_id_str']
    ].join("\t")
  end

  # Writes each entry of the payload's 'errors' list to stderr, if present.
  def log_api_errors(payload)
    return unless payload.is_a?(Hash) && payload['errors'].is_a?(Array)

    payload['errors'].each do |error|
      if error.is_a?(Hash)
        $stderr.puts "ERROR code #{error['code']}: #{error['message']}"
      else
        $stderr.puts "ERROR: #{error}"
      end
    end
  end
  # Script entry point: the first command-line argument is the search query,
  # the optional second one is passed on as since_id. Exits with a usage
  # message instead of issuing an empty search when no query is supplied.
  query, since_id = ARGV.shift(2)
  abort "Usage: #{$PROGRAM_NAME} QUERY [SINCE_ID]" if query.nil? || query.empty?
  backup(query, since_id)