# website_agent.rb
  1. require 'nokogiri'
  2. require 'typhoeus'
  3. require 'date'
module Agents
  # Polls a web page, XML document, or JSON feed on a schedule and emits
  # Events built from CSS-selector or JSONPath extractions.
  class WebsiteAgent < Agent
    # This agent is a pure event source; it never consumes events.
    cannot_receive_events!

    description <<-MD
      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.

      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.

      The `type` value can be `xml`, `html`, or `json`.

      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.

      When parsing HTML or XML, these sub-hashes specify how to extract with a `:css` CSS selector and either `:text => true` or `attr` pointing to an attribute name to grab. An example:

          :extract => {
            :url => { :css => "#comic img", :attr => "src" },
            :title => { :css => "#comic img", :attr => "title" },
            :body_text => { :css => "div.main", :text => true }
          }

      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example:

          :extract => {
            :title => { :path => "results.data[*].title" },
            :description => { :path => "results.data[*].description" }
          }

      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.

      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent.
    MD

    event_description do
      "Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options[:extract]}"
    end

    default_schedule "every_12h"

    # When `mode` is `on_change`, new results are compared against this many
    # of the most recent events to decide whether a result is a duplicate.
    UNIQUENESS_LOOK_BACK = 30
  31. def working?
  32. event_created_within(options[:expected_update_period_in_days]) && !recent_error_logs?
  33. end
  34. def default_options
  35. {
  36. :expected_update_period_in_days => "2",
  37. :url => "http://xkcd.com",
  38. :type => "html",
  39. :mode => :on_change,
  40. :extract => {
  41. :url => {:css => "#comic img", :attr => "src"},
  42. :title => {:css => "#comic img", :attr => "title"}
  43. }
  44. }
  45. end
  46. def validate_options
  47. errors.add(:base, "url and expected_update_period_in_days are required") unless options[:expected_update_period_in_days].present? && options[:url].present?
  48. if !options[:extract].present? && extraction_type != "json"
  49. errors.add(:base, "extract is required for all types except json")
  50. end
  51. end
  52. def check
  53. hydra = Typhoeus::Hydra.new
  54. log "Fetching #{options[:url]}"
  55. request = Typhoeus::Request.new(options[:url], :followlocation => true)
  56. request.on_failure do |response|
  57. error "Failed: #{response.inspect}"
  58. end
  59. request.on_success do |response|
  60. doc = parse(response.body)
  61. if extract_full_json?
  62. result = doc
  63. if store_payload? result
  64. log "Storing new result for '#{name}': #{result.inspect}"
  65. create_event :payload => result
  66. end
  67. else
  68. output = {}
  69. options[:extract].each do |name, extraction_details|
  70. result = if extraction_type == "json"
  71. output[name] = Utils.values_at(doc, extraction_details[:path])
  72. else
  73. output[name] = doc.css(extraction_details[:css]).map { |node|
  74. if extraction_details[:attr]
  75. node.attr(extraction_details[:attr])
  76. elsif extraction_details[:text]
  77. node.text()
  78. else
  79. error ":attr or :text is required on HTML or XML extraction patterns"
  80. return
  81. end
  82. }
  83. end
  84. log "Extracting #{extraction_type} at #{extraction_details[:path] || extraction_details[:css]}: #{result}"
  85. end
  86. num_unique_lengths = options[:extract].keys.map { |name| output[name].length }.uniq
  87. if num_unique_lengths.length != 1
  88. error "Got an uneven number of matches for #{options[:name]}: #{options[:extract].inspect}"
  89. return
  90. end
  91. num_unique_lengths.first.times do |index|
  92. result = {}
  93. options[:extract].keys.each do |name|
  94. result[name] = output[name][index]
  95. if name.to_s == 'url'
  96. result[name] = URI.join(options[:url], result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
  97. end
  98. end
  99. if store_payload? result
  100. log "Storing new parsed result for '#{name}': #{result.inspect}"
  101. create_event :payload => result
  102. end
  103. end
  104. end
  105. end
  106. hydra.queue request
  107. hydra.run
  108. end
  109. private
  110. def store_payload? result
  111. !options[:mode] || options[:mode].to_s == "all" || (options[:mode].to_s == "on_change" && !previous_payloads.include?(result.to_json))
  112. end
  113. def previous_payloads
  114. events.order("id desc").limit(UNIQUENESS_LOOK_BACK).pluck(:payload).map(&:to_json) if options[:mode].to_s == "on_change"
  115. end
  116. def extract_full_json?
  117. (!options[:extract].present? && extraction_type == "json")
  118. end
  119. def extraction_type
  120. (options[:type] || begin
  121. if options[:url] =~ /\.(rss|xml)$/i
  122. "xml"
  123. elsif options[:url] =~ /\.json$/i
  124. "json"
  125. else
  126. "html"
  127. end
  128. end).to_s
  129. end
  130. def parse(data)
  131. case extraction_type
  132. when "xml"
  133. Nokogiri::XML(data)
  134. when "json"
  135. JSON.parse(data)
  136. when "html"
  137. Nokogiri::HTML(data)
  138. else
  139. raise "Unknown extraction type #{extraction_type}"
  140. end
  141. end
  142. end
  143. end