require 'nokogiri'
require 'typhoeus'
require 'date'

module Agents
  class WebsiteAgent < Agent
    cannot_receive_events!

    default_schedule "every_12h"

    UNIQUENESS_LOOK_BACK = 200
    UNIQUENESS_FACTOR = 3
    description <<-MD
      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.

      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.

      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).

      The `type` value can be `xml`, `html`, or `json`.

      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.

      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or an `xpath` XPath expression, and either `'text': true` or `attr` pointing to an attribute name to grab. An example:

          'extract': {
            'url': { 'css': "#comic img", 'attr': "src" },
            'title': { 'css': "#comic img", 'attr': "title" },
            'body_text': { 'css': "div.main", 'text': true }
          }
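
      Each key in `extract` becomes a field on the emitted Events, with matches paired up by position. The extraction above would therefore yield Events shaped like this (the values here are illustrative):

          { 'url': "http://example.com/comics/123.png", 'title': "...", 'body_text': "..." }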

      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example:

          'extract': {
            'title': { 'path': "results.data[*].title" },
            'description': { 'path': "results.data[*].description" }
          }

      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows; if the counts are uneven, the Agent logs an error and creates no Events. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.

      Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `username:password`.

      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.

      Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of results received.

      Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.
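
      Putting these together, a configuration that uses the optional settings might look like this (all values here are illustrative, not defaults):

          'url': "https://example.com/feed.json",
          'type': "json",
          'mode': "on_change",
          'basic_auth': "username:password",
          'expected_update_period_in_days': "2",
          'uniqueness_look_back': "100",
          'force_encoding': "ISO-8859-1"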
    MD

    event_description do
      "Events will have the fields you specified. Your options look like:\n\n    #{Utils.pretty_print options['extract']}"
    end

    def working?
      event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
    end

    def default_options
      {
        'expected_update_period_in_days' => "2",
        'url' => "http://xkcd.com",
        'type' => "html",
        'mode' => "on_change",
        'extract' => {
          'url' => { 'css' => "#comic img", 'attr' => "src" },
          'title' => { 'css' => "#comic img", 'attr' => "alt" },
          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
        }
      }
    end

    def validate_options
      # Check for required fields
      errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?

      if !options['extract'].present? && extraction_type != "json"
        errors.add(:base, "extract is required for all types except json")
      end

      # Check for optional fields
      if options['mode'].present?
        errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
      end

      if options['expected_update_period_in_days'].present?
        errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
      end

      if options['uniqueness_look_back'].present?
        errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
      end

      if (encoding = options['force_encoding']).present?
        case encoding
        when String
          begin
            Encoding.find(encoding)
          rescue ArgumentError
            errors.add(:base, "Unknown encoding: #{encoding.inspect}")
          end
        else
          errors.add(:base, "force_encoding must be a string")
        end
      end
    end

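    # Fetch each configured URL, parse the response body according to
    # `extraction_type`, run the `extract` patterns against it, and emit one
    # Event per matched row (subject to the `mode` setting).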
    def check
      hydra = Typhoeus::Hydra.new
      log "Fetching #{options['url']}"
      request_opts = { :followlocation => true }
      request_opts[:userpwd] = options['basic_auth'] if options['basic_auth'].present?

      requests = []

      if options['url'].kind_of?(Array)
        options['url'].each do |url|
          requests.push(Typhoeus::Request.new(url, request_opts))
        end
      else
        requests.push(Typhoeus::Request.new(options['url'], request_opts))
      end

      requests.each do |request|
        request.on_failure do |response|
          error "Failed: #{response.inspect}"
        end

        request.on_success do |response|
          body = response.body
          if (encoding = options['force_encoding']).present?
            body = body.encode(Encoding::UTF_8, encoding)
          end
          doc = parse(body)

          if extract_full_json?
            # With no `extract` patterns, emit the entire parsed JSON document.
            if store_payload!(previous_payloads(1), doc)
              log "Storing new result for '#{name}': #{doc.inspect}"
              create_event :payload => doc
            end
          else
            output = {}
            options['extract'].each do |name, extraction_details|
              if extraction_type == "json"
                result = Utils.values_at(doc, extraction_details['path'])
                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
              else
                case
                when css = extraction_details['css']
                  nodes = doc.css(css)
                when xpath = extraction_details['xpath']
                  nodes = doc.xpath(xpath)
                else
                  error "'css' or 'xpath' is required for HTML or XML extraction"
                  return
                end
                unless Nokogiri::XML::NodeSet === nodes
                  error "The result of HTML/XML extraction was not a NodeSet"
                  return
                end
                result = nodes.map { |node|
                  if extraction_details['attr']
                    node.attr(extraction_details['attr'])
                  elsif extraction_details['text']
                    node.text()
                  else
                    error "'attr' or 'text' is required on HTML or XML extraction patterns"
                    return
                  end
                }
                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              end
              output[name] = result
            end

            # Every extractor must have matched the same number of times.
            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq
            if num_unique_lengths.length != 1
              error "Got an uneven number of matches for '#{name}': #{options['extract'].inspect}"
              return
            end

            old_events = previous_payloads num_unique_lengths.first
            num_unique_lengths.first.times do |index|
              result = {}
              options['extract'].keys.each do |name|
                result[name] = output[name][index]
                if name.to_s == 'url'
                  # Resolve relative URLs against the fetched page's final URL.
                  result[name] = URI.join(response.effective_url, result[name]).to_s if (result[name] =~ URI::DEFAULT_PARSER.regexp[:ABS_URI]).nil?
                end
              end

              if store_payload!(old_events, result)
                log "Storing new parsed result for '#{name}': #{result.inspect}"
                create_event :payload => result
              end
            end
          end
        end

        hydra.queue request
      end

      # Run all queued requests at once so multiple URLs are fetched concurrently.
      hydra.run
    end

    private

    # This method returns true if the result should be stored as a new event.
    # If mode is set to 'on_change', this method may return false and update an existing
    # event to expire further in the future.
    def store_payload!(old_events, result)
      if !options['mode'].present?
        return true
      elsif options['mode'].to_s == "all"
        return true
      elsif options['mode'].to_s == "on_change"
        result_json = result.to_json
        old_events.each do |old_event|
          if old_event.payload.to_json == result_json
            old_event.expires_at = new_event_expiration_date
            old_event.save!
            return false
          end
        end
        return true
      end
      raise "Illegal options[mode]: " + options['mode'].to_s
    end

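    # Returns the most recent events to check against for uniqueness, or nil
    # when `mode` is not "on_change" (in which case no comparison is needed).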
    def previous_payloads(num_events)
      if options['uniqueness_look_back'].present?
        look_back = options['uniqueness_look_back'].to_i
      else
        # Larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK
        look_back = UNIQUENESS_FACTOR * num_events
        if look_back < UNIQUENESS_LOOK_BACK
          look_back = UNIQUENESS_LOOK_BACK
        end
      end
      events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
    end

    def extract_full_json?
      !options['extract'].present? && extraction_type == "json"
    end

    # Use the explicit `type` option if given; otherwise infer the parser
    # from the URL's file extension, defaulting to HTML.
    def extraction_type
      (options['type'] || begin
        if options['url'] =~ /\.(rss|xml)$/i
          "xml"
        elsif options['url'] =~ /\.json$/i
          "json"
        else
          "html"
        end
      end).to_s
    end

    def parse(data)
      case extraction_type
      when "xml"
        Nokogiri::XML(data)
      when "json"
        JSON.parse(data)
      when "html"
        Nokogiri::HTML(data)
      else
        raise "Unknown extraction type #{extraction_type}"
      end
    end

    # Returns true only for values that parse as strictly positive integers.
    def is_positive_integer?(value)
      begin
        Integer(value) > 0
      rescue
        false
      end
    end
  end
end