
website_agent.rb 11KB

require 'nokogiri'
require 'faraday'
require 'faraday_middleware'
require 'date'

module Agents
  class WebsiteAgent < Agent
    default_schedule "every_12h"

    UNIQUENESS_LOOK_BACK = 200
    UNIQUENESS_FACTOR = 3

    description <<-MD
      The WebsiteAgent scrapes a website, XML document, or JSON feed and creates Events based on the results.

      Specify a `url` and select a `mode` for when to create Events based on the scraped data, either `all` or `on_change`.

      `url` can be a single url, or an array of urls (for example, for multiple pages with the exact same structure but different content to scrape).

      The `type` value can be `xml`, `html`, or `json`.

      To tell the Agent how to parse the content, specify `extract` as a hash with keys naming the extractions and values of hashes.

      When parsing HTML or XML, these sub-hashes specify how to extract with either a `css` CSS selector or an `xpath` XPath expression, and either `"text": true` or `attr` pointing to an attribute name to grab. An example:

          "extract": {
            "url": { "css": "#comic img", "attr": "src" },
            "title": { "css": "#comic img", "attr": "title" },
            "body_text": { "css": "div.main", "text": true }
          }

      When parsing JSON, these sub-hashes specify [JSONPaths](http://goessner.net/articles/JsonPath/) to the values that you care about. For example:

          "extract": {
            "title": { "path": "results.data[*].title" },
            "description": { "path": "results.data[*].description" }
          }

      Note that for all of the formats, whatever you extract MUST have the same number of matches for each extractor. E.g., if you're extracting rows, all extractors must match all rows. For generating CSS selectors, something like [SelectorGadget](http://selectorgadget.com) may be helpful.

      Can be configured to use HTTP basic auth by including the `basic_auth` parameter with `"username:password"`, or `["username", "password"]`.
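      For example, either of these forms is accepted:

          "basic_auth": "username:password"
          "basic_auth": ["username", "password"]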
      Set `expected_update_period_in_days` to the maximum amount of time that you'd expect to pass between Events being created by this Agent. This is only used to set the "working" status.

      Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance). This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results.

      Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset.

      Set `user_agent` to a custom User-Agent name if the website does not like the default value ("Faraday v#{Faraday::VERSION}").

      The WebsiteAgent can also scrape based on incoming events. It will scrape the url contained in the `url` key of the incoming event payload.
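      For instance, an incoming event with the payload below would cause this Agent to fetch and scrape that page:

          { "url": "http://xkcd.com/" }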
    MD
    event_description do
      "Events will have the fields you specified. Your options look like:\n\n #{Utils.pretty_print options['extract']}"
    end

    def working?
      event_created_within?(options['expected_update_period_in_days']) && !recent_error_logs?
    end
    def default_options
      {
        'expected_update_period_in_days' => "2",
        'url' => "http://xkcd.com",
        'type' => "html",
        'mode' => "on_change",
        'extract' => {
          'url' => { 'css' => "#comic img", 'attr' => "src" },
          'title' => { 'css' => "#comic img", 'attr' => "alt" },
          'hovertext' => { 'css' => "#comic img", 'attr' => "title" }
        }
      }
    end
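
    # With the defaults above, a successful check of http://xkcd.com would emit
    # one event per match, shaped roughly like this (values illustrative, not
    # real xkcd data):
    #
    #   { 'url' => 'http://imgs.xkcd.com/comics/example.png',
    #     'title' => 'Example', 'hovertext' => 'Alt text of the comic' }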

    def validate_options
      # Check for required fields
      errors.add(:base, "url and expected_update_period_in_days are required") unless options['expected_update_period_in_days'].present? && options['url'].present?
      if !options['extract'].present? && extraction_type != "json"
        errors.add(:base, "extract is required for all types except json")
      end

      # Check for optional fields
      if options['mode'].present?
        errors.add(:base, "mode must be set to on_change or all") unless %w[on_change all].include?(options['mode'])
      end

      if options['expected_update_period_in_days'].present?
        errors.add(:base, "Invalid expected_update_period_in_days format") unless is_positive_integer?(options['expected_update_period_in_days'])
      end

      if options['uniqueness_look_back'].present?
        errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back'])
      end

      if (encoding = options['force_encoding']).present?
        case encoding
        when String
          begin
            Encoding.find(encoding)
          rescue ArgumentError
            errors.add(:base, "Unknown encoding: #{encoding.inspect}")
          end
        else
          errors.add(:base, "force_encoding must be a string")
        end
      end

      if options['user_agent'].present?
        errors.add(:base, "user_agent must be a string") unless options['user_agent'].is_a?(String)
      end

      begin
        basic_auth_credentials()
      rescue => e
        errors.add(:base, e.message)
      end
    end
    def check
      check_url options['url']
    end

    def check_url(in_url)
      return unless in_url.present?

      Array(in_url).each do |url|
        log "Fetching #{url}"
        response = faraday.get(url)
        if response.success?
          body = response.body
          if (encoding = options['force_encoding']).present?
            body = body.encode(Encoding::UTF_8, encoding)
          end
          doc = parse(body)

          if extract_full_json?
            if store_payload!(previous_payloads(1), doc)
              log "Storing new result for '#{name}': #{doc.inspect}"
              create_event :payload => doc
            end
          else
            output = {}
            options['extract'].each do |name, extraction_details|
              if extraction_type == "json"
                result = Utils.values_at(doc, extraction_details['path'])
                log "Extracting #{extraction_type} at #{extraction_details['path']}: #{result}"
              else
                case
                when css = extraction_details['css']
                  nodes = doc.css(css)
                when xpath = extraction_details['xpath']
                  nodes = doc.xpath(xpath)
                else
                  error '"css" or "xpath" is required for HTML or XML extraction'
                  return
                end

                unless Nokogiri::XML::NodeSet === nodes
                  error "The result of HTML/XML extraction was not a NodeSet"
                  return
                end

                result = nodes.map { |node|
                  if extraction_details['attr']
                    node.attr(extraction_details['attr'])
                  elsif extraction_details['text']
                    node.text()
                  else
                    error '"attr" or "text" is required on HTML or XML extraction patterns'
                    return
                  end
                }
                log "Extracting #{extraction_type} at #{xpath || css}: #{result}"
              end
              output[name] = result
            end

            num_unique_lengths = options['extract'].keys.map { |name| output[name].length }.uniq

            if num_unique_lengths.length != 1
              error "Got an uneven number of matches for #{name}: #{options['extract'].inspect}"
              return
            end

            old_events = previous_payloads num_unique_lengths.first
            num_unique_lengths.first.times do |index|
              result = {}
              options['extract'].keys.each do |name|
                result[name] = output[name][index]
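                # A value extracted under the key 'url' is resolved against the
                # final response URL, so e.g. a relative src of "/comics/foo.png"
                # scraped from http://xkcd.com becomes "http://xkcd.com/comics/foo.png".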
                if name.to_s == 'url'
                  result[name] = (response.env[:url] + result[name]).to_s
                end
              end

              if store_payload!(old_events, result)
                log "Storing new parsed result for '#{name}': #{result.inspect}"
                create_event :payload => result
              end
            end
          end
        else
          error "Failed: #{response.inspect}"
        end
      end
    end
    def receive(incoming_events)
      incoming_events.each do |event|
        url_to_scrape = event.payload['url']
        check_url(url_to_scrape) if url_to_scrape =~ /^https?:\/\//i
      end
    end
    private

    # This method returns true if the result should be stored as a new event.
    # If mode is set to 'on_change', this method may return false and update an existing
    # event to expire further in the future.
    def store_payload!(old_events, result)
      if !options['mode'].present?
        return true
      elsif options['mode'].to_s == "all"
        return true
      elsif options['mode'].to_s == "on_change"
        result_json = result.to_json
        old_events.each do |old_event|
          if old_event.payload.to_json == result_json
            old_event.expires_at = new_event_expiration_date
            old_event.save!
            return false
          end
        end
        return true
      end
      raise "Illegal options[mode]: " + options['mode'].to_s
    end
    def previous_payloads(num_events)
      if options['uniqueness_look_back'].present?
        look_back = options['uniqueness_look_back'].to_i
      else
        # Take the larger of UNIQUENESS_FACTOR * num_events and UNIQUENESS_LOOK_BACK:
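        # e.g. num_events = 10  -> look_back = max(3 * 10, 200)  = 200
        #      num_events = 100 -> look_back = max(3 * 100, 200) = 300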
        look_back = UNIQUENESS_FACTOR * num_events
        if look_back < UNIQUENESS_LOOK_BACK
          look_back = UNIQUENESS_LOOK_BACK
        end
      end
      events.order("id desc").limit(look_back) if options['mode'].present? && options['mode'].to_s == "on_change"
    end
    def extract_full_json?
      !options['extract'].present? && extraction_type == "json"
    end
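
    # When `type` is not set, the url's extension decides the parser: e.g.
    # "http://example.com/feed.rss" is parsed as xml, "http://example.com/data.json"
    # as json, and anything else as html. (The example.com urls are illustrative.)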
    def extraction_type
      (options['type'] || begin
        if options['url'] =~ /\.(rss|xml)$/i
          "xml"
        elsif options['url'] =~ /\.json$/i
          "json"
        else
          "html"
        end
      end).to_s
    end
    def parse(data)
      case extraction_type
      when "xml"
        Nokogiri::XML(data)
      when "json"
        JSON.parse(data)
      when "html"
        Nokogiri::HTML(data)
      else
        raise "Unknown extraction type #{extraction_type}"
      end
    end
    def is_positive_integer?(value)
      Integer(value) >= 0
    rescue ArgumentError, TypeError
      false
    end
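
    # Note: despite the name, "0" also passes (the check is >= 0), e.g.
    # is_positive_integer?("2") #=> true, is_positive_integer?("0") #=> true,
    # is_positive_integer?("2.5") #=> false.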
    def faraday
      @faraday ||= Faraday.new { |builder|
        if (user_agent = options['user_agent']).present?
          builder.headers[:user_agent] = user_agent
        end
        builder.use FaradayMiddleware::FollowRedirects
        builder.request :url_encoded
        if userinfo = basic_auth_credentials()
          builder.request :basic_auth, *userinfo
        end

        case backend = faraday_backend
        when :typhoeus
          require 'typhoeus/adapters/faraday'
        end
        builder.adapter backend
      }
    end
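
    # The HTTP backend defaults to typhoeus, but any adapter name Faraday knows
    # should work when set via the environment, e.g.:
    #
    #   FARADAY_HTTP_BACKEND=net_http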
    def faraday_backend
      ENV.fetch('FARADAY_HTTP_BACKEND', 'typhoeus').to_sym
    end
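
    # Normalizes the `basic_auth` option into a [username, password] pair.
    # Both of these forms would yield ["scott", "tiger"] (credentials illustrative):
    #
    #   "basic_auth" => "scott:tiger"
    #   "basic_auth" => ["scott", "tiger"]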
    def basic_auth_credentials
      case value = options['basic_auth']
      when nil, ''
        return nil
      when Array
        return value if value.size == 2
      when /:/
        return value.split(/:/, 2)
      end
      raise "bad value for basic_auth: #{value.inspect}"
    end
  end
end