Keine Beschreibung http://j1x-huginn.herokuapp.com

rss_agent.rb 3.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
require 'digest'
require 'rss'

require 'feed-normalizer'
  3. module Agents
  4. class RssAgent < Agent
  5. include WebRequestConcern
  6. cannot_receive_events!
  7. default_schedule "every_1d"
  8. description do
  9. <<-MD
  10. This Agent consumes RSS feeds and emits events when they change.
  11. (If you want to *output* an RSS feed, use the DataOutputAgent. Also, you can technically parse RSS and XML feeds
  12. with the WebsiteAgent as well. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).)
  13. Options:
  14. * `url` - The URL of the RSS feed.
  15. * `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s' `clean!` method to cleanup HTML in the feed. Set to `true` to use.
  16. * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
  17. * `headers` - When present, it should be a hash of headers to send with the request.
  18. * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
  19. * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
  20. MD
  21. end
  22. def default_options
  23. {
  24. 'expected_update_period_in_days' => "5",
  25. 'clean' => 'false',
  26. 'url' => "https://github.com/cantino/huginn/commits/master.atom"
  27. }
  28. end
  29. def working?
  30. event_created_within?((interpolated['expected_update_period_in_days'].presence || 10).to_i) && !recent_error_logs?
  31. end
  32. def validate_options
  33. errors.add(:base, "url is required") unless options['url'].present?
  34. unless options['expected_update_period_in_days'].present? && options['expected_update_period_in_days'].to_i > 0
  35. errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
  36. end
  37. validate_web_request_options!
  38. end
  39. def check
  40. response = faraday.get(interpolated['url'])
  41. if response.success?
  42. feed = FeedNormalizer::FeedNormalizer.parse(response.body)
  43. feed.clean! if interpolated['clean'] == 'true'
  44. created_event_count = 0
  45. feed.entries.each do |entry|
  46. if check_and_track(entry.id)
  47. created_event_count += 1
  48. create_event(:payload => {
  49. :id => entry.id,
  50. :date_published => entry.date_published,
  51. :last_updated => entry.last_updated,
  52. :urls => entry.urls,
  53. :description => entry.description,
  54. :content => entry.content,
  55. :title => entry.title,
  56. :authors => entry.authors,
  57. :categories => entry.categories
  58. })
  59. end
  60. end
  61. log "Fetched #{interpolated['url']} and created #{created_event_count} event(s)."
  62. else
  63. error "Failed to fetch #{interpolated['url']}: #{response.inspect}"
  64. end
  65. end
  66. protected
  67. def check_and_track(entry_id)
  68. memory['seen_ids'] ||= []
  69. if memory['seen_ids'].include?(entry_id)
  70. false
  71. else
  72. memory['seen_ids'].unshift entry_id
  73. memory['seen_ids'].pop if memory['seen_ids'].length > 500
  74. true
  75. end
  76. end
  77. end
  78. end