Keine Beschreibung http://j1x-huginn.herokuapp.com

rss_agent.rb 4.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  require 'digest'
  require 'rss'
  require 'feed-normalizer'
  module Agents
    # Polls one or more RSS/Atom feeds and emits one event for every entry
    # whose id has not been seen before.
    #
    # Feed parsing is delegated to feed-normalizer. HTTP access goes through the
    # `faraday` helper, which (like `validate_web_request_options!`) is not
    # defined in this file -- presumably it comes from WebRequestConcern.
    class RssAgent < Agent
      include WebRequestConcern

      # Maximum number of entry ids remembered for de-duplication.
      SEEN_IDS_LIMIT = 500

      cannot_receive_events!
      can_dry_run!
      default_schedule "every_1d"

      description do
        <<-MD
          This Agent consumes RSS feeds and emits events when they change.

          This Agent is fairly simple, using [feed-normalizer](https://github.com/aasmith/feed-normalizer) as a base. For complex feeds
          with additional field types, we recommend using a WebsiteAgent. See [this example](https://github.com/cantino/huginn/wiki/Agent-configuration-examples#itunes-trailers).

          If you want to *output* an RSS feed, use the DataOutputAgent.

          Options:

            * `url` - The URL of the RSS feed (an array of URLs can also be used; items with identical guids across feeds will be considered duplicates).
            * `clean` - Attempt to use [feed-normalizer](https://github.com/aasmith/feed-normalizer)'s' `clean!` method to cleanup HTML in the feed. Set to `true` to use.
            * `expected_update_period_in_days` - How often you expect this RSS feed to change. If more than this amount of time passes without an update, the Agent will mark itself as not working.
            * `headers` - When present, it should be a hash of headers to send with the request.
            * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`.
            * `disable_ssl_verification` - Set to `true` to disable ssl verification.
            * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}").
            * `max_items_per_feed` - Limit number of items parsed (events created) per feed.
        MD
      end

      # Options used when a new agent of this type is created.
      #
      # @return [Hash{String => String}]
      def default_options
        {
          'expected_update_period_in_days' => "5",
          'clean' => 'false',
          'url' => "https://github.com/cantino/huginn/commits/master.atom"
        }
      end

      event_description <<-MD
        Events look like:

            {
              "id": "829f845279611d7925146725317b868d",
              "date_published": "2014-09-11 01:30:00 -0700",
              "last_updated": "Thu, 11 Sep 2014 01:30:00 -0700",
              "url": "http://example.com/...",
              "urls": [ "http://example.com/..." ],
              "description": "Some description",
              "content": "Some content",
              "title": "Some title",
              "authors": [ ... ],
              "categories": [ ... ]
            }
      MD

      # The agent counts as working when it created an event within the
      # expected update period (10 days if the option is blank) and has no
      # recent error logs.
      #
      # @return [Boolean]
      def working?
        expected_days = (interpolated['expected_update_period_in_days'].presence || 10).to_i
        event_created_within?(expected_days) && !recent_error_logs?
      end

      # Adds validation errors for a missing `url` or a missing / non-positive
      # `expected_update_period_in_days`, then runs the shared web-request
      # option validation.
      def validate_options
        errors.add(:base, "url is required") unless options['url'].present?

        unless options['expected_update_period_in_days'].present? && options['expected_update_period_in_days'].to_i > 0
          errors.add(:base, "Please provide 'expected_update_period_in_days' to indicate how many days can pass without an update before this Agent is considered to not be working")
        end

        validate_web_request_options!
      end

      # Fetches every configured feed URL in turn. Each URL is processed
      # independently, so a failure on one feed does not prevent the remaining
      # feeds from being checked.
      def check
        Array(interpolated['url']).each { |url| check_url(url) }
      end

      protected

      # Fetches a single feed and emits an event for each unseen entry,
      # honouring `max_items_per_feed` when it is set.
      #
      # Any StandardError raised while processing this URL is logged through
      # `error` instead of propagating, so the caller can move on to the next
      # feed.
      #
      # @param url [String] feed URL to fetch
      def check_url(url)
        response = faraday.get(url)
        unless response.success?
          error "Failed to fetch #{url}: #{response.inspect}"
          return
        end

        feed = FeedNormalizer::FeedNormalizer.parse(response.body)
        # NOTE(review): parse is expected to return nil when no parser can
        # handle the body -- confirm against the feed-normalizer version in use.
        if feed.nil?
          error "Failed to parse feed from #{url}"
          return
        end
        feed.clean! if interpolated['clean'] == 'true'

        # Test and read the same (interpolated) value so a templated option
        # that renders to an empty string is treated as "no limit".
        max_events = Integer(interpolated['max_items_per_feed']) if interpolated['max_items_per_feed'].present?
        created_event_count = 0

        sorted_entries(feed).each do |entry|
          break if max_events && max_events >= 0 && created_event_count >= max_events

          entry_id = get_entry_id(entry)
          next unless check_and_track(entry_id)

          created_event_count += 1
          create_event(payload: entry_payload(entry, entry_id))
        end

        log "Fetched #{url} and created #{created_event_count} event(s)."
      rescue => e
        error "Failed to process #{url}: #{e.class}: #{e.message}"
      end

      # Returns the feed's entries ordered by publication date, then by last
      # update. Entries with a missing date sort before entries that have one;
      # the leading 0/1 marker keeps nil from ever being compared with a real
      # date, which would make the sort raise.
      #
      # @param feed [#entries] parsed feed
      # @return [Array] entries, oldest first
      def sorted_entries(feed)
        feed.entries.sort_by do |entry|
          published = entry.date_published
          updated = entry.last_updated
          [published.nil? ? 0 : 1, published, updated.nil? ? 0 : 1, updated]
        end
      end

      # Builds the event payload for one feed entry.
      #
      # @param entry [Object] feed entry
      # @param entry_id [String] id computed by {#get_entry_id}
      # @return [Hash{Symbol => Object}]
      def entry_payload(entry, entry_id)
        {
          id: entry_id,
          date_published: entry.date_published,
          last_updated: entry.last_updated,
          url: entry.url,
          urls: entry.urls,
          description: entry.description,
          content: entry.content,
          title: entry.title,
          authors: entry.authors,
          categories: entry.categories
        }
      end

      # Returns the entry's own id when present, otherwise an MD5 digest of its
      # content. When the content is missing too, the digest is taken over the
      # url, title and description so the call does not raise on nil.
      #
      # @param entry [Object] feed entry
      # @return [String]
      def get_entry_id(entry)
        entry.id.presence || begin
          digest_source = entry.content
          # Only fall back when content is nil, so ids of entries that already
          # hashed successfully stay the same.
          digest_source = [entry.url, entry.title, entry.description].join("\n") if digest_source.nil?
          Digest::MD5.hexdigest(digest_source)
        end
      end

      # Records entry_id in memory['seen_ids'] unless it is already there,
      # keeping at most SEEN_IDS_LIMIT ids (newest first, oldest dropped).
      #
      # @param entry_id [String]
      # @return [Boolean] true if the id was new, false if it was already seen
      def check_and_track(entry_id)
        memory['seen_ids'] ||= []
        return false if memory['seen_ids'].include?(entry_id)

        memory['seen_ids'].unshift entry_id
        memory['seen_ids'].pop if memory['seen_ids'].length > SEEN_IDS_LIMIT
        true
      end
    end
  end