Prometheus alerts #40

Merged (1 commit, Oct 16, 2017)
@@ -77,6 +77,11 @@ def verify_prometheus_credentials
client.prometheus_try_connect
end

def verify_prometheus_alerts_credentials
ensure_monitoring_manager
monitoring_manager.verify_credentials
end

# UI methods for determining availability of fields
def supports_port?
true
@@ -131,6 +136,7 @@ def authentications_to_validate
at = [:bearer]
at << :hawkular if has_authentication_type?(:hawkular)
at << :prometheus if has_authentication_type?(:prometheus)
at << :prometheus_alerts if has_authentication_type?(:prometheus_alerts)
at
end

@@ -148,6 +154,8 @@ def verify_credentials(auth_type = nil, options = {})
verify_hawkular_credentials
elsif options[:auth_type].to_s == "prometheus"
verify_prometheus_credentials
elsif options[:auth_type].to_s == "prometheus_alerts"
verify_prometheus_alerts_credentials
else
with_provider_connection(options, &:api_valid?)
end
@@ -166,7 +174,7 @@ def ensure_authentications_record
end

def supported_auth_types
%w(default password bearer hawkular prometheus)
%w(default password bearer hawkular prometheus prometheus_alerts)
end

def supports_authentication?(authtype)
@@ -1,5 +1,7 @@
module ManageIQ::Providers
class Kubernetes::MonitoringManager < ManageIQ::Providers::MonitoringManager
require_nested :EventCatcher

include ManageIQ::Providers::Kubernetes::MonitoringManagerMixin

belongs_to :parent_manager,
@@ -14,5 +16,9 @@ def self.ems_type
def self.description
@description ||= "Kubernetes Monitor".freeze
end

def self.event_monitor_class
ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher
end
end
end
@@ -0,0 +1,13 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher < ManageIQ::Providers::BaseManager::EventCatcher
require_nested :Runner
require_nested :RunnerMixin
require_nested :Stream

def self.ems_class
ManageIQ::Providers::Kubernetes::MonitoringManager
end

def self.settings_name
:event_catcher_prometheus
end
end
@@ -0,0 +1,3 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Runner < ManageIQ::Providers::BaseManager::EventCatcher::Runner
include ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::RunnerMixin
end
@@ -0,0 +1,118 @@
module ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::RunnerMixin
extend ActiveSupport::Concern

# This module is shared between:
# - Kubernetes::MonitoringManager::EventCatcher
# - Openshift::MonitoringManager::EventCatcher

def event_monitor_handle
@event_monitor_handle ||= ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Stream.new(@ems)
end

def reset_event_monitor_handle
@event_monitor_handle = nil
end

def stop_event_monitor
@event_monitor_handle.stop unless @event_monitor_handle.nil?
Contributor: @moolitayer there's a codeclimate comment to remember here for future fixes/refactorings: "Use safe navigation (&.) instead of checking if an object exists before calling the method."

Author: See this comment: #40 (comment)

Contributor: @moolitayer why can't you use try meanwhile?
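A minimal sketch of the two alternatives raised in this thread, assuming @event_monitor_handle may be nil; neither form is part of the merged diff, which keeps the explicit nil check:

    # Safe navigation (Ruby >= 2.3): #stop is only called when the handle is non-nil.
    @event_monitor_handle&.stop

    # ActiveSupport's Object#try achieves the same and also works on older Rubies.
    @event_monitor_handle.try(:stop)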

rescue => err
$cn_monitoring_log.error("Event Monitor error [#{err.message}]")
$cn_monitoring_log.error("Error details: [#{err.details}]")
$cn_monitoring_log.log_backtrace(err)
ensure
reset_event_monitor_handle
end

def monitor_events
$cn_monitoring_log.info("[#{self.class.name}] Event Monitor started")
@target_ems_id = @ems.parent_manager.id
event_monitor_handle.start
event_monitor_running
event_monitor_handle.each_batch do |events|
@queue.enq(events) unless events.blank?
sleep_poll_normal
end
ensure
reset_event_monitor_handle
end

def queue_event(event)
event_hash = extract_event_data(event)
if event_hash
$cn_monitoring_log.info("Queuing event [#{event_hash}]")
EmsEvent.add_queue("add", @target_ems_id, event_hash)
end
end

def extract_event_data(event)
# EXAMPLE:
#
# {
# "annotations": {
# "message": "Node ocp-compute01.10.35.48.236.nip.io is down",
# "severity": "HIGH",
# "source": "ManageIQ",
# "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
# },
# "endsAt": "0001-01-01T00:00:00Z",
# "generatorURL": "http://prometheus-4018548653-w3str:9090/graph?g0.expr=container_fs_usage_bytes%7Bcontainer_name%3D%22%22%2Cdevice%3D%22%2Fdev%2Fmapper%2Fvg0-lv_root%22%7D+%3E+4e%2B07&g0.tab=0",
# "labels": {
# "alertname": "Node down",
# "beta_kubernetes_io_arch": "amd64",
# "beta_kubernetes_io_os": "linux",
# "device": "/dev/mapper/vg0-lv_root",
# "id": "/",
# "instance": "ocp-compute01.10.35.48.236.nip.io",
# "job": "kubernetes-nodes",
# "kubernetes_io_hostname": "ocp-compute01.10.35.48.236.nip.io",
# "region": "primary",
# "zone": "default"
# },
# "startsAt": "2017-07-17T12:18:00.457154718Z",
# "status": "firing",
# "generationID" : "323e0863-f501-4896-b7dc-353cf863597d", # Added in stream
# "index": 1, # Added in stream
# },
event = event.dup

annotations = event["annotations"]
event[:url] = annotations["url"]
event[:severity] = parse_severity(annotations["severity"])
labels = event["labels"]
event[:ems_ref] = incident_identifier(event, labels, annotations)
event[:resolved] = event["status"] == "resolved"
timestamp = event["timestamp"]

target = find_target(labels)
Contributor: @moolitayer all the target references below (e.g. target.id, etc.) are crashing when target is nil.
cc @ilackarms @joelddiaz
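One possible guard, sketched here only to illustrate the concern above and not part of this PR: extract_event_data could return early when find_target comes back nil, since queue_event above already skips a nil event_hash:

    target = find_target(labels)
    return if target.nil? # drop the alert instead of crashing on target.id below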

{
:ems_id => @cfg[:ems_id],
:source => "DATAWAREHOUSE",
Member: Is DATAWAREHOUSE the right source for this? I thought this was a MonitoringManager not a DatawarehouseManager?

Author: @zgalor had the same concern. See #40 (comment)

Member (@agrare, Aug 8, 2017): :source => "PROMETHEUS" might be more obvious

Member: Ah I missed that, would it make sense to fix that first?

Author: Unfortunately that change will take weeks to merge and I don't want it blocking this PR. I'll update the code here once possible.

:timestamp => timestamp,
:event_type => "datawarehouse_alert",
:target_type => target.class.name,
:target_id => target.id,
:container_node_id => target.id,
:container_node_name => target.name,
:message => annotations["message"],
:full_data => event.to_h
}
end

def find_target(labels)
instance = ContainerNode.find_by(:name => labels["instance"], :ems_id => @target_ems_id)
$cn_monitoring_log.error("Could not find alert target from labels: [#{labels}]") unless instance
instance
end

def parse_severity(severity)
MiqAlertStatus::SEVERITY_LEVELS.find { |x| x == severity.to_s.downcase } || "error"
end

def incident_identifier(event, labels, annotations)
# When event b resolves event a, they both have the same startsAt.
# Labels are added to avoid having two incidents starting at the same time.
Digest::SHA256.hexdigest(
[event["startsAt"], annotations["url"], labels["instance"], labels["alertname"]].join('|')
)
end
end
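For illustration only (all values below are hypothetical): two alerts that share the same startsAt but come from different alert definitions hash to different identifiers, which is what the label salt in incident_identifier is for:

    start = "2017-07-17T12:18:00Z"
    ann   = { "url" => "https://example.com/alert" }

    incident_identifier({ "startsAt" => start }, { "instance" => "node-a", "alertname" => "Node down" }, ann)
    # => SHA256 hex digest of "2017-07-17T12:18:00Z|https://example.com/alert|node-a|Node down"

    incident_identifier({ "startsAt" => start }, { "instance" => "node-a", "alertname" => "High disk usage" }, ann)
    # => SHA256 hex digest of "2017-07-17T12:18:00Z|https://example.com/alert|node-a|High disk usage" (different ems_ref)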
@@ -0,0 +1,79 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Stream
def initialize(ems)
@ems = ems
end

def start
@collecting_events = true
end

def stop
@collecting_events = false
end

def each_batch
while @collecting_events
yield(fetch)
end
rescue EOFError => err
$cn_monitoring_log.info("Monitoring connection closed #{err}")
end

def fetch
unless @current_generation
@current_generation, @current_index = last_position
end
$cn_monitoring_log.info("Fetching alerts. Generation: [#{@current_generation}/#{@current_index}]")

response = @ems.connect.get do |req|
req.params['generationID'] = @current_generation
req.params['fromIndex'] = @current_index
end
# {
# "generationID":"323e0863-f501-4896-b7dc-353cf863597d",
# "messages":[
# ...
# ]
# }
alert_list = response.body
alerts = []
@current_generation = alert_list["generationID"]
return alerts if alert_list['messages'].blank?
alert_list["messages"].each do |message|
@current_index = message['index']
unless message.fetch_path("data", "commonAnnotations", "miqTarget") == 'ContainerNode'
$cn_monitoring_log.info("Skipping alert due to missing annotation")
next
end
message["data"]["alerts"].each_with_index do |alert, i|
alert['generationID'] = @current_generation
alert['index'] = @current_index
alert['timestamp'] = timestamp_indent(alert, i)
alerts << alert
end
@current_index += 1
end
$cn_monitoring_log.info("[#{alerts.size}] new alerts. New generation: [#{@current_generation}/#{@current_index}]")
$cn_monitoring_log.debug(alerts)
alerts
end

def timestamp_indent(alert, indent)
# This is currently needed due to a uniqueness constraint on ems events
# see https://github.com/ManageIQ/manageiq/pull/15719
# Prometheus alert timestamps equal the evaluation cycle start timestamp.
# We add an artificial increment in the least significant digit (microseconds), since several alerts
# for different entities or from different alert definitions are likely to have the same timestamp.
timestamp = alert["status"] == 'resolved' ? alert["endsAt"] : alert["startsAt"]
Time.zone.at((Time.parse(timestamp).to_f + (0.000001 * indent)))
end

def last_position
last_event = @ems.parent_manager.ems_events.last || OpenStruct.new(:full_data => {})
last_index = last_event.full_data['index']
[
last_event.full_data['generationID'].to_s,
last_index ? last_index + 1 : 0
]
end
end
@@ -1,16 +1,19 @@
module ManageIQ::Providers::Kubernetes::MonitoringManagerMixin
extend ActiveSupport::Concern

ENDPOINT_ROLE = :prometheus_alerts
DEFAULT_PORT = 9093

included do
delegate :authentications,
delegate :authentication_check,
:authentication_for_summary,
:authentication_status,
:authentication_status_ok,
:authentication_token,
:authentications,
:endpoints,
:zone,
:to => :parent_manager,
:allow_nil => true

default_value_for :port do |manager|
manager.port || DEFAULT_PORT
end
end

module ClassMethods
@@ -19,52 +22,44 @@ def raw_connect(hostname, port, options)
end
end

def default_endpoint
endpoints && endpoints.detect { |x| x.role == ENDPOINT_ROLE.to_s }
end

def supports_port?
true
end

# Authentication related methods, see AuthenticationMixin
def authentications_to_validate
[ENDPOINT_ROLE]
end

def required_credential_fields(_type)
[:auth_key]
end

def default_authentication_type
ENDPOINT_ROLE
def prometheus_alerts_endpoint
connection_configurations.prometheus_alerts.try(:endpoint)
end

def verify_credentials(auth_type = nil, options = {})
with_provider_connection(options.merge(:auth_type => auth_type)) do |conn|
# TODO: move to a client method, once we have one
def verify_credentials(_auth_type = nil, _options = {})
with_provider_connection do |conn|
conn.get.body.key?('generationID')
end
rescue OpenSSL::X509::CertificateError => err
raise MiqException::MiqInvalidCredentialsError, "SSL Error: #{err.message}"
rescue Faraday::ParsingError
raise MiqException::MiqUnreachableError, 'Unexpected Response'
rescue Faraday::ClientError => err
raise MiqException::MiqUnreachableError, err.message
rescue StandardError => err
raise MiqException::MiqUnreachableError, err.message, err.backtrace
end

def connect(options = {})
self.class.raw_connect(
options[:hostname] || hostname,
options[:port] || port,
:bearer => options[:bearer] || authentication_token(options[:auth_type] || 'bearer'),
options[:hostname] || prometheus_alerts_endpoint.hostname,
options[:port] || prometheus_alerts_endpoint.port,
:bearer => options[:bearer] || authentication_token, # goes to the default endpoint
:verify_ssl => options[:verify_ssl] || verify_ssl,
:cert_store => options[:cert_store] || ssl_cert_store
)
end

def default_authentication_type
ENDPOINT_ROLE
end

def ssl_cert_store
# nil === use system CA bundle
default_endpoint.try(:ssl_cert_store)
prometheus_alerts_endpoint.try(:ssl_cert_store)
end

def verify_ssl
default_endpoint.verify_ssl?
prometheus_alerts_endpoint.verify_ssl?
end
end
@@ -1,5 +1,6 @@
class ManageIQ::Providers::Kubernetes::Prometheus::MessageBufferClient
require 'faraday'
require 'faraday_middleware'

def initialize(host, port)
@host = host
2 changes: 2 additions & 0 deletions config/settings.yml
@@ -61,6 +61,8 @@
:event_catcher:
:event_catcher_kubernetes:
:poll: 1.seconds
:event_catcher_prometheus:
:poll: 20.seconds
:queue_worker_base:
:ems_metrics_collector_worker:
:ems_metrics_collector_worker_kubernetes: