Prometheus alerts #40
Changes from all commits
@@ -0,0 +1,13 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher < ManageIQ::Providers::BaseManager::EventCatcher
  require_nested :Runner
  require_nested :RunnerMixin
  require_nested :Stream

  def self.ems_class
    ManageIQ::Providers::Kubernetes::MonitoringManager
  end

  def self.settings_name
    :event_catcher_prometheus
  end
end
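As a quick illustration of how these two class methods are typically consumed — this sketch is hypothetical, based on general ManageIQ worker conventions rather than anything in this diff: settings_name keys the worker's entry in the settings tree, and ems_class ties the catcher to its manager.

# Hypothetical usage sketch (not part of the PR), assuming standard
# ManageIQ worker plumbing:
catcher = ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher

catcher.ems_class     # => ManageIQ::Providers::Kubernetes::MonitoringManager
catcher.settings_name # => :event_catcher_prometheus, the key under which this
                      #    worker's configuration (poll interval etc.) is looked up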
Original file line number | Diff line number | Diff line change
---|---|---
@@ -0,0 +1,3 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Runner < ManageIQ::Providers::BaseManager::EventCatcher::Runner
  include ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::RunnerMixin
end
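The Runner is deliberately a thin shell: per the comment inside RunnerMixin below, the mixin is shared with OpenShift. A hypothetical sketch of that counterpart class — it is not part of this diff — would mirror this file exactly:

# Hypothetical OpenShift counterpart (not in this PR), reusing the same mixin:
class ManageIQ::Providers::Openshift::MonitoringManager::EventCatcher::Runner < ManageIQ::Providers::BaseManager::EventCatcher::Runner
  include ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::RunnerMixin
end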
Original file line number | Diff line number | Diff line change
---|---|---
@@ -0,0 +1,118 @@
module ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::RunnerMixin
  extend ActiveSupport::Concern

  # This module is shared between:
  # - Kubernetes::MonitoringManager::EventCatcher
  # - Openshift::MonitoringManager::EventCatcher

  def event_monitor_handle
    @event_monitor_handle ||= ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Stream.new(@ems)
  end

  def reset_event_monitor_handle
    @event_monitor_handle = nil
  end

  def stop_event_monitor
    @event_monitor_handle.stop unless @event_monitor_handle.nil?
  rescue => err
    $cn_monitoring_log.error("Event Monitor error [#{err.message}]")
    # Not every exception responds to #details, so guard the call
    $cn_monitoring_log.error("Error details: [#{err.details}]") if err.respond_to?(:details)
    $cn_monitoring_log.log_backtrace(err)
  ensure
    reset_event_monitor_handle
  end

  def monitor_events
    $cn_monitoring_log.info("[#{self.class.name}] Event Monitor started")
    @target_ems_id = @ems.parent_manager.id
    event_monitor_handle.start
    event_monitor_running
    event_monitor_handle.each_batch do |events|
      @queue.enq(events) unless events.blank?
      sleep_poll_normal
    end
  ensure
    reset_event_monitor_handle
  end

  def queue_event(event)
    event_hash = extract_event_data(event)
    if event_hash
      $cn_monitoring_log.info("Queuing event [#{event_hash}]")
      EmsEvent.add_queue("add", @target_ems_id, event_hash)
    end
  end

  def extract_event_data(event)
    # EXAMPLE:
    #
    # {
    #   "annotations": {
    #     "message": "Node ocp-compute01.10.35.48.236.nip.io is down",
    #     "severity": "HIGH",
    #     "source": "ManageIQ",
    #     "url": "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    #   },
    #   "endsAt": "0001-01-01T00:00:00Z",
    #   "generatorURL": "http://prometheus-4018548653-w3str:9090/graph?g0.expr=container_fs_usage_bytes%7Bcontainer_name%3D%22%22%2Cdevice%3D%22%2Fdev%2Fmapper%2Fvg0-lv_root%22%7D+%3E+4e%2B07&g0.tab=0",
    #   "labels": {
    #     "alertname": "Node down",
    #     "beta_kubernetes_io_arch": "amd64",
    #     "beta_kubernetes_io_os": "linux",
    #     "device": "/dev/mapper/vg0-lv_root",
    #     "id": "/",
    #     "instance": "ocp-compute01.10.35.48.236.nip.io",
    #     "job": "kubernetes-nodes",
    #     "kubernetes_io_hostname": "ocp-compute01.10.35.48.236.nip.io",
    #     "region": "primary",
    #     "zone": "default"
    #   },
    #   "startsAt": "2017-07-17T12:18:00.457154718Z",
    #   "status": "firing",
    #   "generationID": "323e0863-f501-4896-b7dc-353cf863597d", # Added in stream
    #   "index": 1,                                             # Added in stream
    # }
    event = event.dup

    annotations = event["annotations"]
    event[:url] = annotations["url"]
    event[:severity] = parse_severity(annotations["severity"])
    labels = event["labels"]
    event[:ems_ref] = incident_identifier(event, labels, annotations)
    event[:resolved] = event["status"] == "resolved"
    timestamp = event["timestamp"]

    target = find_target(labels)
    return if target.nil? # queue_event skips events without an extracted hash

    {
      :ems_id              => @cfg[:ems_id],
      :source              => "DATAWAREHOUSE",
      :timestamp           => timestamp,
      :event_type          => "datawarehouse_alert",
      :target_type         => target.class.name,
      :target_id           => target.id,
      :container_node_id   => target.id,
      :container_node_name => target.name,
      :message             => annotations["message"],
      :full_data           => event.to_h
    }
  end

  def find_target(labels)
    instance = ContainerNode.find_by(:name => labels["instance"], :ems_id => @target_ems_id)
    $cn_monitoring_log.error("Could not find alert target from labels: [#{labels}]") unless instance
    instance
  end

  def parse_severity(severity)
    MiqAlertStatus::SEVERITY_LEVELS.find { |x| x == severity.to_s.downcase } || "error"
  end

  def incident_identifier(event, labels, annotations)
    # When event b resolves event a, both share the same startsAt.
    # Labels are included in the digest to avoid conflating two incidents
    # that start at the same time.
    Digest::SHA256.hexdigest(
      [event["startsAt"], annotations["url"], labels["instance"], labels["alertname"]].join('|')
    )
  end
end

Review comments on this diff:

- On `target = find_target(labels)`: @moolitayer all the …
- On `:source => "DATAWAREHOUSE"`:
  - Is …
  - @zgalor had the same concern. See #40 (comment)
  - Ah I missed that, would it make sense to fix that first?
  - Unfortunately that change will take weeks to merge and I don't want it blocking this PR.
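To make the mapping concrete, here is a sketch of what extract_event_data would produce for the sample alert embedded in the comment above. The ems_id value and the ContainerNode id are hypothetical; the rest follows directly from the code, assuming "high" is not among MiqAlertStatus::SEVERITY_LEVELS so the fallback severity applies.

# Hypothetical walk-through for the sample alert (assumes @cfg[:ems_id] == 1
# and that find_target resolved a ContainerNode with id 42).
# First, derived fields are merged into the event itself, so they end up
# under :full_data rather than at the top level:
#   event[:url]      # => "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
#   event[:severity] # => "error" (fallback; "high" assumed absent from
#                    #    MiqAlertStatus::SEVERITY_LEVELS)
#   event[:resolved] # => false ("status" is "firing")
#   event[:ems_ref]  # => Digest::SHA256.hexdigest(
#                    #      ["2017-07-17T12:18:00.457154718Z",
#                    #       "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
#                    #       "ocp-compute01.10.35.48.236.nip.io",
#                    #       "Node down"].join('|'))
# Then the returned hash is:
{
  :ems_id              => 1,
  :source              => "DATAWAREHOUSE",
  :timestamp           => event["timestamp"], # injected by Stream#timestamp_indent
  :event_type          => "datawarehouse_alert",
  :target_type         => "ContainerNode",
  :target_id           => 42,
  :container_node_id   => 42,
  :container_node_name => "ocp-compute01.10.35.48.236.nip.io",
  :message             => "Node ocp-compute01.10.35.48.236.nip.io is down",
  :full_data           => event.to_h
}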
Original file line number | Diff line number | Diff line change
---|---|---
@@ -0,0 +1,79 @@
class ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Stream
  def initialize(ems)
    @ems = ems
  end

  def start
    @collecting_events = true
  end

  def stop
    @collecting_events = false
  end

  def each_batch
    while @collecting_events
      yield(fetch)
    end
  rescue EOFError => err
    $cn_monitoring_log.info("Monitoring connection closed #{err}")
  end

  def fetch
    unless @current_generation
      @current_generation, @current_index = last_position
    end
    $cn_monitoring_log.info("Fetching alerts. Generation: [#{@current_generation}/#{@current_index}]")

    response = @ems.connect.get do |req|
      req.params['generationID'] = @current_generation
      req.params['fromIndex'] = @current_index
    end
    # {
    #   "generationID":"323e0863-f501-4896-b7dc-353cf863597d",
    #   "messages":[
    #     ...
    #   ]
    # }
    alert_list = response.body
    alerts = []
    @current_generation = alert_list["generationID"]
    return alerts if alert_list['messages'].blank?
    alert_list["messages"].each do |message|
      @current_index = message['index']
      unless message.fetch_path("data", "commonAnnotations", "miqTarget") == 'ContainerNode'
        $cn_monitoring_log.info("Skipping alert due to missing annotation")
        next
      end
      message["data"]["alerts"].each_with_index do |alert, i|
        alert['generationID'] = @current_generation
        alert['index'] = @current_index
        alert['timestamp'] = timestamp_indent(alert, i)
        alerts << alert
      end
      @current_index += 1
    end
    $cn_monitoring_log.info("[#{alerts.size}] new alerts. New generation: [#{@current_generation}/#{@current_index}]")
    $cn_monitoring_log.debug(alerts)
    alerts
  end

  def timestamp_indent(alert, indent)
    # This is currently needed due to a uniqueness constraint on ems events,
    # see https://github.com/ManageIQ/manageiq/pull/15719
    # A Prometheus alert's timestamp equals the start of the evaluation cycle,
    # so several alerts for different entities, or from different alert
    # definitions, are likely to share the same timestamp. We add an artificial
    # one-microsecond increment per alert index (the timestamp's least
    # significant digit) to keep them distinct.
    timestamp = alert["status"] == 'resolved' ? alert["endsAt"] : alert["startsAt"]
    Time.zone.at(Time.parse(timestamp).to_f + (0.000001 * indent))
  end

  def last_position
    last_event = @ems.parent_manager.ems_events.last || OpenStruct.new(:full_data => {})
    last_index = last_event.full_data['index']
    [
      last_event.full_data['generationID'].to_s,
      last_index ? last_index + 1 : 0
    ]
  end
end
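A hedged walk-through of one fetch cycle may help tie the pieces together. All values below are invented for illustration; the endpoint shape is taken from the response comment inside fetch, and `ems` stands in for a connected monitoring manager.

# Hypothetical fetch cycle. Suppose the newest stored EmsEvent carries
#   full_data = { "generationID" => "323e0863-...", "index" => 5 }
# then last_position returns ["323e0863-...", 6], and the next poll asks the
# alert buffer for messages with fromIndex=6 in that generation.
#
# Now suppose one message (index 6) arrives carrying two alerts that share
# "startsAt". timestamp_indent spreads them one microsecond apart so the
# EmsEvent uniqueness constraint is not violated:
stream = ManageIQ::Providers::Kubernetes::MonitoringManager::EventCatcher::Stream.new(ems)
alert  = { "status" => "firing", "startsAt" => "2017-07-17T12:18:00.000000Z" }

stream.timestamp_indent(alert, 0) # => 2017-07-17 12:18:00.000000 UTC
stream.timestamp_indent(alert, 1) # => 2017-07-17 12:18:00.000001 UTC

# With no stored events at all, last_position falls back to ["", 0] via the
# OpenStruct placeholder, i.e. start from the beginning of the current generation.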
Review comments on this diff:

- @moolitayer there's a codeclimate comment to remember here for future fixes/refactorings: …
- See this comment: #40 (comment)
- @moolitayer why can't you use `try` meanwhile?
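For context, here is one reading of the reviewer's `try` suggestion as applied to last_position — a hypothetical rewrite, not code from the PR, using ActiveSupport's Object#try to drop the OpenStruct placeholder:

# Hypothetical alternative to Stream#last_position using try:
def last_position
  full_data  = @ems.parent_manager.ems_events.last.try(:full_data) || {}
  last_index = full_data['index']
  [full_data['generationID'].to_s, last_index ? last_index + 1 : 0]
end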