diff --git a/nais/alerterator-prod.yml b/nais/alerterator-prod.yml
index 1da096c..1e09648 100644
--- a/nais/alerterator-prod.yml
+++ b/nais/alerterator-prod.yml
@@ -1,56 +1,81 @@
-apiVersion: nais.io/v1
-kind: Alert
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
 metadata:
   name: {{app}}-alerts
   labels:
     team: {{team}}
   namespace: {{namespace}}
 spec:
-  receivers:
-    slack:
-      channel: {{slack-channel}}
-      prependText: "{{{slack-notify-type}}}"
-  alerts:
-    - alert: Applikasjon nede
-      severity: danger
-      expr: kube_deployment_status_replicas_available{deployment="{{app}}"} == 0
-      for: 2m
-      description: "App \{{ $labels.app }} er nede i namespace \{{ $labels.kubernetes_namespace }}"
-      action: "`kubectl describe pod \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }} -c \{{ $labels.app }}` for events, og `kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }} -c \{{ $labels.app }}` for logger"
+  groups:
+    - name: {{app}}-alerts
+      rules:
+        # Fires when the deployment has had zero available replicas for 5 minutes.
+        - alert: Applikasjon nede
+          expr: kube_deployment_status_replicas_available{deployment="{{app}}"} == 0
+          for: 5m
+          annotations:
+            consequence: Ingen søknader blir mellomlagret.
+            action: "`kubectl describe pod <podname>` -> `kubectl logs <podname>`"
+            summary: "App \{{ $labels.deployment }} er nede i namespace \{{ $labels.namespace }}"
+          labels:
+            namespace: {{namespace}}
+            severity: danger
 
-    - alert: høy andel warning i logger
-      severity: warning
-      expr: sum by (log_app) (increase(logd_messages_total{log_app="{{app}}",log_level=~"Warning"}[3m])) > 100
-      for: 1m
-      action: "Sjekk loggene til app \{{ $labels.log_app }} i namespace \{{ $labels.log_namespace }}, for å se hvorfor det er så mye warnings"
+        # Fires when error-level log events keep occurring (3m window) for 5 minutes.
+        - alert: Høy andel error i logger
+          expr: sum by (app, container, pod, namespace) (floor(increase(logback_events_total{app="{{app}}", level="error"} [3m]))) > 0
+          for: 5m
+          annotations:
+            action: "`kubectl logs \{{ $labels.pod }} -c \{{ $labels.container }} -n \{{ $labels.namespace }}`"
+            summary: "Høy andel error i logger for app \{{ $labels.app }} i namespace \{{ $labels.namespace }}"
+          labels:
+            namespace: {{namespace}}
+            severity: danger
 
-    - alert: høy andel error i logger
-      severity: danger
-      expr: sum by (log_app) (increase(logd_messages_total{log_app="{{app}}",log_level=~"Error"}[3m])) > 0
-      for: 1m
-      action: "Sjekk loggene til app \{{ $labels.log_app }} i namespace \{{ $labels.log_namespace }}, for å se hvorfor det er så mye feil"
+        # Fires when warning-level log events keep occurring (3m window) for 5 minutes.
+        - alert: Høy andel warning i logger
+          expr: sum by (app, container, pod, namespace) (floor(increase(logback_events_total{app="{{app}}", level="warning"} [3m]))) > 0
+          for: 5m
+          annotations:
+            action: "`kubectl logs \{{ $labels.pod }} -c \{{ $labels.container }} -n \{{ $labels.namespace }}`"
+            summary: "Høy andel warning i logger for app \{{ $labels.app }} i namespace \{{ $labels.namespace }}"
+          labels:
+            namespace: {{namespace}}
+            severity: warning
 
-    - alert: Høy andel HTTP serverfeil (5xx responser)
-      severity: danger
-      expr: floor(increase(http_server_requests_seconds_count{status=~"5.*", app="{{app}}"}[3m])) > 1
-      for: 1m
-      description: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
-                    Grunn:\n ```\{{ $labels.problem_details }}```\n
-                    Sjekk loggene for å se hvorfor dette feiler."
-      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }} -c \{{ $labels.app }}`"
+        # Fires when more than one 5xx response is counted in a 3m window, for 1 minute.
+        - alert: Høy andel HTTP serverfeil (5xx responser)
+          expr: floor(increase(http_server_requests_seconds_count{status=~"5.*", app="{{app}}"}[3m])) > 1
+          for: 1m
+          annotations:
+            summary: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
+                      Grunn:\n ```\{{ $labels.problem_details }}```\n
+                      Sjekk loggene for å se hvorfor dette feiler."
+            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
+          labels:
+            namespace: {{namespace}}
+            severity: danger
 
-    - alert: Høy andel HTTP klientfeil (4xx responser)
-      severity: danger
-      expr: floor(increase(http_server_requests_seconds_count{status=~"4.*", status!~"404|401|403", app="{{app}}"}[3m])) > 0
-      for: 1m
-      description: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
-                    Grunn:\n ```\{{ $labels.problem_details }}```\n
-                    Sjekk loggene for å se hvorfor dette feiler"
-      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }} -c \{{ $labels.app }}`"
+        # Fires on 4xx responses other than 401/403/404 in a 3m window, for 1 minute.
+        - alert: Høy andel HTTP klientfeil (4xx responser)
+          expr: floor(increase(http_server_requests_seconds_count{status=~"4.*", status!~"404|401|403", app="{{app}}"}[3m])) > 0
+          for: 1m
+          annotations:
+            summary: "Følgende request feilet: `Status \{{ $labels.status }} - \{{ $labels.method }} \{{ $labels.route }}`.\n
+                      Grunn:\n ```\{{ $labels.problem_details }}```\n
+                      Sjekk loggene for å se hvorfor dette feiler."
+            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
+          labels:
+            namespace: {{namespace}}
+            severity: danger
 
-    - alert: Helsesjekk feiler
-      expr: floor(increase(http_server_requests_seconds_count{status!~"200", uri="/actuator/health", app="{{app}}"}[3m])) > 0
-      severity: warning
-      for: 1m
-      desription: "Sjekk loggene for å se hvorfor helsesjekken feiler.`"
-      action: "`kubectl logs \{{ $labels.kubernetes_pod_name }} -n \{{ $labels.kubernetes_namespace }} -c \{{ $labels.app }}`"
+        # Fires on non-200 responses from /actuator/health in a 3m window, for 2 minutes.
+        - alert: Helsesjekk feiler
+          expr: floor(increase(http_server_requests_seconds_count{status!~"200", uri="/actuator/health", app="{{app}}"}[3m])) > 0
+          for: 2m
+          annotations:
+            summary: "Sjekk loggene for å se hvorfor helsesjekken feiler."
+            action: "`kubectl logs \{{ $labels.pod }} -n \{{ $labels.namespace }} -c \{{ $labels.app }}`"
+          labels:
+            namespace: {{namespace}}
+            severity: danger