diff --git a/operations/tempo-mixin-compiled/alerts.yaml b/operations/tempo-mixin-compiled/alerts.yaml index 871b6492f35..707495dd43c 100644 --- a/operations/tempo-mixin-compiled/alerts.yaml +++ b/operations/tempo-mixin-compiled/alerts.yaml @@ -1,19 +1,6 @@ "groups": - "name": "tempo_alerts" "rules": - - "alert": "TempoRequestErrors" - "annotations": - "message": | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - "runbook_url": "https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors" - "expr": | - 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (cluster, namespace, job, route) - / - sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - > 10 - "for": "15m" - "labels": - "severity": "critical" - "alert": "TempoRequestLatency" "annotations": "message": | diff --git a/operations/tempo-mixin/alerts.libsonnet b/operations/tempo-mixin/alerts.libsonnet index 24acd6e5480..124e6bc4d53 100644 --- a/operations/tempo-mixin/alerts.libsonnet +++ b/operations/tempo-mixin/alerts.libsonnet @@ -4,25 +4,6 @@ { name: 'tempo_alerts', rules: [ - { - alert: 'TempoRequestErrors', - expr: ||| - 100 * sum(rate(tempo_request_duration_seconds_count{status_code=~"5.."}[1m])) by (%(group_by_job)s, route) - / - sum(rate(tempo_request_duration_seconds_count[1m])) by (%(group_by_job)s, route) - > 10 - ||| % $._config, - 'for': '15m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, - runbook_url: 'https://github.com/grafana/tempo/tree/main/operations/tempo-mixin/runbook.md#TempoRequestErrors', - }, - }, { alert: 'TempoRequestLatency', expr: ||| diff --git a/operations/tempo-mixin/runbook.md b/operations/tempo-mixin/runbook.md index 2e33f1e9309..17908eab6aa 100644 --- a/operations/tempo-mixin/runbook.md +++ b/operations/tempo-mixin/runbook.md @@ -1,8 +1,7 @@ # Runbook -This document should help with remediating operational issues in Tempo. +This document should help with remediation of operational issues in Tempo. -## TempoRequestErrors ## TempoRequestLatency Aside from obvious errors in the logs the only real lever you can pull here is scaling. Use the Reads or Writes dashboard @@ -281,4 +280,4 @@ The error "Unexpected error reloading meta for local block. Ignoring and continu meta.json. Repair the meta.json and then restart the ingester to successfully recover the block. Or if it is not able to be repaired then the block files can be simply deleted as the ingester has already started without it. As long as the replication factor is 2 or higher, then there will be no data loss as the -same data was also written to another ingester. \ No newline at end of file +same data was also written to another ingester.