diff --git a/pkg/alerting/receiver.go b/pkg/alerting/receiver.go
index 8b965ecbd..5b184a523 100644
--- a/pkg/alerting/receiver.go
+++ b/pkg/alerting/receiver.go
@@ -10,6 +10,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/cortexproject/cortex/pkg/util/flagext"
 	"github.com/go-kit/kit/log"
 	"github.com/go-kit/kit/log/level"
 	"github.com/gorilla/mux"
@@ -23,6 +24,8 @@ const (
 	namespace = "e2ealerting"
 )
 
+var defaultHistogramBuckets = []float64{5, 15, 30, 60, 90, 120, 240}
+
 // Receiver implements the Alertmanager webhook receiver. It evaluates the received alerts, finds if the alert holds an annonnation with a label of "time", and if it does computes now - time for a total duration.
 type Receiver struct {
 	logger log.Logger
@@ -42,9 +45,10 @@ type Receiver struct {
 
 // ReceiverConfig implements the configuration for the alertmanager webhook receiver
 type ReceiverConfig struct {
-	RoundtripLabel string
-	PurgeLookback  time.Duration
-	PurgeInterval  time.Duration
+	CustomHistogramBuckets flagext.StringSliceCSV
+	PurgeInterval          time.Duration
+	PurgeLookback          time.Duration
+	RoundtripLabel         string
 }
 
 // RegisterFlags registers the flags for the alertmanager webhook receiver
@@ -52,6 +56,7 @@ func (cfg *ReceiverConfig) RegisterFlags(f *flag.FlagSet) {
 	f.StringVar(&cfg.RoundtripLabel, "receiver.e2eduration-label", "", "Label name and value in the form 'name=value' to add for the Histogram that observes latency.")
 	f.DurationVar(&cfg.PurgeInterval, "receiver.purge-interval", 15*time.Minute, "How often should we purge the in-memory measured timestamps tracker.")
 	f.DurationVar(&cfg.PurgeLookback, "receiver.purge-lookback", 2*time.Hour, "Period at which measured timestamps will remain in-memory.")
+	f.Var(&cfg.CustomHistogramBuckets, "receiver.custom-histogram-buckets", "Comma-separated list of custom histogram buckets, in seconds. Defaults to 5,15,30,60,90,120,240.")
 }
 
 // NewReceiver returns an alertmanager webhook receiver
@@ -88,12 +93,27 @@ func NewReceiver(cfg ReceiverConfig, log log.Logger, reg prometheus.Registerer)
 		Help:      "Total number of failed evaluations made by the webhook receiver.",
 	})
 
+	var buckets []float64
+
+	if len(cfg.CustomHistogramBuckets) == 0 {
+		buckets = defaultHistogramBuckets
+	} else {
+		buckets = make([]float64, len(cfg.CustomHistogramBuckets))
+		for i, v := range cfg.CustomHistogramBuckets {
+			n, err := strconv.ParseFloat(v, 64)
+			if err != nil {
+				panic(err)
+			}
+			buckets[i] = n
+		}
+	}
+
 	r.roundtripDuration = promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
 		Namespace:   namespace,
 		Subsystem:   "webhook_receiver",
 		Name:        "end_to_end_duration_seconds",
 		Help:        "Time spent (in seconds) from scraping a metric to receiving an alert.",
-		Buckets:     []float64{5, 15, 30, 60, 90, 120, 240},
+		Buckets:     buckets,
 		ConstLabels: lbl,
 	}, []string{"alertname"})
 
@@ -106,6 +126,11 @@ func NewReceiver(cfg ReceiverConfig, log log.Logger, reg prometheus.Registerer)
 // RegisterRoutes registers the receiver API routes with the provided router.
 func (r *Receiver) RegisterRoutes(router *mux.Router) {
 	router.Path("/api/v1/receiver").Methods(http.MethodPost).Handler(http.HandlerFunc(r.measureLatency))
+	router.Path("/health").Methods(http.MethodGet).Handler(http.HandlerFunc(r.health))
+}
+
+func (r *Receiver) health(w http.ResponseWriter, req *http.Request) {
+	w.WriteHeader(http.StatusOK)
 }
 
 func (r *Receiver) measureLatency(w http.ResponseWriter, req *http.Request) {
diff --git a/pkg/alerting/receiver_test.go b/pkg/alerting/receiver_test.go
index 0798f2f7b..8509bba7e 100644
--- a/pkg/alerting/receiver_test.go
+++ b/pkg/alerting/receiver_test.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"net/http"
 	"net/http/httptest"
+	"strings"
 	"testing"
 	"time"
 
@@ -17,6 +18,25 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
+func Test_healthCheck(t *testing.T) {
+	r, err := NewReceiver(
+		ReceiverConfig{PurgeInterval: 1 * time.Hour},
+		log.NewNopLogger(),
+		prometheus.NewRegistry(),
+	)
+	require.NoError(t, err)
+
+	router := mux.NewRouter()
+	r.RegisterRoutes(router)
+
+	req := httptest.NewRequest(http.MethodGet, "/health", nil)
+	w := httptest.NewRecorder()
+
+	router.ServeHTTP(w, req)
+
+	require.Equal(t, http.StatusOK, w.Code)
+}
+
 func Test_measureLatency(t *testing.T) {
 	tc := []struct {
 		name    string
@@ -94,3 +114,100 @@ func Test_measureLatency(t *testing.T) {
 		})
 	}
 }
+
+func Test_measureLatencyCustomBuckets(t *testing.T) {
+	tc := []struct {
+		name    string
+		alerts  template.Data
+		err     error
+		tracked []float64
+	}{
+		{
+			name: "with alerts to track",
+			alerts: template.Data{
+				Alerts: template.Alerts{
+					template.Alert{
+						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
+						Annotations: template.KV{"time": "1.604069614e+09"},
+						Status:      string(model.AlertFiring),
+					},
+					template.Alert{
+						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
+						Annotations: template.KV{"time": "1.604069615e+09"},
+						Status:      string(model.AlertFiring),
+					},
+				},
+			},
+			tracked: []float64{1604069614.00, 1604069615.00},
+		},
+		{
+			name: "with alerts that don't have a time annotation or alertname label it ignores them",
+			alerts: template.Data{
+				Alerts: template.Alerts{
+					template.Alert{
+						Labels:      template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
+						Annotations: template.KV{"time": "1.604069614e+09"},
+						Status:      string(model.AlertFiring),
+					},
+					template.Alert{
+						Labels: template.KV{model.AlertNameLabel: "e2ealertingAlwaysFiring"},
+						Status: string(model.AlertFiring),
+					},
+					template.Alert{
+						Annotations: template.KV{"time": "1.604069614e+09"},
+						Status:      string(model.AlertFiring),
+					},
+				},
+			},
+			tracked: []float64{1604069614.00},
+		},
+	}
+
+	for _, tt := range tc {
+		t.Run(tt.name, func(t *testing.T) {
+			reg := prometheus.NewRegistry()
+			r, err := NewReceiver(
+				ReceiverConfig{
+					PurgeInterval:          1 * time.Hour,
+					CustomHistogramBuckets: []string{"0.5", "1", "10"},
+				},
+				log.NewNopLogger(),
+				reg,
+			)
+			require.NoError(t, err)
+
+			router := mux.NewRouter()
+			r.RegisterRoutes(router)
+
+			b, err := json.Marshal(tt.alerts)
+			require.NoError(t, err)
+
+			req := httptest.NewRequest(http.MethodPost, "/api/v1/receiver", bytes.NewBuffer(b))
+			w := httptest.NewRecorder()
+
+			router.ServeHTTP(w, req)
+
+			require.Equal(t, http.StatusOK, w.Code)
+			require.Equal(t, len(tt.tracked), len(r.timestamps))
+			for _, timestamp := range tt.tracked {
+				_, exists := r.timestamps[timestamp]
+				require.True(t, exists, fmt.Sprintf("time %f is not tracked", timestamp))
+			}
+
+			metrics, err := reg.Gather()
+			require.NoError(t, err)
+			for _, metricFamily := range metrics {
+				if strings.Contains(*metricFamily.Name, "end_to_end_duration_seconds") {
+					for _, metric := range metricFamily.GetMetric() {
+						require.Len(t, metric.Histogram.Bucket, 3)
+						bucketBounds := []float64{}
+						for _, bucket := range metric.Histogram.Bucket {
+							bucketBounds = append(bucketBounds, *bucket.UpperBound)
+						}
+						require.Equal(t, []float64{0.5, 1, 10}, bucketBounds)
+					}
+				}
+			}
+		})
+	}
+}
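
A minimal companion sketch (not part of the diff above) of how the new -receiver.custom-histogram-buckets flag could be exercised through ReceiverConfig.RegisterFlags. It assumes it lives in the same package as receiver_test.go, that the standard-library "flag" package is added to the test imports, and that flagext.StringSliceCSV has []string as its underlying type; the flag and field names come from the diff, everything else is illustrative.

// Hypothetical test, illustrative only: drives the new flag through
// RegisterFlags and checks that flagext.StringSliceCSV splits the
// comma-separated value into the []string that NewReceiver later parses
// with strconv.ParseFloat.
func Test_customHistogramBucketsFlag(t *testing.T) {
	var cfg ReceiverConfig
	fs := flag.NewFlagSet("e2ealerting", flag.ContinueOnError)
	cfg.RegisterFlags(fs)

	require.NoError(t, fs.Parse([]string{"-receiver.custom-histogram-buckets=0.5,1,10"}))
	require.Equal(t, []string{"0.5", "1", "10"}, []string(cfg.CustomHistogramBuckets))
}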