feat: implement simple Fetch by key for cache items (#4032)
* implement simple cache GET

* make better errors and add cache to tempo local

* improve memcache logs

* delete unused method

* changelog

* use new method in other places

* use otel tracing

* log missed and hit keys

* remove batched requests

* remove span loggers from memcached
javiermolinar committed Sep 10, 2024
1 parent 4013355 commit e3f8096
Showing 16 changed files with 232 additions and 222 deletions.
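
The heart of the change is a new single-key lookup on the cache interface (see `pkg/cache/cache.go` below). As a minimal sketch of the before/after call pattern, written only against the two signatures visible in this diff — the helper names and package here are illustrative, not Tempo code:

```go
package cacheexample

import "context"

// keyCache mirrors the two fetch methods of Tempo's pkg/cache Cache interface
// as they appear after this commit (see pkg/cache/cache.go below).
type keyCache interface {
	Fetch(ctx context.Context, keys []string) (found []string, bufs [][]byte, missing []string)
	FetchKey(ctx context.Context, key string) (buf []byte, found bool)
}

// lookupBefore shows the old pattern: a single key still went through the
// batch API and the caller had to unpack slices.
func lookupBefore(ctx context.Context, c keyCache, key string) ([]byte, bool) {
	_, bufs, _ := c.Fetch(ctx, []string{key})
	if len(bufs) != 1 {
		return nil, false
	}
	return bufs[0], true
}

// lookupAfter shows the new pattern: one key in, one buffer and a hit flag out.
func lookupAfter(ctx context.Context, c keyCache, key string) ([]byte, bool) {
	return c.FetchKey(ctx, key)
}
```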
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@
 **BREAKING CHANGE** Removed `querier_forget_delay` setting from the frontend. This configuration option did nothing.
 * [ENHANCEMENT] Update metrics-generator config in Tempo distributed docker compose example to serve TraceQL metrics [#4003](https://github.com/grafana/tempo/pull/4003) (@javiermolinar)
 * [ENHANCEMENT] Reduce allocs related to marshalling dedicated columns repeatedly in the query frontend. [#4007](https://github.com/grafana/tempo/pull/4007) (@joe-elliott)
+* [ENHANCEMENT] Implement simple Fetch by key for cache items [#4032](https://github.com/grafana/tempo/pull/4032) (@javiermolinar)
 * [ENHANCEMENT] Replace Grafana Agent example by Grafana Alloy [#4030](https://github.com/grafana/tempo/pull/4030) (@javiermolinar)
 * [ENHANCEMENT] Support exporting internal Tempo traces via OTLP exporter when `use_otel_tracer` is enabled. Use the OpenTelemetry SDK environment variables to configure the span exporter. [#4028](https://github.com/grafana/tempo/pull/4028) (@andreasgerstmayr)
 * [ENHANCEMENT] TraceQL metrics queries: add min_over_time [#3975](https://github.com/grafana/tempo/pull/3975) (@javiermolinar)
12 changes: 11 additions & 1 deletion example/docker-compose/local/docker-compose.yaml
@@ -12,11 +12,20 @@ services:
     volumes:
       - ./tempo-data:/var/tempo
 
+  memcached:
+    image: memcached:1.6.29
+    container_name: memcached
+    ports:
+      - "11211:11211"
+    environment:
+      - MEMCACHED_MAX_MEMORY=64m  # Set the maximum memory usage
+      - MEMCACHED_THREADS=4       # Number of threads to use
+
   tempo:
     image: *tempoImage
     command: [ "-config.file=/etc/tempo.yaml" ]
     volumes:
-      - ../shared/tempo.yaml:/etc/tempo.yaml
+      - ./tempo.yaml:/etc/tempo.yaml
       - ./tempo-data:/var/tempo
     ports:
       - "14268:14268"  # jaeger ingest
@@ -27,6 +36,7 @@ services:
       - "9411:9411"    # zipkin
     depends_on:
       - init
+      - memcached
 
   k6-tracing:
     image: ghcr.io/grafana/xk6-client-tracing:v0.0.5
70 changes: 70 additions & 0 deletions example/docker-compose/local/tempo.yaml
@@ -0,0 +1,70 @@
stream_over_http_enabled: true
server:
  http_listen_port: 3200
  log_level: info

cache:
  background:
    writeback_goroutines: 5
  caches:
    - roles:
        - frontend-search
      memcached:
        host: localhost:11211

query_frontend:
  search:
    duration_slo: 5s
    throughput_bytes_slo: 1.073741824e+09
  trace_by_id:
    duration_slo: 5s

distributor:
  receivers:            # this configuration will listen on all ports and protocols that tempo is capable of.
    jaeger:             # the receivers all come from the OpenTelemetry collector. more configuration information can
      protocols:        # be found there: https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver
        thrift_http:    #
        grpc:           # for a production deployment you should only enable the receivers you need!
        thrift_binary:
        thrift_compact:
    zipkin:
    otlp:
      protocols:
        http:
        grpc:
    opencensus:

ingester:
  max_block_duration: 5m           # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally

compactor:
  compaction:
    block_retention: 1h            # overall Tempo trace retention. set for demo purposes

metrics_generator:
  registry:
    external_labels:
      source: tempo
      cluster: docker-compose
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces

storage:
  trace:
    backend: local                 # backend configuration to use
    wal:
      path: /var/tempo/wal         # where to store the wal locally
    local:
      path: /var/tempo/blocks

overrides:
  defaults:
    metrics_generator:
      processors: [service-graphs, span-metrics, local-blocks] # enables metrics generator
      generate_native_histograms: both
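
The config above points the frontend-search cache at the memcached instance added to the compose file. One quick way to confirm memcached is reachable is a small Go smoke test against the published port using the bradfitz/gomemcache client; the address comes from the compose port mapping above, and the test key is made up purely for illustration:

```go
package main

import (
	"fmt"
	"log"

	"github.com/bradfitz/gomemcache/memcache"
)

func main() {
	// 127.0.0.1:11211 assumes the compose stack is running and memcached's
	// port is published as shown in docker-compose.yaml above.
	mc := memcache.New("127.0.0.1:11211")

	// Round-trip a throwaway item to prove the server is answering.
	if err := mc.Set(&memcache.Item{Key: "tempo-smoke-test", Value: []byte("ok")}); err != nil {
		log.Fatalf("set failed: %v", err)
	}
	item, err := mc.Get("tempo-smoke-test")
	if err != nil {
		log.Fatalf("get failed: %v", err)
	}
	fmt.Printf("memcached is up, got back %q\n", item.Value)
}
```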
4 changes: 1 addition & 3 deletions modules/cache/memcached/memcached.go
@@ -28,9 +28,7 @@ func NewClient(cfg *Config, cfgBackground *cache.BackgroundConfig, name string,
 
 	client := cache.NewMemcachedClient(cfg.ClientConfig, name, prometheus.DefaultRegisterer, logger)
 	memcachedCfg := cache.MemcachedConfig{
-		Expiration:  cfg.TTL,
-		BatchSize:   0, // we are currently only requesting one key at a time, which is bad. we could restructure Find() to batch request all blooms at once
-		Parallelism: 0,
+		Expiration: cfg.TTL,
 	}
 	c := cache.NewMemcached(memcachedCfg, client, name, cfg.ClientConfig.MaxItemSize, prometheus.DefaultRegisterer, logger)
 
31 changes: 5 additions & 26 deletions modules/frontend/pipeline/sync_handler_cache.go
@@ -9,8 +9,6 @@ import (
 
 	"github.com/go-kit/log"
 	"github.com/go-kit/log/level"
-	"github.com/gogo/protobuf/jsonpb"
-	"github.com/gogo/protobuf/proto"
 	"github.com/grafana/tempo/pkg/api"
 	"github.com/grafana/tempo/pkg/cache"
 )
@@ -39,7 +37,7 @@ func (c cachingWare) RoundTrip(req Request) (*http.Response, error) {
 	// extract cache key
 	key := req.CacheKey()
 	if len(key) > 0 {
-		body := c.cache.fetchBytes(key)
+		body := c.cache.fetchBytes(req.Context(), key)
 		if len(body) > 0 {
 			resp := &http.Response{
 				Header: http.Header{},
@@ -145,26 +143,7 @@ func (c *frontendCache) store(ctx context.Context, key string, buffer []byte) {
 }
 
 // fetch fetches the response body from the cache. the caller assumes the responsibility of closing the response body.
-func (c *frontendCache) fetch(key string, pb proto.Message) bool {
-	if c.c == nil {
-		return false
-	}
-
-	if len(key) == 0 {
-		return false
-	}
-
-	_, bufs, _ := c.c.Fetch(context.Background(), []string{key})
-	if len(bufs) != 1 {
-		return false
-	}
-
-	err := (&jsonpb.Unmarshaler{AllowUnknownFields: true}).Unmarshal(bytes.NewReader(bufs[0]), pb)
-	return err == nil
-}
-
-// fetch fetches the response body from the cache. the caller assumes the responsibility of closing the response body.
-func (c *frontendCache) fetchBytes(key string) []byte {
+func (c *frontendCache) fetchBytes(ctx context.Context, key string) []byte {
 	if c.c == nil {
 		return nil
 	}
@@ -173,10 +152,10 @@ func (c *frontendCache) fetchBytes(key string) []byte {
 		return nil
 	}
 
-	_, bufs, _ := c.c.Fetch(context.Background(), []string{key})
-	if len(bufs) != 1 {
+	buf, found := c.c.FetchKey(ctx, key)
+	if !found {
 		return nil
 	}
 
-	return bufs[0]
+	return buf
 }
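
For orientation, the read path of the caching middleware after this change looks roughly like the sketch below: the lookup now receives the request context (so it participates in the request's trace), and a hit is turned into a synthetic response. This is a simplified illustration rather than the actual `cachingWare` code; the status code and body wiring are assumptions based on the partial hunk above:

```go
package cacheexample

import (
	"bytes"
	"context"
	"io"
	"net/http"
)

// fetchFn stands in for frontendCache.fetchBytes: a single-key lookup that
// returns an empty slice on a miss.
type fetchFn func(ctx context.Context, key string) []byte

// serveFromCache checks the cache for the request's key and, on a hit,
// synthesizes an HTTP response from the cached body.
func serveFromCache(ctx context.Context, key string, fetch fetchFn) (*http.Response, bool) {
	if len(key) == 0 {
		return nil, false
	}
	body := fetch(ctx, key)
	if len(body) == 0 {
		return nil, false
	}
	return &http.Response{
		StatusCode: http.StatusOK,
		Header:     http.Header{},
		Body:       io.NopCloser(bytes.NewReader(body)),
	}, true
}
```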
5 changes: 3 additions & 2 deletions modules/frontend/pipeline/sync_handler_cache_test.go
@@ -39,8 +39,9 @@ func TestCacheCaches(t *testing.T) {
 	c.store(context.Background(), testKey, testData)
 
 	actual := &tempopb.SearchTagsResponse{}
-	found := c.fetch(testKey, actual)
+	buffer := c.fetchBytes(context.Background(), testKey)
+	err = (&jsonpb.Unmarshaler{AllowUnknownFields: true}).Unmarshal(bytes.NewReader(buffer), actual)
 
-	require.True(t, found)
+	require.NoError(t, err)
 	require.Equal(t, expected, actual)
 }
11 changes: 11 additions & 0 deletions pkg/cache/cache.go
Expand Up @@ -2,7 +2,9 @@ package cache

import (
"context"
"time"

instr "github.com/grafana/dskit/instrument"
"github.com/grafana/dskit/services"
)

Expand Down Expand Up @@ -41,5 +43,14 @@ type Cache interface {
// TODO: both cached backend clients support deletion. Should we implement?
// Remove(ctx context.Context, key []string)
Fetch(ctx context.Context, keys []string) (found []string, bufs [][]byte, missing []string)
FetchKey(ctx context.Context, key string) (buf []byte, found bool)
Stop()
}

func measureRequest(ctx context.Context, method string, col instr.Collector, toStatusCode func(error) string, f func(context.Context) error) error {
start := time.Now()
col.Before(ctx, method, start)
err := f(ctx)
col.After(ctx, method, toStatusCode(err), start)
return err
}
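
`measureRequest` is a small timing wrapper around a cache call; the `instr.Collector` comes from dskit's instrument package. Below is a sketch of how a backend's `FetchKey` might use it to time a lookup and label hits and misses. The collector interface is narrowed to the two methods the helper actually calls, and the lookup function, method name, and status mapping are illustrative assumptions, not Tempo code:

```go
package cacheexample

import (
	"context"
	"errors"
	"time"
)

// collector mirrors the two instrument.Collector methods measureRequest uses.
type collector interface {
	Before(ctx context.Context, method string, start time.Time)
	After(ctx context.Context, method, statusCode string, start time.Time)
}

// measureRequest times f and reports it to the collector, as in the diff above.
func measureRequest(ctx context.Context, method string, col collector, toStatusCode func(error) string, f func(context.Context) error) error {
	start := time.Now()
	col.Before(ctx, method, start)
	err := f(ctx)
	col.After(ctx, method, toStatusCode(err), start)
	return err
}

var errNotFound = errors.New("cache miss") // hypothetical sentinel so misses get their own status label

// fetchKeyTimed sketches a FetchKey implementation that records how long the
// lookup took and whether it hit; lookup is any single-key backend read.
func fetchKeyTimed(ctx context.Context, col collector, lookup func(context.Context, string) ([]byte, bool), key string) ([]byte, bool) {
	var buf []byte
	var found bool
	_ = measureRequest(ctx, "Cache.FetchKey", col, func(err error) string {
		if errors.Is(err, errNotFound) {
			return "miss"
		}
		if err != nil {
			return "error"
		}
		return "hit"
	}, func(ctx context.Context) error {
		buf, found = lookup(ctx, key)
		if !found {
			return errNotFound
		}
		return nil
	})
	return buf, found
}
```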