-
Notifications
You must be signed in to change notification settings - Fork 510
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add ability to detect virtual nodes in the servicegraph processor #2365
Changes from all commits
2f8e339
9a6aea8
480f993
d2ad922
f3290bf
fc6ecea
5f536fb
5777d50
1cba0dd
8ea1289
58847d2
68ac002
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,6 +5,7 @@ import ( | |
"encoding/hex" | ||
"errors" | ||
"fmt" | ||
"strings" | ||
"time" | ||
|
||
"github.com/go-kit/log" | ||
|
@@ -13,6 +14,8 @@ import ( | |
"github.com/prometheus/client_golang/prometheus" | ||
"github.com/prometheus/client_golang/prometheus/promauto" | ||
"github.com/prometheus/prometheus/util/strutil" | ||
"go.opentelemetry.io/otel/attribute" | ||
semconv "go.opentelemetry.io/otel/semconv/v1.18.0" | ||
|
||
gen "github.com/grafana/tempo/modules/generator/processor" | ||
"github.com/grafana/tempo/modules/generator/processor/servicegraphs/store" | ||
|
@@ -49,6 +52,10 @@ const ( | |
metricRequestClientSeconds = "traces_service_graph_request_client_seconds" | ||
) | ||
|
||
// defaultPeerAttributes lists span attribute keys that can identify the remote
// peer of a call (service name, host name, socket name/address, URL, target).
// NOTE(review): presumably used as the default for Cfg.PeerAttributes, in which
// case the order is the lookup priority (first match wins) — confirm against
// the config defaults.
var defaultPeerAttributes = []attribute.Key{ | |
semconv.PeerServiceKey, semconv.NetPeerNameKey, semconv.NetSockPeerNameKey, semconv.RPCServiceKey, semconv.NetSockPeerAddrKey, semconv.HTTPURLKey, semconv.HTTPTargetKey, | ||
} | ||
|
||
type tooManySpansError struct { | ||
droppedSpans int | ||
} | ||
|
@@ -169,6 +176,7 @@ func (p *Processor) consume(resourceSpans []*v1_trace.ResourceSpans) (err error) | |
e.Failed = e.Failed || p.spanFailed(span) | ||
p.upsertDimensions(e.Dimensions, rs.Resource.Attributes, span.Attributes) | ||
e.SpanMultiplier = spanMultiplier | ||
p.upsertPeerNode(e, span.Attributes) | ||
|
||
// A database request will only have one span, we don't wait for the server | ||
// span but just copy details from the client span | ||
|
@@ -193,6 +201,7 @@ func (p *Processor) consume(resourceSpans []*v1_trace.ResourceSpans) (err error) | |
e.Failed = e.Failed || p.spanFailed(span) | ||
p.upsertDimensions(e.Dimensions, rs.Resource.Attributes, span.Attributes) | ||
e.SpanMultiplier = spanMultiplier | ||
p.upsertPeerNode(e, span.Attributes) | ||
}) | ||
default: | ||
// this span is not part of an edge | ||
|
@@ -234,6 +243,15 @@ func (p *Processor) upsertDimensions(m map[string]string, resourceAttr []*v1_com | |
} | ||
} | ||
|
||
func (p *Processor) upsertPeerNode(e *store.Edge, spanAttr []*v1_common.KeyValue) { | ||
for _, peerKey := range p.Cfg.PeerAttributes { | ||
if v, ok := processor_util.FindAttributeValue(peerKey, spanAttr); ok { | ||
e.PeerNode = v | ||
return | ||
} | ||
} | ||
} | ||
|
||
// Shutdown closes p.closeCh. The context argument is ignored.
// NOTE(review): presumably closing the channel signals background work to
// stop; calling Shutdown twice would close an already-closed channel — confirm
// callers invoke it only once.
func (p *Processor) Shutdown(_ context.Context) { | |
close(p.closeCh) | ||
} | ||
|
@@ -259,6 +277,25 @@ func (p *Processor) onComplete(e *store.Edge) { | |
|
||
func (p *Processor) onExpire(e *store.Edge) { | ||
p.metricExpiredEdges.Inc() | ||
|
||
// If an edge is expired, we check if there are signs that the missing span belongs to a "virtual node". | ||
// These are nodes that are outside the user's reach (eg. an external service for payment processing), | ||
// or that are not instrumented (eg. a frontend application). | ||
e.ConnectionType = store.VirtualNode | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this logic is detecting an unmatched edge and registering the virtual node if conditions are right. Could we add a comment? Should There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added comments to clarify what this is doing.
Doesn't really matter, as if it doesn't match one of the two conditions to be considered a virtual node, it won't be collected. |
||
if len(e.ClientService) == 0 { | ||
// If the client service is not set, it means that the span could have been initiated by an external system, | ||
// like a frontend application or an engineer via `curl`. | ||
// We check if the span we have is the root span, and if so, we set the client service to "user". | ||
if _, parentSpan := parseKey(e.Key()); len(parentSpan) == 0 { | ||
e.ClientService = "user" | ||
p.onComplete(e) | ||
} | ||
} else if len(e.ServerService) == 0 && len(e.PeerNode) > 0 { | ||
// If client span does not have its matching server span, but has a peer attribute present, | ||
// we make the assumption that a call was made to an external service, for which Tempo won't receive spans. | ||
e.ServerService = e.PeerNode | ||
p.onComplete(e) | ||
} | ||
} | ||
|
||
func (p *Processor) spanFailed(span *v1_trace.Span) bool { | ||
|
@@ -272,3 +309,8 @@ func spanDurationSec(span *v1_trace.Span) float64 { | |
// buildKey joins the two key components into a single edge key of the form
// "<k1>-<k2>".
func buildKey(k1, k2 string) string {
	const keyFormat = "%s-%s"
	return fmt.Sprintf(keyFormat, k1, k2)
}
|
||
// parseKey splits an edge key of the form "<k1>-<k2>" (as produced by
// buildKey) into its two components.
//
// The key is split on the first "-": the part before it is returned as the
// first value and everything after it as the second. If the key contains no
// "-", the whole key is returned as the first value and the second value is
// empty, so a malformed key can never cause an index-out-of-range panic.
func parseKey(key string) (string, string) {
	k1, k2, _ := strings.Cut(key, "-")
	return k1, k2
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at this logic - is it correct to say that the order of `PeerAttributes` is their priority? If `net.sock.peer.addr` exists it will always be used first. If so, let's add a note to the documentation snippet in `service_graphs.md`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the order of peerAttributes is the priority, I would suggest that names be preferred over addresses in the default list. Also, does the code look at `peer.service` anywhere? There is an OTel way to map host name or IP address to peer.service: https://opentelemetry.io/docs/instrumentation/java/automatic/agent-config/#peer-service-name and see https://opentelemetry.io/docs/reference/specification/trace/semantic_conventions/span-general/#general-remote-service-attributes as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice suggestions. Applied both.