diff --git a/stats/opentelemetry/client_metrics.go b/stats/opentelemetry/client_metrics.go index 7422bebd4f6e..2f25991906b5 100644 --- a/stats/opentelemetry/client_metrics.go +++ b/stats/opentelemetry/client_metrics.go @@ -81,6 +81,7 @@ func getOrCreateCallInfo(ctx context.Context, cc *grpc.ClientConn, method string } ctx = setCallInfo(ctx, ci) } + ctx = setRetryCount(ctx, ci) return ctx, ci } diff --git a/stats/opentelemetry/client_tracing.go b/stats/opentelemetry/client_tracing.go index 868d6a2fc9c1..0377582c93cc 100644 --- a/stats/opentelemetry/client_tracing.go +++ b/stats/opentelemetry/client_tracing.go @@ -118,9 +118,15 @@ func (h *clientTracingHandler) TagConn(ctx context.Context, _ *stats.ConnTagInfo // HandleConn exists to satisfy stats.Handler for tracing. func (h *clientTracingHandler) HandleConn(context.Context, stats.ConnStats) {} +type retryCountKey struct{} + // TagRPC implements per RPC attempt context management for traces. func (h *clientTracingHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context { ctx, ai := getOrCreateRPCAttemptInfo(ctx) + if ci, ok := retryCount(ctx); ok { + ai.previousRPCAttempts = uint32(ci.previousRPCAttempts.Load()) + } + ai.ctx = ctx ctx, ai = h.traceTagRPC(ctx, ai, info.NameResolutionDelay) return setRPCInfo(ctx, &rpcInfo{ai: ai}) } diff --git a/stats/opentelemetry/e2e_test.go b/stats/opentelemetry/e2e_test.go index 4dbaadb2078e..ee65a71dadd0 100644 --- a/stats/opentelemetry/e2e_test.go +++ b/stats/opentelemetry/e2e_test.go @@ -22,6 +22,7 @@ import ( "io" "slices" "strconv" + "strings" "testing" "time" @@ -1677,31 +1678,58 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) { t.Fatalf("%s call failed: %v", tt.name, err) } - wantSpanInfo := traceSpanInfo{ + methodName := strings.TrimPrefix(tt.spanName, "Sent.") + var wantSpanInfos []traceSpanInfo + wantSpanInfos = append(wantSpanInfos, traceSpanInfo{ name: tt.spanName, spanKind: oteltrace.SpanKindClient.String(), events: []trace.Event{{Name: delayedResolutionEventName}}, + }) + for i := range 3 { + wantSpanInfos = append(wantSpanInfos, traceSpanInfo{ + name: "Attempt." + methodName, + spanKind: oteltrace.SpanKindInternal.String(), + attributes: []attribute.KeyValue{ + attribute.Int64("previous-rpc-attempts", int64(i)), + }, + }) } - spans, err := waitForTraceSpans(ctx, exporter, []traceSpanInfo{wantSpanInfo}) + + spans, err := waitForTraceSpans(ctx, exporter, wantSpanInfos) if err != nil { t.Fatal(err) } - verifyTrace(t, spans, wantSpanInfo) + for _, want := range wantSpanInfos { + verifyTrace(t, spans, want) + } }) } } func verifyTrace(t *testing.T, spans tracetest.SpanStubs, want traceSpanInfo) { + t.Helper() match := false for _, span := range spans { if span.Name == want.name && span.SpanKind.String() == want.spanKind { match = true - if diff := cmp.Diff(want.events, span.Events, cmpopts.IgnoreFields(trace.Event{}, "Time")); diff != "" { - t.Errorf("Span event mismatch for %q (kind: %s) (-want +got):\n%s", - want.name, want.spanKind, diff) + if len(want.events) > 0 { + if diff := cmp.Diff(want.events, span.Events, cmpopts.IgnoreFields(trace.Event{}, "Time")); diff != "" { + t.Errorf("Span event mismatch for %q (kind: %s) (-want +got):\n%s", + want.name, want.spanKind, diff) + } } break } + for _, wantAttr := range want.attributes { + for _, attr := range span.Attributes { + if attr.Key == "previous-rpc-attempts" && span.Name == want.name { + if attr.Value.AsInt64() != wantAttr.Value.AsInt64() { + t.Errorf("Span %q: attribute %s = %d; want %d", span.Name, attr.Key, attr.Value.AsInt64(), wantAttr.Value.AsInt64()) + } + break + } + } + } } if !match { t.Errorf("Expected span not found: %q (kind: %s)", want.name, want.spanKind) diff --git a/stats/opentelemetry/opentelemetry.go b/stats/opentelemetry/opentelemetry.go index cd01f86c4981..c29e6f27ce92 100644 --- a/stats/opentelemetry/opentelemetry.go +++ b/stats/opentelemetry/opentelemetry.go @@ -179,6 +179,8 @@ type callInfo struct { // nameResolutionEventAdded is set when the resolver delay trace event // is added. Prevents duplicate events, since it is reported per-attempt. nameResolutionEventAdded atomic.Bool + // previousRPCAttempts holds the count of non-transparent retry attempts. + previousRPCAttempts atomic.Int32 } type callInfoKey struct{} @@ -213,6 +215,15 @@ func getRPCInfo(ctx context.Context) *rpcInfo { return ri } +func setRetryCount(ctx context.Context, ci *callInfo) context.Context { + return context.WithValue(ctx, retryCountKey{}, ci) +} + +func retryCount(ctx context.Context) (*callInfo, bool) { + ci, ok := ctx.Value(retryCountKey{}).(*callInfo) + return ci, ok +} + func removeLeadingSlash(mn string) string { return strings.TrimLeft(mn, "/") } @@ -242,6 +253,7 @@ type attemptInfo struct { countSentMsg uint32 countRecvMsg uint32 previousRPCAttempts uint32 + ctx context.Context } type clientMetrics struct { diff --git a/stats/opentelemetry/server_tracing.go b/stats/opentelemetry/server_tracing.go index 0e2181bf114c..d87785082b8d 100644 --- a/stats/opentelemetry/server_tracing.go +++ b/stats/opentelemetry/server_tracing.go @@ -41,6 +41,7 @@ func (h *serverTracingHandler) initializeTraces() { // TagRPC implements per RPC attempt context management for traces. func (h *serverTracingHandler) TagRPC(ctx context.Context, _ *stats.RPCTagInfo) context.Context { ctx, ai := getOrCreateRPCAttemptInfo(ctx) + ai.ctx = ctx ctx, ai = h.traceTagRPC(ctx, ai) return setRPCInfo(ctx, &rpcInfo{ai: ai}) } diff --git a/stats/opentelemetry/trace.go b/stats/opentelemetry/trace.go index efafdd0756eb..1007a1c2152d 100644 --- a/stats/opentelemetry/trace.go +++ b/stats/opentelemetry/trace.go @@ -17,8 +17,6 @@ package opentelemetry import ( - "sync/atomic" - "go.opentelemetry.io/otel/attribute" otelcodes "go.opentelemetry.io/otel/codes" "go.opentelemetry.io/otel/trace" @@ -50,8 +48,14 @@ func populateSpan(rs stats.RPCStats, ai *attemptInfo) { attribute.Int64("previous-rpc-attempts", int64(ai.previousRPCAttempts)), attribute.Bool("transparent-retry", rs.IsTransparentRetryAttempt), ) - // increment previous rpc attempts applicable for next attempt - atomic.AddUint32(&ai.previousRPCAttempts, 1) + // Increment retry count for the next attempt if not a transparent + // retry. + if !rs.IsTransparentRetryAttempt { + if ci, ok := retryCount(ai.ctx); ok { + ci.previousRPCAttempts.Add(1) + ai.ctx = setRetryCount(ai.ctx, ci) + } + } case *stats.PickerUpdated: span.AddEvent("Delayed LB pick complete") case *stats.InPayload: