Skip to content

stats/opentelemetry: record retry attempts from clientStream #8342

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 49 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
3ba457e
Fixed retry attempts in HandleRPC
vinothkumarr227 May 19, 2025
5d19779
Fixed the review changes
vinothkumarr227 May 20, 2025
4245950
Fixed vet issues
vinothkumarr227 May 20, 2025
5347db1
Fixed the review changes
vinothkumarr227 May 21, 2025
99e88d8
Fixed the review changes
vinothkumarr227 May 22, 2025
586cf63
Fixed the review changes
vinothkumarr227 May 26, 2025
1bdad7e
Fixed the test cases
vinothkumarr227 May 26, 2025
39c5f0d
Fixed the review changes
vinothkumarr227 May 30, 2025
11523b6
small tweaks
vinothkumarr227 May 30, 2025
3720f4e
Fixed the review changes
vinothkumarr227 Jun 3, 2025
b97a2da
Fixed the test cases
vinothkumarr227 Jun 3, 2025
a0ef86c
Fixed the test cases pick issues
vinothkumarr227 Jun 3, 2025
ac79ad2
Fixed the event ignore issues
vinothkumarr227 Jun 4, 2025
10c6a90
Fixed the picker event issues
vinothkumarr227 Jun 4, 2025
1048040
Fixed the test cases
vinothkumarr227 Jun 4, 2025
05e2cc8
Fixed the review changes
vinothkumarr227 Jun 6, 2025
ba08688
small tweaks
vinothkumarr227 Jun 6, 2025
fe1831f
Fixed the review changes
vinothkumarr227 Jun 12, 2025
4f76f19
Fixed the review changes
vinothkumarr227 Jun 16, 2025
65da5d8
Merge remote-tracking branch 'origin/master' into stats-retry-attempt…
vinothkumarr227 Jun 20, 2025
d489c91
small tweaks
vinothkumarr227 Jun 20, 2025
8de4d5e
small tweaks
vinothkumarr227 Jun 25, 2025
1654ba1
Fixed the review changes
vinothkumarr227 Jul 2, 2025
06f350c
Fixed the server trace issues
vinothkumarr227 Jul 2, 2025
b944353
Fixed the review changes
vinothkumarr227 Jul 3, 2025
daef268
Fixed the review changes
vinothkumarr227 Jul 8, 2025
33f89a5
Fixed the review changes
vinothkumarr227 Jul 9, 2025
2b6ff10
Fixed the sort issues
vinothkumarr227 Jul 11, 2025
08b5e7c
Fixed the vet issue
vinothkumarr227 Jul 11, 2025
e84407b
Fixed the issues
vinothkumarr227 Jul 11, 2025
fcb1279
Fixed the LB pick test issues
vinothkumarr227 Jul 16, 2025
c6a254b
small tweaks
vinothkumarr227 Jul 16, 2025
a3da45c
Fixed the issues
vinothkumarr227 Jul 16, 2025
41a5eb4
Fixed the LB Pick issues
vinothkumarr227 Jul 16, 2025
7db3821
Fixed the test
vinothkumarr227 Jul 16, 2025
6fc9f84
small tweaks
vinothkumarr227 Jul 16, 2025
eec064f
Fixed the LB pick issues
vinothkumarr227 Jul 16, 2025
99a1b6e
Fixed duplice Lb pick event
vinothkumarr227 Jul 17, 2025
bb239e3
Fixed the test issues
vinothkumarr227 Jul 17, 2025
72ff228
Fixed the test issues
vinothkumarr227 Jul 22, 2025
4144b46
small tweaks
vinothkumarr227 Jul 22, 2025
2bec1a5
Fixed the review changes
vinothkumarr227 Jul 28, 2025
abf0f8b
small tweaks
vinothkumarr227 Jul 28, 2025
8bc283f
Fixed the review changes
vinothkumarr227 Jul 29, 2025
9e2647d
small tweaks
vinothkumarr227 Jul 29, 2025
c9c6f4c
small tweaks
vinothkumarr227 Jul 30, 2025
919caa0
Merge remote-tracking branch 'origin/master' into stats-retry-attempt…
vinothkumarr227 Jul 30, 2025
2141cd5
Merge remote-tracking branch 'origin/master' into stats-retry-attempt…
vinothkumarr227 Aug 1, 2025
9b13b21
Fixed the review changes
vinothkumarr227 Aug 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions stats/opentelemetry/client_tracing.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ func (h *clientTracingHandler) initializeTraces() {
}

func (h *clientTracingHandler) unaryInterceptor(ctx context.Context, method string, req, reply any, cc *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
ci := &callInfo{numRetries: 0}
ctx = setRetryCount(ctx, ci)
ctx, _ = getOrCreateCallInfo(ctx, cc, method, opts...)

var span trace.Span
Expand All @@ -58,6 +60,8 @@ func (h *clientTracingHandler) unaryInterceptor(ctx context.Context, method stri
}

func (h *clientTracingHandler) streamInterceptor(ctx context.Context, desc *grpc.StreamDesc, cc *grpc.ClientConn, method string, streamer grpc.Streamer, opts ...grpc.CallOption) (grpc.ClientStream, error) {
ci := &callInfo{numRetries: 0}
ctx = setRetryCount(ctx, ci)
ctx, _ = getOrCreateCallInfo(ctx, cc, method, opts...)

var span trace.Span
Expand Down Expand Up @@ -124,14 +128,9 @@ type retryCountKey struct{}
// TagRPC implements per RPC attempt context management for traces.
func (h *clientTracingHandler) TagRPC(ctx context.Context, info *stats.RPCTagInfo) context.Context {
ctx, ai := getOrCreateRPCAttemptInfo(ctx)
var counter *int32
if val := ctx.Value(retryCountKey{}); val != nil {
counter = val.(*int32)
} else {
counter = new(int32)
ctx = context.WithValue(ctx, retryCountKey{}, counter)
if ci, ok := getRetryCount(ctx); ok {
ai.previousRPCAttempts = uint32(atomic.LoadInt32(&ci.numRetries))
}
ai.previousRPCAttempts = uint32(atomic.LoadInt32(counter))
ai.ctx = ctx
ctx, ai = h.traceTagRPC(ctx, ai, info.NameResolutionDelay)
return setRPCInfo(ctx, &rpcInfo{ai: ai})
Expand Down
42 changes: 35 additions & 7 deletions stats/opentelemetry/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"io"
"slices"
"strconv"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -928,7 +929,7 @@ func (s) TestMetricsAndTracesOptionEnabled(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1021,7 +1022,7 @@ func (s) TestMetricsAndTracesOptionEnabled(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1144,7 +1145,7 @@ func (s) TestSpan(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1229,7 +1230,7 @@ func (s) TestSpan(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1354,7 +1355,7 @@ func (s) TestSpan_WithW3CContextPropagator(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1439,7 +1440,7 @@ func (s) TestSpan_WithW3CContextPropagator(t *testing.T) {
},
{
Key: "previous-rpc-attempts",
Value: attribute.IntValue(1),
Value: attribute.IntValue(0),
},
{
Key: "transparent-retry",
Expand Down Expand Up @@ -1687,6 +1688,7 @@ func (s) TestTraceSpan_WithRetriesAndNameResolutionDelay(t *testing.T) {
t.Fatal(err)
}
verifyTrace(t, spans, wantSpanInfo)
verifyPreviousRPCAttempts(t, spans)
})
}
}
Expand All @@ -1708,6 +1710,32 @@ func verifyTrace(t *testing.T, spans tracetest.SpanStubs, want traceSpanInfo) {
}
}

func verifyPreviousRPCAttempts(t *testing.T, spans tracetest.SpanStubs) {
t.Helper()
const maxAttempts = 3
foundAttempts := make(map[int]bool)
observedSpans := make(map[int][]string)

for _, span := range spans {
if !strings.HasPrefix(span.Name, "Attempt.") {
continue
}
for _, attr := range span.Attributes {
if attr.Key == "previous-rpc-attempts" {
val := int(attr.Value.AsInt64())
foundAttempts[val] = true
observedSpans[val] = append(observedSpans[val], span.Name)
}
}
}

for i := range maxAttempts {
if !foundAttempts[i] {
t.Errorf("Missing span for retry attempt #%d (expected previous-rpc-attempts = %d)", i+1, i)
}
}
}

// TestStreamingRPC_TraceSequenceNumbers verifies that sequence numbers
// are incremented correctly for multiple messages sent and received
// during a streaming RPC.
Expand Down Expand Up @@ -1777,7 +1805,7 @@ func (s) TestStreamingRPC_TraceSequenceNumbers(t *testing.T) {
attributes: []attribute.KeyValue{
attribute.Bool("Client", true),
attribute.Bool("FailFast", true),
attribute.Int("previous-rpc-attempts", 1),
attribute.Int("previous-rpc-attempts", 0),
attribute.Bool("transparent-retry", false),
},
},
Expand Down
12 changes: 12 additions & 0 deletions stats/opentelemetry/opentelemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ type callInfo struct {
// nameResolutionEventAdded is set when the resolver delay trace event
// is added. Prevents duplicate events, since it is reported per-attempt.
nameResolutionEventAdded atomic.Bool
// numRetries holds the count of non-transparent retry attempts.
numRetries int32
}

type callInfoKey struct{}
Expand Down Expand Up @@ -213,6 +215,16 @@ func getRPCInfo(ctx context.Context) *rpcInfo {
return ri
}

func setRetryCount(ctx context.Context, ci *callInfo) context.Context {
return context.WithValue(ctx, retryCountKey{}, ci)
}

// getRetryCount retrieves the retry count tracking struct from the context.
func getRetryCount(ctx context.Context) (*callInfo, bool) {
ci, ok := ctx.Value(retryCountKey{}).(*callInfo)
return ci, ok
}

func removeLeadingSlash(mn string) string {
return strings.TrimLeft(mn, "/")
}
Expand Down
18 changes: 8 additions & 10 deletions stats/opentelemetry/trace.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,24 +41,22 @@ func populateSpan(rs stats.RPCStats, ai *attemptInfo) {

switch rs := rs.(type) {
case *stats.Begin:
retryCount := ai.previousRPCAttempts
if !rs.IsTransparentRetryAttempt {
if val := ai.ctx.Value(retryCountKey{}); val != nil {
// Atomic increment and get new value
retryCount = uint32(atomic.AddInt32(val.(*int32), 1))
}
}
// Note: Go always added Client and FailFast attributes even though they are not
// defined by the OpenCensus gRPC spec. Thus, they are unimportant for
// correctness.
span.SetAttributes(
attribute.Bool("Client", rs.Client),
attribute.Bool("FailFast", rs.FailFast),
attribute.Int64("previous-rpc-attempts", int64(retryCount)),
attribute.Int64("previous-rpc-attempts", int64(ai.previousRPCAttempts)),
attribute.Bool("transparent-retry", rs.IsTransparentRetryAttempt),
)
// increment previous rpc attempts applicable for next attempt
atomic.AddUint32(&ai.previousRPCAttempts, 1)
// Increment retry count for the next attempt if not a transparent
// retry.
if !rs.IsTransparentRetryAttempt {
if ci, ok := getRetryCount(ai.ctx); ok {
atomic.AddInt32(&ci.numRetries, 1)
}
}
case *stats.PickerUpdated:
span.AddEvent("Delayed LB pick complete")
case *stats.InPayload:
Expand Down
Loading