Skip to content

Commit 1f6e734

Browse files
author
healthy-pod
committed
upgrade: retry errors when dialing instances
Release note: None. Epic: none. Closes #108860.
1 parent ad4e53f commit 1f6e734

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

pkg/upgrade/upgradecluster/tenant_cluster.go

Lines changed: 14 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ import (
2929
"github.com/cockroachdb/cockroach/pkg/util/retry"
3030
"github.com/cockroachdb/errors"
3131
"github.com/cockroachdb/redact"
32+
"google.golang.org/grpc"
3233
)
3334

3435
// TenantCluster represents the set of sql nodes running in a secondary tenant.
@@ -228,8 +229,19 @@ func (t *TenantCluster) ForEveryNodeOrServer(
228229
grp.GoCtx(func(ctx context.Context) error {
229230
defer alloc.Release()
230231

231-
conn, err := t.Dialer.Dial(ctx, roachpb.NodeID(instance.InstanceID), rpc.DefaultClass)
232-
if err != nil {
232+
var conn *grpc.ClientConn
233+
retryOpts := retry.Options{
234+
InitialBackoff: 0,
235+
MaxRetries: 2,
236+
MaxBackoff: 10 * time.Millisecond,
237+
}
238+
// This retry was added to benefit our tests (not users) by reducing the chance of
239+
// test flakes due to network issues.
240+
if err := retry.WithMaxAttempts(ctx, retryOpts, retryOpts.MaxRetries+1, func() error {
241+
var err error
242+
conn, err = t.Dialer.Dial(ctx, roachpb.NodeID(instance.InstanceID), rpc.DefaultClass)
243+
return err
244+
}); err != nil {
233245
if errors.HasType(err, (*netutil.InitialHeartbeatFailedError)(nil)) {
234246
if errors.Is(err, rpc.VersionCompatError) {
235247
return errors.WithHint(errors.Newf("upgrade failed due to active SQL servers with incompatible binary version(s)"),

0 commit comments

Comments
 (0)