Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Retry registration timeout fix #6136

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
377a8c1
fixed timeouts
ConvallariaMaj Jul 15, 2024
d122094
fix tests + make linter happy
ConvallariaMaj Jul 15, 2024
fd3414e
revert config structure
ConvallariaMaj Jul 15, 2024
e3d6d8a
revert config structure2
ConvallariaMaj Jul 15, 2024
94383d7
fix tests
ConvallariaMaj Jul 15, 2024
4fad1a0
make linter happy
ConvallariaMaj Jul 15, 2024
9ff5958
fix review issue
ConvallariaMaj Jul 15, 2024
4046cc5
fix review issue + revert naming
ConvallariaMaj Jul 16, 2024
ab0b71e
removed overengineering
ConvallariaMaj Jul 17, 2024
c969205
removed overengineering2
ConvallariaMaj Jul 17, 2024
9435aee
removed overengineering2
ConvallariaMaj Jul 17, 2024
d515791
reverted timeout to Proof method, fixed some tests
ConvallariaMaj Jul 17, 2024
50a3e0e
added PositioningATXSelectionTimeout to all envs
ConvallariaMaj Jul 17, 2024
4d067c6
fix test and timeout
ConvallariaMaj Jul 17, 2024
fcd2ab1
revert unnecessary changes
ConvallariaMaj Jul 17, 2024
eb4e623
added PositioningATXSelectionTimeout to test env
ConvallariaMaj Jul 17, 2024
bfe0c34
fix flag
ConvallariaMaj Jul 17, 2024
bc743d2
added changelog
ConvallariaMaj Jul 18, 2024
0d30ee1
unlimited retry
ConvallariaMaj Jul 27, 2024
d9b0513
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Jul 28, 2024
c5be14d
fix issues
ConvallariaMaj Aug 1, 2024
2c53d3e
review fixes
ConvallariaMaj Aug 7, 2024
50e3129
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 7, 2024
8e729ea
minor fixes
ConvallariaMaj Aug 12, 2024
cd14fc0
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 12, 2024
1e79763
make linter happy
ConvallariaMaj Aug 12, 2024
1d0dfb0
fix comment
ConvallariaMaj Aug 12, 2024
de5cd55
custom linear jitter
ConvallariaMaj Aug 14, 2024
b7fcf55
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 14, 2024
3ac16db
fix review issues
ConvallariaMaj Aug 14, 2024
24a4056
Merge branch 'develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ such and the node will continue to scan new ATXs for their validity.
### Features

### Improvements
* [#6035](https://github.com/spacemeshos/go-spacemesh/issues/6035) Fixed an issue where the node retried registering for the PoET round
only for 15-20 minutes instead of continuing until the start of the round

## Release v1.6.6-hotfix1

Expand Down
18 changes: 12 additions & 6 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,18 @@ var (

// PoetConfig is the configuration to interact with the poet server.
type PoetConfig struct {
PhaseShift time.Duration `mapstructure:"phase-shift"`
CycleGap time.Duration `mapstructure:"cycle-gap"`
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
// Offset from the epoch start when the poet round starts
PhaseShift time.Duration `mapstructure:"phase-shift"`
// CycleGap gives the duration between the end of a PoET round and the start of the next
CycleGap time.Duration `mapstructure:"cycle-gap"`
// GracePeriod defines the time before the start of the next PoET round until the node
// waits before building its NiPoST challenge. Shorter durations allow the node to
// possibly pick a better positioning ATX, but come with the risk that the node might
// not be able to validate that ATX and has to fall back to using its own previous ATX.
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
// Period to find positioning ATX. Must be less, than GracePeriod
PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
PowParamsCacheTTL time.Duration `mapstructure:"pow-params-cache-ttl"`
Expand Down Expand Up @@ -559,7 +566,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID)
case <-time.After(time.Until(wait)):
}
}

if b.poetCfg.PositioningATXSelectionTimeout > 0 {
var cancel context.CancelFunc

Expand Down
5 changes: 1 addition & 4 deletions activation/nipost.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,7 @@ func (nb *NIPostBuilder) submitPoetChallenge(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout)
defer cancel()

round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID)
round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID)
if err != nil {
return nipost.PoETRegistration{},
&PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err}
Expand Down
71 changes: 47 additions & 24 deletions activation/poet.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"errors"
"fmt"
"io"
"math"
"math/rand/v2"
"net/http"
"net/url"
"sync"
Expand Down Expand Up @@ -69,10 +71,11 @@ type PoetClient interface {

// HTTPPoetClient implements PoetProvingServiceClient interface.
type HTTPPoetClient struct {
id []byte
baseURL *url.URL
client *retryablehttp.Client
logger *zap.Logger
id []byte
baseURL *url.URL
client *retryablehttp.Client
submitChallengeClient *retryablehttp.Client
logger *zap.Logger
}

func checkRetry(ctx context.Context, resp *http.Response, err error) (bool, error) {
Expand Down Expand Up @@ -126,6 +129,13 @@ func WithLogger(logger *zap.Logger) PoetClientOpts {
}
}

func customLinearJitterBackoff(min, max time.Duration, _ int, _ *http.Response) time.Duration {
if max <= min {
return min
}
return min + rand.N(max-min)
}

// NewHTTPPoetClient returns new instance of HTTPPoetClient connecting to the specified url.
func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClientOpts) (*HTTPPoetClient, error) {
client := &retryablehttp.Client{
Expand All @@ -136,6 +146,14 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
CheckRetry: checkRetry,
}

submitChallengeClient := &retryablehttp.Client{
RetryMax: math.MaxInt,
RetryWaitMin: cfg.RequestRetryDelay,
RetryWaitMax: 2 * cfg.RequestRetryDelay,
Backoff: customLinearJitterBackoff,
CheckRetry: retryablehttp.DefaultRetryPolicy,
}

baseURL, err := url.Parse(server.Address)
if err != nil {
return nil, fmt.Errorf("parsing address: %w", err)
Expand All @@ -145,11 +163,13 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
}

poetClient := &HTTPPoetClient{
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
logger: zap.NewNop(),
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
submitChallengeClient: submitChallengeClient,
logger: zap.NewNop(),
}

for _, opt := range opts {
opt(poetClient)
}
Expand All @@ -158,11 +178,11 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
"created poet client",
zap.Stringer("url", baseURL),
zap.Binary("pubkey", server.Pubkey.Bytes()),
zap.Int("max retries", client.RetryMax),
zap.Int("default max retries", client.RetryMax),
zap.Int("submit challenge max retries", submitChallengeClient.RetryMax),
zap.Duration("min retry wait", client.RetryWaitMin),
zap.Duration("max retry wait", client.RetryWaitMax),
)

return poetClient, nil
}

Expand All @@ -176,7 +196,7 @@ func (c *HTTPPoetClient) Address() string {

func (c *HTTPPoetClient) PowParams(ctx context.Context) (*PoetPowParams, error) {
resBody := rpcapi.PowParamsResponse{}
if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody, c.client); err != nil {
return nil, fmt.Errorf("querying PoW params: %w", err)
}

Expand Down Expand Up @@ -228,7 +248,7 @@ func (c *HTTPPoetClient) Submit(
}

resBody := rpcapi.SubmitResponse{}
if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody); err != nil {
if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody, c.submitChallengeClient); err != nil {
return nil, fmt.Errorf("submitting challenge: %w", err)
}
roundEnd := time.Time{}
Expand All @@ -241,7 +261,7 @@ func (c *HTTPPoetClient) Submit(

func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) {
resBody := rpcapi.InfoResponse{}
if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody, c.client); err != nil {
return nil, fmt.Errorf("getting poet info: %w", err)
}

Expand All @@ -268,7 +288,7 @@ func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) {
// Proof implements PoetProvingServiceClient.
func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.PoetProofMessage, []types.Hash32, error) {
resBody := rpcapi.ProofResponse{}
if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody, c.client); err != nil {
return nil, nil, fmt.Errorf("getting proof: %w", err)
}

Expand Down Expand Up @@ -300,7 +320,12 @@ func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.Poet
return &proof, members, nil
}

func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody, resBody proto.Message) error {
func (c *HTTPPoetClient) req(
ctx context.Context,
method, path string,
reqBody, resBody proto.Message,
client *retryablehttp.Client,
) error {
jsonReqBody, err := protojson.Marshal(reqBody)
if err != nil {
return fmt.Errorf("marshaling request body: %w", err)
Expand All @@ -312,7 +337,7 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody,
}
req.Header.Set("Content-Type", "application/json")

res, err := c.client.Do(req)
res, err := client.Do(req)
if err != nil {
return fmt.Errorf("doing request: %w", err)
}
Expand Down Expand Up @@ -343,7 +368,6 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody,
return fmt.Errorf("decoding response body to proto: %w", err)
}
}

return nil
}

Expand Down Expand Up @@ -371,9 +395,10 @@ func (c *cachedData[T]) get(init func() (T, error)) (T, error) {
// poetService is a higher-level interface to communicate with a PoET service.
// It wraps the HTTP client, adding additional functionality.
type poetService struct {
db poetDbAPI
logger *zap.Logger
client PoetClient
db poetDbAPI
logger *zap.Logger
client PoetClient

requestTimeout time.Duration

// Used to avoid concurrent requests for proof.
Expand Down Expand Up @@ -567,9 +592,7 @@ func (c *poetService) Submit(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
defer cancel()
round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
round, err := c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
switch {
case err == nil:
return round, nil
Expand All @@ -579,7 +602,7 @@ func (c *poetService) Submit(
if err != nil {
return nil, fmt.Errorf("authorizing: %w", err)
}
return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
return c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
}
return nil, fmt.Errorf("submitting challenge: %w", err)
}
Expand Down
35 changes: 35 additions & 0 deletions activation/poet_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,41 @@ func Test_HTTPPoetClient_Submit(t *testing.T) {
require.NoError(t, err)
}

func Test_HTTPPoetClient_SubmitTillCtxCanceled(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
tries := 0
mux := http.NewServeMux()
mux.HandleFunc("POST /v1/submit", func(w http.ResponseWriter, r *http.Request) {
tries += 1
if tries == 3 {
cancel()
}
http.Error(w, "some_error", http.StatusInternalServerError)
})
ts := httptest.NewServer(mux)
defer ts.Close()
cfg := server.DefaultRoundConfig()
client, err := NewHTTPPoetClient(types.PoetServer{Address: ts.URL}, PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
MaxRequestRetries: 1,
}, withCustomHttpClient(ts.Client()))
require.NoError(t, err)
_, err = client.Submit(
ctx,
time.Time{},
nil,
nil,
types.EmptyEdSignature,
types.NodeID{},
PoetAuth{},
)
require.ErrorIs(t, err, context.Canceled)
require.Equal(t, 3, tries)
}

func Test_HTTPPoetClient_Address(t *testing.T) {
t.Run("with scheme", func(t *testing.T) {
t.Parallel()
Expand Down
8 changes: 4 additions & 4 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,13 +317,13 @@ func AddFlags(flagSet *pflag.FlagSet, cfg *config.Config) (configPath *string) {
/**======================== PoET Flags ========================== **/

flagSet.DurationVar(&cfg.POET.PhaseShift, "phase-shift",
cfg.POET.PhaseShift, "phase shift of poet server")
cfg.POET.PhaseShift, "phase shift of poet server: duration after epoch start, at which poet round starts")
flagSet.DurationVar(&cfg.POET.CycleGap, "cycle-gap",
cfg.POET.CycleGap, "cycle gap of poet server")
cfg.POET.CycleGap, "cycle gap of poet server: gap between poet rounds")
flagSet.DurationVar(&cfg.POET.GracePeriod, "grace-period",
cfg.POET.GracePeriod, "time before PoET round starts when the node builds and submits a challenge")
cfg.POET.GracePeriod, "time before poet round starts, when the node builds and submits a challenge")
flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-request-timeout",
cfg.POET.RequestTimeout, "timeout for poet requests")
cfg.POET.RequestTimeout, "default timeout for poet requests")

/**======================== bootstrap data updater Flags ========================== **/

Expand Down
4 changes: 3 additions & 1 deletion config/presets/fastnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ func fastnet() config.Config {
conf.POET.GracePeriod = 10 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
conf.POET.CertifierInfoCacheTTL = time.Minute
Expand Down
4 changes: 3 additions & 1 deletion config/presets/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ func standalone() config.Config {
conf.POET.GracePeriod = 12 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
conf.POET.CertifierInfoCacheTTL = time.Minute
Expand Down
Loading