Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Merged by Bors] - Retry registration timeout fix #6136

Closed
wants to merge 31 commits into from
Closed
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
377a8c1
fixed timeouts
ConvallariaMaj Jul 15, 2024
d122094
fix tests + make linter happy
ConvallariaMaj Jul 15, 2024
fd3414e
revert config structure
ConvallariaMaj Jul 15, 2024
e3d6d8a
revert config structure2
ConvallariaMaj Jul 15, 2024
94383d7
fix tests
ConvallariaMaj Jul 15, 2024
4fad1a0
make linter happy
ConvallariaMaj Jul 15, 2024
9ff5958
fix review issue
ConvallariaMaj Jul 15, 2024
4046cc5
fix review issue + revert naming
ConvallariaMaj Jul 16, 2024
ab0b71e
removed overengineering
ConvallariaMaj Jul 17, 2024
c969205
removed overengineering2
ConvallariaMaj Jul 17, 2024
9435aee
removed overengineering2
ConvallariaMaj Jul 17, 2024
d515791
reverted timeout to Proof method, fixed some tests
ConvallariaMaj Jul 17, 2024
50a3e0e
added PositioningATXSelectionTimeout to all envs
ConvallariaMaj Jul 17, 2024
4d067c6
fix test and timeout
ConvallariaMaj Jul 17, 2024
fcd2ab1
revert unnecessary changes
ConvallariaMaj Jul 17, 2024
eb4e623
added PositioningATXSelectionTimeout to test env
ConvallariaMaj Jul 17, 2024
bfe0c34
fix flag
ConvallariaMaj Jul 17, 2024
bc743d2
added changelog
ConvallariaMaj Jul 18, 2024
0d30ee1
unlimited retry
ConvallariaMaj Jul 27, 2024
d9b0513
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Jul 28, 2024
c5be14d
fix issues
ConvallariaMaj Aug 1, 2024
2c53d3e
review fixes
ConvallariaMaj Aug 7, 2024
50e3129
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 7, 2024
8e729ea
minor fixes
ConvallariaMaj Aug 12, 2024
cd14fc0
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 12, 2024
1e79763
make linter happy
ConvallariaMaj Aug 12, 2024
1d0dfb0
fix comment
ConvallariaMaj Aug 12, 2024
de5cd55
custom linear jitter
ConvallariaMaj Aug 14, 2024
b7fcf55
Merge branch 'refs/heads/develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 14, 2024
3ac16db
fix review issues
ConvallariaMaj Aug 14, 2024
24a4056
Merge branch 'develop' into retry-registration-timeout-fix
ConvallariaMaj Aug 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,18 @@ var (

// PoetConfig is the configuration to interact with the poet server.
type PoetConfig struct {
PhaseShift time.Duration `mapstructure:"phase-shift"`
CycleGap time.Duration `mapstructure:"cycle-gap"`
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
// Start of new PoET round
poszu marked this conversation as resolved.
Show resolved Hide resolved
PhaseShift time.Duration `mapstructure:"phase-shift"`
// A gap between end of old PoET round and start of new one
fasmat marked this conversation as resolved.
Show resolved Hide resolved
CycleGap time.Duration `mapstructure:"cycle-gap"`
// Time in the end of cycle gap, when PoST challenge must be build and send to PoET server
poszu marked this conversation as resolved.
Show resolved Hide resolved
GracePeriod time.Duration `mapstructure:"grace-period"`
// Period to find positioning ATX. Must be less, than GracePeriod
PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
MaxRequestRetries int `mapstructure:"retry-max"`
}

func DefaultPoetConfig() PoetConfig {
Expand Down Expand Up @@ -547,7 +551,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID)
case <-time.After(time.Until(wait)):
}
}

if b.poetCfg.PositioningATXSelectionTimeout > 0 {
var cancel context.CancelFunc

Expand Down
11 changes: 9 additions & 2 deletions activation/activation_multi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,10 @@ func TestRegossip(t *testing.T) {
}

func Test_Builder_Multi_InitialPost(t *testing.T) {
tab := newTestBuilder(t, 5, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 5, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration * 4,
}))

var eg errgroup.Group
for _, sig := range tab.signers {
Expand Down Expand Up @@ -271,7 +274,11 @@ func Test_Builder_Multi_InitialPost(t *testing.T) {

func Test_Builder_Multi_HappyPath(t *testing.T) {
layerDuration := 2 * time.Second
tab := newTestBuilder(t, 3, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4, CycleGap: layerDuration}))
tab := newTestBuilder(t, 3, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration * 4,
CycleGap: layerDuration,
}))

// step 1: build initial posts
initialPostChan := make(chan struct{})
Expand Down
21 changes: 13 additions & 8 deletions activation/activation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,10 @@ func TestBuilder_StopSmeshing_failsWhenNotStarted(t *testing.T) {
}

func TestBuilder_PublishActivationTx_HappyFlow(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
tab := newTestBuilder(t, 1, WithPoetConfig(
PoetConfig{
PhaseShift: layerDuration,
}))
sig := maps.Values(tab.signers)[0]

posEpoch := postGenesisEpoch
Expand Down Expand Up @@ -380,7 +383,9 @@ func TestBuilder_PublishActivationTx_HappyFlow(t *testing.T) {
// failing with ErrATXChallengeExpired.
func TestBuilder_Loop_WaitsOnStaleChallenge(t *testing.T) {
// Arrange
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{
PhaseShift: layerDuration * 4,
}))
sig := maps.Values(tab.signers)[0]

// current layer is too late to be able to build a nipost on time
Expand Down Expand Up @@ -429,7 +434,9 @@ func TestBuilder_Loop_WaitsOnStaleChallenge(t *testing.T) {
}

func TestBuilder_PublishActivationTx_FaultyNet(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{
PhaseShift: layerDuration * 4,
}))
sig := maps.Values(tab.signers)[0]

posEpoch := postGenesisEpoch
Expand Down Expand Up @@ -501,9 +508,7 @@ func TestBuilder_PublishActivationTx_FaultyNet(t *testing.T) {
}

func TestBuilder_PublishActivationTx_UsesExistingChallengeOnLatePublish(t *testing.T) {
poetCfg := PoetConfig{
PhaseShift: layerDuration * 4,
}
poetCfg := PoetConfig{PhaseShift: layerDuration * 4}
tab := newTestBuilder(t, 1, WithPoetConfig(poetCfg))
sig := maps.Values(tab.signers)[0]

Expand Down Expand Up @@ -1140,7 +1145,7 @@ func TestBuilder_RetryPublishActivationTx(t *testing.T) {
}

func TestBuilder_InitialProofGeneratedOnce(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
sig := maps.Values(tab.signers)[0]

post := nipost.Post{
Expand Down Expand Up @@ -1179,7 +1184,7 @@ func TestBuilder_InitialProofGeneratedOnce(t *testing.T) {
}

func TestBuilder_InitialPostIsPersisted(t *testing.T) {
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration * 4}))
tab := newTestBuilder(t, 1, WithPoetConfig(PoetConfig{PhaseShift: layerDuration}))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
sig := maps.Values(tab.signers)[0]

commitmentATX := types.RandomATXID()
Expand Down
6 changes: 3 additions & 3 deletions activation/e2e/activation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,12 @@ func Test_BuilderWithMultipleClients(t *testing.T) {
epoch := layerDuration * layersPerEpoch
genesis := time.Now().Add(layerDuration).Round(layerDuration)
poetCfg := activation.PoetConfig{
PhaseShift: epoch,
CycleGap: 3 * epoch / 4,
GracePeriod: epoch / 4,
RequestTimeout: epoch / 5,
RequestRetryDelay: epoch / 50,
MaxRequestRetries: 10,
PhaseShift: epoch,
CycleGap: 3 * epoch / 4,
GracePeriod: epoch / 4,
}

scrypt := testPostSetupOpts(t).Scrypt
Expand Down
8 changes: 2 additions & 6 deletions activation/nipost.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func (nb *NIPostBuilder) BuildNIPost(
}

events.EmitPoetWaitProof(signer.NodeID(), postChallenge.PublishEpoch, poetRoundEnd)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge, postChallenge.PublishEpoch)
poetProofRef, membership, err = nb.getBestProof(ctx, signer.NodeID(), challenge)
if err != nil {
return nil, &PoetSvcUnstableError{msg: "getBestProof failed", source: err}
}
Expand Down Expand Up @@ -372,10 +372,7 @@ func (nb *NIPostBuilder) submitPoetChallenge(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout)
defer cancel()

round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID)
round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID)
if err != nil {
return &PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err}
}
Expand Down Expand Up @@ -451,7 +448,6 @@ func (nb *NIPostBuilder) getBestProof(
ctx context.Context,
nodeID types.NodeID,
challenge types.Hash32,
publishEpoch types.EpochID,
) (types.PoetProofRef, *types.MerkleProof, error) {
type poetProof struct {
poet *types.PoetProof
Expand Down
20 changes: 9 additions & 11 deletions activation/poet.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,10 @@ type certifierInfo struct {
// poetService is a higher-level interface to communicate with a PoET service.
// It wraps the HTTP client, adding additional functionality.
type poetService struct {
db poetDbAPI
logger *zap.Logger
client PoetClient
db poetDbAPI
logger *zap.Logger
client PoetClient

requestTimeout time.Duration

// Used to avoid concurrent requests for proof.
Expand Down Expand Up @@ -480,9 +481,7 @@ func (c *poetService) Submit(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
defer cancel()
round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
round, err := c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
switch {
case err == nil:
return round, nil
Expand All @@ -492,15 +491,14 @@ func (c *poetService) Submit(
if err != nil {
return nil, fmt.Errorf("recertifying: %w", err)
}
return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
return c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
case errors.Is(err, context.DeadlineExceeded):
logger.Warn("failed to submit challenge: probably submit challenge timeout is too short", zap.Error(err))
fasmat marked this conversation as resolved.
Show resolved Hide resolved
}
return nil, fmt.Errorf("submitting challenge: %w", err)
}

func (c *poetService) Proof(ctx context.Context, roundID string) (*types.PoetProof, []types.Hash32, error) {
getProofsCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
defer cancel()

fasmat marked this conversation as resolved.
Show resolved Hide resolved
c.gettingProof.Lock()
defer c.gettingProof.Unlock()

Expand All @@ -513,7 +511,7 @@ func (c *poetService) Proof(ctx context.Context, roundID string) (*types.PoetPro
c.logger.Warn("cached members found but proof not found in db", zap.String("round_id", roundID), zap.Error(err))
}

proof, members, err := c.client.Proof(getProofsCtx, roundID)
proof, members, err := c.client.Proof(ctx, roundID)
if err != nil {
return nil, nil, fmt.Errorf("getting proof: %w", err)
}
Expand Down
27 changes: 15 additions & 12 deletions activation/poet_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,22 @@ func Test_HTTPPoetClient_ParsesURL(t *testing.T) {

t.Run("add http if missing", func(t *testing.T) {
t.Parallel()
client, err := NewHTTPPoetClient(types.PoetServer{Address: "bla"}, PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
})
client, err := NewHTTPPoetClient(types.PoetServer{Address: "bla"},
PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
})
require.NoError(t, err)
require.Equal(t, "http://bla", client.baseURL.String())
})

t.Run("do not change scheme if present", func(t *testing.T) {
t.Parallel()
client, err := NewHTTPPoetClient(types.PoetServer{Address: "https://bla"}, PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
})
client, err := NewHTTPPoetClient(types.PoetServer{Address: "https://bla"},
PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
})
require.NoError(t, err)
require.Equal(t, "https://bla", client.baseURL.String())
})
Expand Down Expand Up @@ -113,10 +115,11 @@ func Test_HTTPPoetClient_Address_Mainnet(t *testing.T) {

for _, url := range poETServers {
t.Run(url, func(t *testing.T) {
client, err := NewHTTPPoetClient(types.PoetServer{Address: url}, PoetConfig{
PhaseShift: poetCfg.PhaseShift,
CycleGap: poetCfg.CycleGap,
})
client, err := NewHTTPPoetClient(types.PoetServer{Address: url},
PoetConfig{
PhaseShift: poetCfg.PhaseShift,
CycleGap: poetCfg.CycleGap,
})
require.NoError(t, err)
require.Equal(t, url, client.Address())
})
Expand Down
10 changes: 5 additions & 5 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,13 @@ func AddFlags(flagSet *pflag.FlagSet, cfg *config.Config) (configPath *string) {
/**======================== PoET Flags ========================== **/

flagSet.DurationVar(&cfg.POET.PhaseShift, "phase-shift",
cfg.POET.PhaseShift, "phase shift of poet server")
cfg.POET.PhaseShift, "phase shift of poet server: duration after epoch start, at which poet round starts")
flagSet.DurationVar(&cfg.POET.CycleGap, "cycle-gap",
cfg.POET.CycleGap, "cycle gap of poet server")
cfg.POET.CycleGap, "cycle gap of poet server: gap between poet rounds")
flagSet.DurationVar(&cfg.POET.GracePeriod, "grace-period",
cfg.POET.GracePeriod, "time before PoET round starts when the node builds and submits a challenge")
flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-request-timeout",
cfg.POET.RequestTimeout, "timeout for poet requests")
cfg.POET.GracePeriod, "time before poet round starts, when the node builds and submits a challenge")
flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-default-request-timeout",
cfg.POET.RequestTimeout, "default timeout for poet requests")
fasmat marked this conversation as resolved.
Show resolved Hide resolved

/**======================== bootstrap data updater Flags ========================== **/

Expand Down
7 changes: 3 additions & 4 deletions config/mainnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,13 @@ func MainnetConfig() Config {
BeaconSyncWeightUnits: 800,
},
POET: activation.PoetConfig{
PhaseShift: 240 * time.Hour,
CycleGap: 12 * time.Hour,
GracePeriod: 1 * time.Hour,
PositioningATXSelectionTimeout: 50 * time.Minute,
fasmat marked this conversation as resolved.
Show resolved Hide resolved
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestTimeout: 1100 * time.Second,
RequestRetryDelay: 10 * time.Second,
MaxRequestRetries: 10,
PhaseShift: 240 * time.Hour,
CycleGap: 12 * time.Hour,
GracePeriod: 1 * time.Hour,
},
POST: activation.PostConfig{
MinNumUnits: 4,
Expand Down
3 changes: 2 additions & 1 deletion config/presets/fastnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ func fastnet() config.Config {
conf.POET.GracePeriod = 10 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
return conf
Expand Down
3 changes: 2 additions & 1 deletion config/presets/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ func standalone() config.Config {
conf.POET.GracePeriod = 12 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3

Expand Down
7 changes: 4 additions & 3 deletions config/presets/testnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,13 @@ func testnet() config.Config {
BeaconSyncWeightUnits: 800,
},
POET: activation.PoetConfig{
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestTimeout: 550 * time.Second,
RequestRetryDelay: 5 * time.Second,
MaxRequestRetries: 10,
PhaseShift: 12 * time.Hour,
CycleGap: 2 * time.Hour,
GracePeriod: 10 * time.Minute,
RequestTimeout: 550 * time.Second, // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
RequestRetryDelay: 5 * time.Second,
MaxRequestRetries: 10,
},
POST: activation.PostConfig{
MinNumUnits: 2,
Expand Down
Loading