From 16597ebba56a8e7db16e11582cde3199584fa687 Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Thu, 27 Feb 2025 14:58:32 +0800 Subject: [PATCH 01/21] fix(test): Use a timer in test_checkpoint_executor_crash_recovery --- .../checkpoints/checkpoint_executor/tests.rs | 53 +++++++++++++------ 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs index aba6719606e..511a5e0f80b 100644 --- a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs +++ b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs @@ -64,19 +64,30 @@ pub async fn test_checkpoint_executor_crash_recovery() { let epoch_store = state.epoch_store_for_testing().clone(); let executor_handle = spawn_monitored_task!(async move { executor.run_epoch(epoch_store, None).await }); - tokio::time::sleep(Duration::from_secs(5)).await; - // ensure we executed all synced checkpoints - let highest_executed = checkpoint_store - .get_highest_executed_checkpoint_seq_number() - .unwrap() - .expect("Expected highest executed to not be None"); - assert_eq!(highest_executed, 2 * (buffer_size as u64) - 1,); + // Use a timer to ensure all checkpoints are executed + let timeout_duration = Duration::from_secs(20); + tokio::time::timeout(timeout_duration, async { + loop { + let highest_executed = checkpoint_store + .get_highest_executed_checkpoint_seq_number() + .unwrap() + .unwrap_or_default(); + + if highest_executed == 2 * (buffer_size as u64) - 1 { + break; + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + }) + .await + .expect("Timeout waiting for checkpoints to be executed"); // Simulate node restart executor_handle.abort(); - // sync more checkpoints in the meantime + // Sync more checkpoints in the meantime let _ = sync_new_checkpoints( &checkpoint_store, &checkpoint_sender, @@ -85,8 +96,7 @@ pub async fn 
test_checkpoint_executor_crash_recovery() { &committee, ); - // restart checkpoint executor and ensure that it picks - // up where it left off + // Restart checkpoint executor and ensure that it picks up where it left off let mut executor = CheckpointExecutor::new_for_tests( checkpoint_sender.subscribe(), checkpoint_store.clone(), @@ -97,13 +107,24 @@ pub async fn test_checkpoint_executor_crash_recovery() { let epoch_store = state.epoch_store_for_testing().clone(); let executor_handle = spawn_monitored_task!(async move { executor.run_epoch(epoch_store, None).await }); - tokio::time::sleep(Duration::from_secs(15)).await; - let highest_executed = checkpoint_store - .get_highest_executed_checkpoint_seq_number() - .unwrap() - .expect("Expected highest executed to not be None"); - assert_eq!(highest_executed, 4 * (buffer_size as u64) - 1); + // Use a timer to ensure all checkpoints are executed + tokio::time::timeout(timeout_duration, async { + loop { + let highest_executed = checkpoint_store + .get_highest_executed_checkpoint_seq_number() + .unwrap() + .unwrap_or_default(); + + if highest_executed == 4 * (buffer_size as u64) - 1 { + break; + } + + tokio::time::sleep(Duration::from_millis(100)).await; + } + }) + .await + .expect("Timeout waiting for checkpoints to be executed after restart"); executor_handle.abort(); } From 208299d06f8be17ce7e08273186e6c3d641582de Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Thu, 27 Feb 2025 19:50:12 +0800 Subject: [PATCH 02/21] fix(test): Reduce number of synced checkpoints in test_checkpoint_executor_crash_recovery --- .../src/checkpoints/checkpoint_executor/tests.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs index 511a5e0f80b..3fa2225b4c7 100644 --- a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs +++ 
b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs @@ -56,7 +56,7 @@ pub async fn test_checkpoint_executor_crash_recovery() { let checkpoints = sync_new_checkpoints( &checkpoint_store, &checkpoint_sender, - 2 * buffer_size, + buffer_size, None, &committee, ); @@ -74,7 +74,7 @@ pub async fn test_checkpoint_executor_crash_recovery() { .unwrap() .unwrap_or_default(); - if highest_executed == 2 * (buffer_size as u64) - 1 { + if highest_executed == (buffer_size as u64) - 1 { break; } @@ -91,7 +91,7 @@ pub async fn test_checkpoint_executor_crash_recovery() { let _ = sync_new_checkpoints( &checkpoint_store, &checkpoint_sender, - 2 * buffer_size, + buffer_size, Some(checkpoints.last().cloned().unwrap()), &committee, ); @@ -116,7 +116,7 @@ pub async fn test_checkpoint_executor_crash_recovery() { .unwrap() .unwrap_or_default(); - if highest_executed == 4 * (buffer_size as u64) - 1 { + if highest_executed == 2 * (buffer_size as u64) - 1 { break; } From 492c8775eda548c6db9c2ce401ab30c55460a4c4 Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Thu, 27 Feb 2025 20:44:08 +0800 Subject: [PATCH 03/21] fix(test): Increase timeout in test_checkpoint_executor_crash_recovery --- crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs index 3fa2225b4c7..7c895699a99 100644 --- a/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs +++ b/crates/iota-core/src/checkpoints/checkpoint_executor/tests.rs @@ -66,7 +66,7 @@ pub async fn test_checkpoint_executor_crash_recovery() { spawn_monitored_task!(async move { executor.run_epoch(epoch_store, None).await }); // Use a timer to ensure all checkpoints are executed - let timeout_duration = Duration::from_secs(20); + let timeout_duration = Duration::from_secs(60); tokio::time::timeout(timeout_duration, async { 
loop { let highest_executed = checkpoint_store From e6f13ac36858db232f56bff5280f4058d6d629b6 Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Mon, 3 Mar 2025 21:15:12 +0800 Subject: [PATCH 04/21] fix(network): Add timeout to recv() in test_byzantine_peer_handling --- crates/iota-network/src/randomness/tests.rs | 30 +++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index ff6906369bb..58aac6014d1 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -438,12 +438,29 @@ async fn test_byzantine_peer_handling() { None, ); } + + // Use tokio timeout to ensure the test has sometime to meet expected results. + async fn receive_with_timeout( + rx: &mut mpsc::Receiver<(u64, RandomnessRound, Vec<u8>)>, + expected_epoch: u64, + expected_round: u64, + ) -> Result<(), ()> { + let timeout = std::time::Duration::from_secs(30); + let start = std::time::Instant::now(); + while start.elapsed() < timeout { + if let Some((epoch, round, bytes)) = rx.recv().await { + if epoch == expected_epoch && round.0 == expected_round && !bytes.is_empty() { + return Ok(()); + } + } + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + Err(()) + } + for rx in &mut randomness_rxs[2..] { // Validators (2, 3) can communicate normally. - let (epoch, round, bytes) = rx.recv().await.unwrap(); - assert_eq!(0, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); + receive_with_timeout(rx, 0, 0).await.unwrap(); } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. @@ -473,10 +490,7 @@ async fn test_byzantine_peer_handling() { } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) can communicate normally in new epoch. 
- let (epoch, round, bytes) = rx.recv().await.unwrap(); - assert_eq!(1, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); + receive_with_timeout(rx, 1, 0).await.unwrap(); } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. From 6d15e20e4586ad3830960623d15e5b0ce04d04cd Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Tue, 4 Mar 2025 14:32:41 +0800 Subject: [PATCH 05/21] fix: Resolve comments --- crates/iota-network/src/randomness/tests.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 58aac6014d1..a538250e080 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -439,7 +439,7 @@ async fn test_byzantine_peer_handling() { ); } - // Use tokio timeout to ensure the test has sometime to meet expected results. + // Use tokio timeout to ensure the test has some time to meet expected results. async fn receive_with_timeout( rx: &mut mpsc::Receiver<(u64, RandomnessRound, Vec<u8>)>, expected_epoch: u64, @@ -460,7 +460,9 @@ async fn test_byzantine_peer_handling() { for rx in &mut randomness_rxs[2..] { // Validators (2, 3) can communicate normally. - receive_with_timeout(rx, 0, 0).await.unwrap(); + receive_with_timeout(rx, 0, 0) + .await + .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. @@ -490,7 +492,9 @@ async fn test_byzantine_peer_handling() { } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) can communicate normally in new epoch. - receive_with_timeout(rx, 1, 0).await.unwrap(); + receive_with_timeout(rx, 1, 0) + .await + .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. 
From 09a2989ddbf99ff11d0949853d8a447f7e8dd6b2 Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Wed, 5 Mar 2025 14:42:23 +0800 Subject: [PATCH 06/21] fix: Check the first received result in receive_with_timeout --- crates/iota-network/src/randomness/tests.rs | 22 +++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index a538250e080..31aa337c8f2 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -445,17 +445,23 @@ async fn test_byzantine_peer_handling() { expected_epoch: u64, expected_round: u64, ) -> Result<(), ()> { - let timeout = std::time::Duration::from_secs(30); - let start = std::time::Instant::now(); - while start.elapsed() < timeout { - if let Some((epoch, round, bytes)) = rx.recv().await { - if epoch == expected_epoch && round.0 == expected_round && !bytes.is_empty() { - return Ok(()); + loop { + tokio::select! { + received = rx.recv() => match received { + Some((epoch, round, bytes)) => { + assert_eq!(expected_epoch, epoch); + assert_eq!(expected_round, round.0); + assert_ne!(0, bytes.len()); + + return Ok(()); + }, + None => tokio::time::sleep(std::time::Duration::from_millis(100)).await, + }, + _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => { + return Err(()); } } - tokio::time::sleep(std::time::Duration::from_millis(100)).await; } - Err(()) } for rx in &mut randomness_rxs[2..] 
{ From 0d711edfdc77d6bb56ab154b1d1612e2e480925d Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Wed, 5 Mar 2025 15:14:44 +0800 Subject: [PATCH 07/21] fix: Add err message to receive_with_timeout --- crates/iota-network/src/randomness/tests.rs | 30 ++++++++++----------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 31aa337c8f2..41e0eb81b4b 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -444,22 +444,22 @@ async fn test_byzantine_peer_handling() { rx: &mut mpsc::Receiver<(u64, RandomnessRound, Vec<u8>)>, expected_epoch: u64, expected_round: u64, - ) -> Result<(), ()> { - loop { - tokio::select! { - received = rx.recv() => match received { - Some((epoch, round, bytes)) => { - assert_eq!(expected_epoch, epoch); - assert_eq!(expected_round, round.0); - assert_ne!(0, bytes.len()); - - return Ok(()); - }, - None => tokio::time::sleep(std::time::Duration::from_millis(100)).await, + ) -> Result<(), String> { + tokio::select! 
{ + received = rx.recv() => match received { + Some((epoch, round, bytes)) => { + assert_eq!(expected_epoch, epoch); + assert_eq!(expected_round, round.0); + assert_ne!(0, bytes.len()); + + Ok(()) }, - _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => { - return Err(()); - } + None => { + Err("Randomness channels has been closed".to_string()) + }, + }, + _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => { + return Err("Timeout expired to receive randomness".to_string()); } } } From 9e44698a5f761958ab32750c952eaca4a782b4ab Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Wed, 5 Mar 2025 15:48:54 +0800 Subject: [PATCH 08/21] fix: Fix clippy errors --- crates/iota-network/src/randomness/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 41e0eb81b4b..11198fa3f77 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -459,7 +459,7 @@ async fn test_byzantine_peer_handling() { }, }, _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => { - return Err("Timeout expired to receive randomness".to_string()); + Err("Timeout expired to receive randomness".to_string()) } } } From 03348694ee0ce39105ec11b285b35c9ffb2058a9 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Thu, 6 Mar 2025 22:08:59 +0300 Subject: [PATCH 09/21] fix: increase client send_signatures request timeout --- crates/iota-network/src/randomness/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iota-network/src/randomness/mod.rs b/crates/iota-network/src/randomness/mod.rs index 3ada502b18e..06690023998 100644 --- a/crates/iota-network/src/randomness/mod.rs +++ b/crates/iota-network/src/randomness/mod.rs @@ -982,7 +982,7 @@ impl RandomnessEventLoop { continue; // don't send partial sigs to self } let mut client = 
RandomnessClient::new(peer.clone()); - const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(10); + const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(30); let full_sig = full_sig.get().cloned(); let request = anemo::Request::new(SendSignaturesRequest { epoch, From c927c33f51b47652044a6fb84558e7eccbef0541 Mon Sep 17 00:00:00 2001 From: jkrvivian <jkrvivian@gmail.com> Date: Fri, 7 Mar 2025 18:35:01 +0800 Subject: [PATCH 10/21] revert: Revert adding receive_with_timeout --- crates/iota-network/src/randomness/tests.rs | 37 ++++++--------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 11198fa3f77..46b6aaced88 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -439,36 +439,15 @@ async fn test_byzantine_peer_handling() { ); } - // Use tokio timeout to ensure the test has some time to meet expected results. - async fn receive_with_timeout( - rx: &mut mpsc::Receiver<(u64, RandomnessRound, Vec<u8>)>, - expected_epoch: u64, - expected_round: u64, - ) -> Result<(), String> { - tokio::select! { - received = rx.recv() => match received { - Some((epoch, round, bytes)) => { - assert_eq!(expected_epoch, epoch); - assert_eq!(expected_round, round.0); - assert_ne!(0, bytes.len()); - - Ok(()) - }, - None => { - Err("Randomness channels has been closed".to_string()) - }, - }, - _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => { - Err("Timeout expired to receive randomness".to_string()) - } - } - } - for rx in &mut randomness_rxs[2..] { // Validators (2, 3) can communicate normally. 
- receive_with_timeout(rx, 0, 0) + let (epoch, round, bytes) = rx + .recv() .await .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); + assert_eq!(0, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. @@ -498,9 +477,13 @@ async fn test_byzantine_peer_handling() { } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) can communicate normally in new epoch. - receive_with_timeout(rx, 1, 0) + let (epoch, round, bytes) = rx + .recv() .await .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); + assert_eq!(1, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. From 21260a2db1c0f1669d369b7a3e2f15df131c42dd Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Fri, 7 Mar 2025 14:55:04 +0300 Subject: [PATCH 11/21] fix: add safety timeouts as test_byzantine_peer_handling can loop forever --- crates/iota-network/src/randomness/tests.rs | 49 ++++++++++++++------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 46b6aaced88..8606ec7da59 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -439,15 +439,25 @@ async fn test_byzantine_peer_handling() { ); } - for rx in &mut randomness_rxs[2..] { + // This test can just deadlock, ie. run indefinitely without making any progress. + // We can control it by waiting on expected randomness for `timeout` secs. + // For some reason it takes so much time for honest peers to produce randomness in presence of byzantine peers. + let timeout = std::time::Duration::from_secs(60); + let (rx2_mut, rx3_mut) = randomness_rxs.split_at_mut(3); + let rnd2_fut = rx2_mut[2].recv(); + let rnd3_fut = rx3_mut[0].recv(); + tokio::select! 
{ // Validators (2, 3) can communicate normally. - let (epoch, round, bytes) = rx - .recv() - .await - .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); - assert_eq!(0, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); + rnds = futures::future::join_all([rnd2_fut, rnd3_fut]) => { + for rnd in rnds { + let (epoch, round, bytes) = rnd + .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); + assert_eq!(0, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); + } + }, + _ = tokio::time::sleep(timeout) => panic!("Timeout expired to receive randomness"), } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. @@ -475,15 +485,22 @@ async fn test_byzantine_peer_handling() { None, ); } - for rx in &mut randomness_rxs[..2] { + + let (rx0_mut, rx1_mut) = randomness_rxs.split_at_mut(1); + let rnd0_fut = rx0_mut[0].recv(); + let rnd1_fut = rx1_mut[0].recv(); + tokio::select! { // Validators (0, 1) can communicate normally in new epoch. - let (epoch, round, bytes) = rx - .recv() - .await - .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); - assert_eq!(1, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); + rnds = futures::future::join_all([rnd0_fut, rnd1_fut]) => { + for rnd in rnds { + let (epoch, round, bytes) = rnd + .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); + assert_eq!(1, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); + } + }, + _ = tokio::time::sleep(timeout) => panic!("Timeout expired to receive randomness"), } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. 
From 5dbe3840d2329355b6dbf0237d6660bed61ff96b Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Fri, 7 Mar 2025 15:32:53 +0300 Subject: [PATCH 12/21] fix: ci-fmt --- crates/iota-network/src/randomness/tests.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 8606ec7da59..e75df778af9 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -439,9 +439,10 @@ async fn test_byzantine_peer_handling() { ); } - // This test can just deadlock, ie. run indefinitely without making any progress. - // We can control it by waiting on expected randomness for `timeout` secs. - // For some reason it takes so much time for honest peers to produce randomness in presence of byzantine peers. + // This test can just deadlock, ie. run indefinitely without making any + // progress. We can control it by waiting on expected randomness for + // `timeout` secs. For some reason it takes so much time for honest peers to + // produce randomness in presence of byzantine peers. 
let timeout = std::time::Duration::from_secs(60); let (rx2_mut, rx3_mut) = randomness_rxs.split_at_mut(3); let rnd2_fut = rx2_mut[2].recv(); From 81f71e6b9e4a906c07459b6659fe6ca23a5fcdd8 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Fri, 7 Mar 2025 15:55:13 +0300 Subject: [PATCH 13/21] fix: use timeout function --- crates/iota-network/src/randomness/tests.rs | 46 ++++++++++----------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index e75df778af9..6ddc6d75a2e 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -447,18 +447,15 @@ async fn test_byzantine_peer_handling() { let (rx2_mut, rx3_mut) = randomness_rxs.split_at_mut(3); let rnd2_fut = rx2_mut[2].recv(); let rnd3_fut = rx3_mut[0].recv(); - tokio::select! { - // Validators (2, 3) can communicate normally. - rnds = futures::future::join_all([rnd2_fut, rnd3_fut]) => { - for rnd in rnds { - let (epoch, round, bytes) = rnd - .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); - assert_eq!(0, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); - } - }, - _ = tokio::time::sleep(timeout) => panic!("Timeout expired to receive randomness"), + // Validators (2, 3) can communicate normally. + let rnds = tokio::time::timeout(timeout, futures::future::join_all([rnd2_fut, rnd3_fut])) + .await + .expect("Honest peers (2, 3) should produce randomness in time"); + for rnd in rnds { + let (epoch, round, bytes) = rnd.expect("Channel is not closed and randomness is produced"); + assert_eq!(0, epoch, "Honest peers produce randomness in epoch 0"); + assert_eq!(0, round.0, "Honest peers produce randomness in round 0"); + assert_ne!(0, bytes.len(), "Honest peers produce non-empty randomness"); } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. 
@@ -490,18 +487,19 @@ async fn test_byzantine_peer_handling() { let (rx0_mut, rx1_mut) = randomness_rxs.split_at_mut(1); let rnd0_fut = rx0_mut[0].recv(); let rnd1_fut = rx1_mut[0].recv(); - tokio::select! { - // Validators (0, 1) can communicate normally in new epoch. - rnds = futures::future::join_all([rnd0_fut, rnd1_fut]) => { - for rnd in rnds { - let (epoch, round, bytes) = rnd - .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); - assert_eq!(1, epoch); - assert_eq!(0, round.0); - assert_ne!(0, bytes.len()); - } - }, - _ = tokio::time::sleep(timeout) => panic!("Timeout expired to receive randomness"), + // Validators (0, 1) can communicate normally in new epoch. + let rnds = tokio::time::timeout(timeout, futures::future::join_all([rnd0_fut, rnd1_fut])) + .await + .expect("Byzantine peers (0, 1) should produce randomness in time"); + for rnd in rnds { + let (epoch, round, bytes) = rnd.expect("Channel is not closed and randomness is produced"); + assert_eq!(1, epoch, "Byzantine peers produce randomness in epoch 1"); + assert_eq!(0, round.0, "Byzantine peers produce randomness in round 0"); + assert_ne!( + 0, + bytes.len(), + "Byzantine peers produce non-empty randomness" + ); } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. 
From 1666d916443e8b8ca8d26cb00323f37bca119074 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Fri, 7 Mar 2025 21:41:07 +0300 Subject: [PATCH 14/21] fix: increase timeout for tests only --- crates/iota-network/src/randomness/mod.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/crates/iota-network/src/randomness/mod.rs b/crates/iota-network/src/randomness/mod.rs index 06690023998..7006f87c12a 100644 --- a/crates/iota-network/src/randomness/mod.rs +++ b/crates/iota-network/src/randomness/mod.rs @@ -982,7 +982,15 @@ impl RandomnessEventLoop { continue; // don't send partial sigs to self } let mut client = RandomnessClient::new(peer.clone()); - const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(30); + // `test_byzantine_peer_handling` built in debug mode takes + // longer to verify invalid signatures and thus needs larger + // timeouts. + #[cfg(test)] + const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(100); + // In release signature verification should take less, so + // smaller timeout should be enough. + #[cfg(not(test))] + const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(10); let full_sig = full_sig.get().cloned(); let request = anemo::Request::new(SendSignaturesRequest { epoch, From 2d1f0b5b26b41f22cc410c15116c5a56b3242530 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Mon, 10 Mar 2025 16:07:51 +0300 Subject: [PATCH 15/21] fix: revert test timeout overkill --- crates/iota-network/src/randomness/tests.rs | 52 +++++++-------------- 1 file changed, 18 insertions(+), 34 deletions(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 6ddc6d75a2e..46b6aaced88 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -439,23 +439,15 @@ async fn test_byzantine_peer_handling() { ); } - // This test can just deadlock, ie. 
run indefinitely without making any - // progress. We can control it by waiting on expected randomness for - // `timeout` secs. For some reason it takes so much time for honest peers to - // produce randomness in presence of byzantine peers. - let timeout = std::time::Duration::from_secs(60); - let (rx2_mut, rx3_mut) = randomness_rxs.split_at_mut(3); - let rnd2_fut = rx2_mut[2].recv(); - let rnd3_fut = rx3_mut[0].recv(); - // Validators (2, 3) can communicate normally. - let rnds = tokio::time::timeout(timeout, futures::future::join_all([rnd2_fut, rnd3_fut])) - .await - .expect("Honest peers (2, 3) should produce randomness in time"); - for rnd in rnds { - let (epoch, round, bytes) = rnd.expect("Channel is not closed and randomness is produced"); - assert_eq!(0, epoch, "Honest peers produce randomness in epoch 0"); - assert_eq!(0, round.0, "Honest peers produce randomness in round 0"); - assert_ne!(0, bytes.len(), "Honest peers produce non-empty randomness"); + for rx in &mut randomness_rxs[2..] { + // Validators (2, 3) can communicate normally. + let (epoch, round, bytes) = rx + .recv() + .await + .expect("Validators (2, 3) should receive randomness in epoch 0, round 0"); + assert_eq!(0, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); } for rx in &mut randomness_rxs[..2] { // Validators (0, 1) are byzantine. @@ -483,23 +475,15 @@ async fn test_byzantine_peer_handling() { None, ); } - - let (rx0_mut, rx1_mut) = randomness_rxs.split_at_mut(1); - let rnd0_fut = rx0_mut[0].recv(); - let rnd1_fut = rx1_mut[0].recv(); - // Validators (0, 1) can communicate normally in new epoch. 
- let rnds = tokio::time::timeout(timeout, futures::future::join_all([rnd0_fut, rnd1_fut])) - .await - .expect("Byzantine peers (0, 1) should produce randomness in time"); - for rnd in rnds { - let (epoch, round, bytes) = rnd.expect("Channel is not closed and randomness is produced"); - assert_eq!(1, epoch, "Byzantine peers produce randomness in epoch 1"); - assert_eq!(0, round.0, "Byzantine peers produce randomness in round 0"); - assert_ne!( - 0, - bytes.len(), - "Byzantine peers produce non-empty randomness" - ); + for rx in &mut randomness_rxs[..2] { + // Validators (0, 1) can communicate normally in new epoch. + let (epoch, round, bytes) = rx + .recv() + .await + .expect("Validators (0, 1) should receive randomness in epoch 1, round 0"); + assert_eq!(1, epoch); + assert_eq!(0, round.0); + assert_ne!(0, bytes.len()); } for rx in &mut randomness_rxs[2..] { // Validators (2, 3) are still on old epoch. From c859223ac8ad19cfaeb69681418f6bb4f4f673c2 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Mon, 10 Mar 2025 16:09:21 +0300 Subject: [PATCH 16/21] test(ignore): test_byzantine_peer_handling still fails on arm64, needs investigation --- crates/iota-network/src/randomness/tests.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 46b6aaced88..058061c7bd5 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -371,6 +371,7 @@ async fn test_restart_recovery() { } #[tokio::test] +#[ignore = "https://github.com/iotaledger/iota/issues/5620"] async fn test_byzantine_peer_handling() { telemetry_subscribers::init_for_testing(); let committee_fixture = CommitteeFixture::generate(rand::rngs::OsRng, 0, 4); From 39237fe7cb66785740eadb15f14af73cb2b76747 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Mon, 10 Mar 2025 16:41:03 +0300 Subject: [PATCH 17/21] fix: increase timeout
value just for arm64 runners to check if it works --- crates/iota-network/src/randomness/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iota-network/src/randomness/mod.rs b/crates/iota-network/src/randomness/mod.rs index 7006f87c12a..997e9f1e823 100644 --- a/crates/iota-network/src/randomness/mod.rs +++ b/crates/iota-network/src/randomness/mod.rs @@ -986,7 +986,7 @@ impl RandomnessEventLoop { // longer to verify invalid signatures and thus needs larger // timeouts. #[cfg(test)] - const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(100); + const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(500); // In release signature verification should take less, so // smaller timeout should be enough. #[cfg(not(test))] From d897114787a911f02d5ab4c932d2cbcd59cb5544 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Mon, 10 Mar 2025 18:43:28 +0300 Subject: [PATCH 18/21] Revert "test(ignore): test_byzantine_peer_handling still fails on arm64, needs investigation" This reverts commit 91be397ff8c7171161a0f1a2a8c60aaee8166bde.
--- crates/iota-network/src/randomness/tests.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/iota-network/src/randomness/tests.rs b/crates/iota-network/src/randomness/tests.rs index 058061c7bd5..46b6aaced88 100644 --- a/crates/iota-network/src/randomness/tests.rs +++ b/crates/iota-network/src/randomness/tests.rs @@ -371,7 +371,6 @@ async fn test_restart_recovery() { } #[tokio::test] -#[ignore = "https://github.com/iotaledger/iota/issues/5620"] async fn test_byzantine_peer_handling() { telemetry_subscribers::init_for_testing(); let committee_fixture = CommitteeFixture::generate(rand::rngs::OsRng, 0, 4); From 9b54c4a4408503a668b493b9fa0ff04b9d594507 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Tue, 11 Mar 2025 18:14:11 +0300 Subject: [PATCH 19/21] ci: run rust tests on self-hosted-arm64 to test if randomness timeout fix works --- .github/workflows/_rust_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_rust_tests.yml b/.github/workflows/_rust_tests.yml index 22903ca8b9c..824cd858bda 100644 --- a/.github/workflows/_rust_tests.yml +++ b/.github/workflows/_rust_tests.yml @@ -61,7 +61,7 @@ jobs: if: | !cancelled() && (inputs.isRust || inputs.isPgIntegration || inputs.isMoveExampleUsedByOthers) timeout-minutes: 90 - runs-on: [self-hosted-x64] + runs-on: [self-hosted-arm64] env: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgrespw From 69163f262169b447147dac811d9d527af83b59f6 Mon Sep 17 00:00:00 2001 From: Vlad Semenov <vlad.semenov@iota.org> Date: Tue, 11 Mar 2025 19:46:37 +0300 Subject: [PATCH 20/21] Revert "ci: run rust tests on self-hosted-arm64 to test if randomness timeout fix works" test_byzantine_peer_handling succeeded on selfhosted-arm64 with 500 sec timeout: https://github.com/iotaledger/iota/actions/runs/13791724155/job/38573592500#step:9:2993 This reverts commit a4511f2c08ba0366774e142567ca56f43a35b3f7. 
--- .github/workflows/_rust_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_rust_tests.yml b/.github/workflows/_rust_tests.yml index 824cd858bda..22903ca8b9c 100644 --- a/.github/workflows/_rust_tests.yml +++ b/.github/workflows/_rust_tests.yml @@ -61,7 +61,7 @@ jobs: if: | !cancelled() && (inputs.isRust || inputs.isPgIntegration || inputs.isMoveExampleUsedByOthers) timeout-minutes: 90 - runs-on: [self-hosted-arm64] + runs-on: [self-hosted-x64] env: POSTGRES_USER: postgres POSTGRES_PASSWORD: postgrespw From c4f4882e8e15620fddac6c28bc99798054ea182e Mon Sep 17 00:00:00 2001 From: muXxer <git@muxxer.de> Date: Wed, 12 Mar 2025 12:42:00 +0100 Subject: [PATCH 21/21] fix: align SEND_PARTIAL_SIGNATURES_TIMEOUT to nextest timeout --- crates/iota-network/src/randomness/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iota-network/src/randomness/mod.rs b/crates/iota-network/src/randomness/mod.rs index 997e9f1e823..30085346d0a 100644 --- a/crates/iota-network/src/randomness/mod.rs +++ b/crates/iota-network/src/randomness/mod.rs @@ -986,7 +986,7 @@ impl RandomnessEventLoop { // longer to verify invalid signatures and thus needs larger // timeouts. #[cfg(test)] - const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(500); + const SEND_PARTIAL_SIGNATURES_TIMEOUT: Duration = Duration::from_secs(300); // In release signature verification should take less, so // smaller timeout should be enough. #[cfg(not(test))]