Skip to content

Commit

Permalink
fix: remove potential flakiness from a snapshot_hosts test (#10317)
Browse files Browse the repository at this point in the history
The test `snapshot_hosts::invalid_signature_not_broadcast` sends out a
message containing three instances of `SnapshotHostInfo`, one of which
has an invalid signature, and then verifies that this message is
rejected.

There is a subtelty here that I didn't take into account when I wrote
this test. When the `PeerManager` receives a message with multiple
`SnapshotHostInfos`, it can accept the valid instances, even in the case
where there are other invalid instances in the message. Because of this
the test could fail - `PeerManager` could accept `ok_info_a` or
`ok_info_b` and then the final assert would fail. Whether it accepts
them or not depends on the timing, as the instances are processed on
multiple threads and the processing stops at the first invalid instance.
Because of that the test could be flaky.

Let's fix it by filtering out `ok_info_a` and `ok_info_b` in the final
check. This way even if the `PeerManager` accepts them, it won't break
the test. The code is the same as in
`snapshot_hosts::too_many_shards_not_broadcast()`.
  • Loading branch information
jancionear authored Dec 15, 2023
1 parent c202bfc commit 2b975fc
Showing 1 changed file with 28 additions and 22 deletions.
50 changes: 28 additions & 22 deletions chain/network/src/peer_manager/tests/snapshot_hosts.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::network_protocol::testonly as data;
use crate::network_protocol::SnapshotHostInfo;
use crate::network_protocol::SyncSnapshotHosts;
use crate::network_protocol::MAX_SHARDS_PER_SNAPSHOT_HOST_INFO;
Expand All @@ -10,6 +9,7 @@ use crate::testonly::{make_rng, AsSet as _};
use crate::types::NetworkRequests;
use crate::types::PeerManagerMessageRequest;
use crate::types::PeerMessage;
use crate::{network_protocol::testonly as data, peer::testonly::PeerHandle};
use near_async::time;
use near_crypto::SecretKey;
use near_o11y::testonly::init_test_logger;
Expand Down Expand Up @@ -51,6 +51,25 @@ fn take_sync_snapshot_msg(event: crate::peer::testonly::Event) -> Option<SyncSna
}
}

/// Receives events from the given PeerHandle until the desired target_info is found
/// Ignores infos defined in allowed_ignorable_infos. Any other unexpected info will trigger a panic.
async fn wait_for_host_info(
peer: &mut PeerHandle,
target_info: &SnapshotHostInfo,
allowed_ignorable_infos: &[Arc<SnapshotHostInfo>],
) {
loop {
let msg = peer.events.recv_until(take_sync_snapshot_msg).await;
for info in msg.hosts {
if info.as_ref() == target_info {
return;
} else if !allowed_ignorable_infos.contains(&info) {
panic!("wait_for_host_info: received unexpected SnapshotHostInfo: {:?}", info);
}
}
}
}

/// Test that PeerManager broadcasts SnapshotHostInfo messages to all connected peers
#[tokio::test]
async fn broadcast() {
Expand Down Expand Up @@ -174,7 +193,7 @@ async fn invalid_signature_not_broadcast() {
let ok_info_b = make_snapshot_host_info(&peer1_config.node_id(), &peer1_config.node_key, rng);

let invalid_message = PeerMessage::SyncSnapshotHosts(SyncSnapshotHosts {
hosts: vec![ok_info_a, invalid_info, ok_info_b],
hosts: vec![ok_info_a.clone(), invalid_info, ok_info_b.clone()],
});
peer1.send(invalid_message).await;

Expand All @@ -186,10 +205,11 @@ async fn invalid_signature_not_broadcast() {
.send(PeerMessage::SyncSnapshotHosts(SyncSnapshotHosts { hosts: vec![info2.clone()] }))
.await;

tracing::info!(target:"test", "Make sure that only the valid message is broadcast.");
tracing::info!(target:"test", "Make sure that only the valid messages are broadcast.");

let msg = peer2.events.recv_until(take_sync_snapshot_msg).await;
assert_eq!(msg.hosts, vec![info2]);
// Wait until peer2 receives info2. Ignore ok_info_a and ok_info_b,
// as the PeerManager could accept and broadcast them despite the neighbouring invalid_info.
wait_for_host_info(&mut peer2, &info2, &[ok_info_a, ok_info_b]).await;
}

/// Test that a SnapshotHostInfo message with more shards than allowed isn't broadcast by PeerManager.
Expand Down Expand Up @@ -258,23 +278,9 @@ async fn too_many_shards_not_broadcast() {

tracing::info!(target:"test", "Make sure that only valid messages are broadcast.");

loop {
let msg = peer2.events.recv_until(take_sync_snapshot_msg).await;
for info in msg.hosts {
if info == info2 {
return; // Received the expected message - end the test
}

// Even though `ok_info_a` and `ok_info_b` were sent in an invalid message,
// they were themselevs valid so the PeerManager can optionally accept them
// and broadcast them to other peers. This is an expected behavior, let's filter them out here.
if info == ok_info_a || info == ok_info_b {
continue;
}

panic!("Unexpected host info received: {:#?}", info);
}
}
// Wait until peer2 receives info2. Ignore ok_info_a and ok_info_b,
// as the PeerManager could accept and broadcast them despite the neighbouring invalid_info.
wait_for_host_info(&mut peer2, &info2, &[ok_info_a, ok_info_b]).await;
}

/// Test that SyncSnapshotHosts message is propagated between many PeerManager instances.
Expand Down

0 comments on commit 2b975fc

Please sign in to comment.