From 20658bb5e2d7926ad4003b0ddf4933f9a60b10c7 Mon Sep 17 00:00:00 2001
From: Marcelo Diop-Gonzalez
Date: Wed, 10 Apr 2024 15:27:15 -0400
Subject: [PATCH] fix(mirror): wait 15 seconds before sending transactions
 (#10990)

The test pytest/tools/mirror/offline_test.py often fails with many
fewer transactions observed than wanted, and the logs reveal that many
transactions are invalid because the access keys used do not exist in
the target chain. This happens because some early transactions that
should have added those keys never make it on chain. These transactions
are sent successfully from the perspective of the ClientActor, but the
logs show that they're dropped by the peer manager:

```
DEBUG handle{handler="PeerManagerMessageRequest" actor="PeerManagerActor" msg_type="NetworkRequests"}: network: Failed sending message: peer not connected to=ed25519:Fz7d1xkkt3XsvTPiwk4JRhMuPru4Ss7cLS8fdhshDRj3 num_connected_peers=1 msg=Routed(RoutedMessageV2 { msg: RoutedMessage { ... body: tx GFW8HgTndXVKdcLHdsCXxURjHxDnnEqHadrbxvsLKVQb ...
```

So the peer manager is dropping the transaction instead of routing it,
and the test fails because many subsequent transactions depended on
that one. A git bisect shows that this behavior starts with
https://github.com/near/nearcore/pull/9651; after that change, the
failure to route messages persists for somewhat longer after startup.

The proper way to handle this might be a mechanism whereby these
messages are not silently dropped, with the ClientActor notified when
routing fails so that the transaction can be retried later. But for
now, a workaround is to wait a little before sending transactions: we
set a 15-second timer for the first batch of transactions and then
proceed normally with the rest.
---
 tools/mirror/src/chain_tracker.rs | 47 ++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/tools/mirror/src/chain_tracker.rs b/tools/mirror/src/chain_tracker.rs
index c310052b71d..29641131bdc 100644
--- a/tools/mirror/src/chain_tracker.rs
+++ b/tools/mirror/src/chain_tracker.rs
@@ -129,7 +129,6 @@ pub(crate) enum SentBatch {
 
 // TODO: the separation between what's in here and what's in the main file with struct TxMirror is not
 // that clear and doesn't make that much sense. Should refactor
-#[derive(Default)]
 pub(crate) struct TxTracker {
     sent_txs: HashMap<CryptoHash, TxSendInfo>,
     txs_by_signer: HashMap<(AccountId, PublicKey), BTreeSet<TxId>>,
@@ -147,7 +146,7 @@ pub(crate) struct TxTracker {
     nonempty_height_queued: Option<BlockHeight>,
     height_popped: Option<BlockHeight>,
     height_seen: Option<BlockHeight>,
-    send_time: Option<Pin<Box<tokio::time::Sleep>>>,
+    send_time: Pin<Box<tokio::time::Sleep>>,
     // Config value in the target chain, used to judge how long to wait before sending a new batch of txs
     min_block_production_delay: Duration,
     // timestamps in the target chain, used to judge how long to wait before sending a new batch of txs
@@ -169,7 +168,25 @@ impl TxTracker {
         I: IntoIterator<Item = &'a BlockHeight>,
     {
         let next_heights = next_heights.into_iter().map(Clone::clone).collect();
-        Self { min_block_production_delay, next_heights, stop_height, ..Default::default() }
+        Self {
+            min_block_production_delay,
+            next_heights,
+            stop_height,
+            // Wait at least 15 seconds before sending any transactions because for
+            // a few seconds after the node starts, transaction routing requests
+            // will be silently dropped by the peer manager.
+            send_time: Box::pin(tokio::time::sleep(std::time::Duration::from_secs(15))),
+            sent_txs: HashMap::new(),
+            txs_by_signer: HashMap::new(),
+            queued_blocks: VecDeque::new(),
+            updater_to_keys: HashMap::new(),
+            nonces: HashMap::new(),
+            height_queued: None,
+            nonempty_height_queued: None,
+            height_popped: None,
+            height_seen: None,
+            recent_block_timestamps: VecDeque::new(),
+        }
     }
 
     pub(crate) async fn next_heights(
@@ -441,10 +458,7 @@ impl TxTracker {
     }
 
     pub(crate) fn next_batch_time(&self) -> Instant {
-        match &self.send_time {
-            Some(t) => t.as_ref().deadline().into_std(),
-            None => Instant::now(),
-        }
+        self.send_time.as_ref().deadline().into_std()
     }
 
     pub(crate) async fn next_batch(
@@ -455,10 +469,10 @@ impl TxTracker {
         // sleep until 20 milliseconds before we want to send transactions before we check for nonces
         // in the target chain. In the second or so between now and then, we might process another block
         // that will set the nonces.
-        if let Some(s) = &self.send_time {
-            tokio::time::sleep_until(s.as_ref().deadline() - std::time::Duration::from_millis(20))
-                .await;
-        }
+        tokio::time::sleep_until(
+            self.send_time.as_ref().deadline() - std::time::Duration::from_millis(20),
+        )
+        .await;
         let mut needed_access_keys = HashSet::new();
         for c in self.queued_blocks[0].chunks.iter_mut() {
             for tx in c.txs.iter_mut() {
@@ -518,9 +532,7 @@ impl TxTracker {
                 };
             }
         }
-        if let Some(sleep) = &mut self.send_time {
-            sleep.await;
-        }
+        (&mut self.send_time).await;
         Ok(self.queued_blocks.pop_front().unwrap())
     }
 
@@ -1134,12 +1146,7 @@ impl TxTracker {
         let block_delay = self
             .second_longest_recent_block_delay()
             .unwrap_or(self.min_block_production_delay + Duration::from_millis(100));
-        match &mut self.send_time {
-            Some(t) => t.as_mut().reset(tokio::time::Instant::now() + block_delay),
-            None => {
-                self.send_time = Some(Box::pin(tokio::time::sleep(block_delay)));
-            }
-        }
+        self.send_time.as_mut().reset(tokio::time::Instant::now() + block_delay);
         crate::set_last_source_height(db, b.source_height)?;
         let txs = b
             .chunks
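
The timer pattern this patch lands on (a `tokio::time::Sleep` that
always exists and is reset between batches, rather than an
`Option<Pin<Box<Sleep>>>` rebuilt each time) can be shown in isolation.
Below is a minimal sketch, assuming only the tokio crate with the
`time`, `rt`, and `macros` features enabled; the variable names, the
printed messages, and the 1300ms delay are illustrative stand-ins, not
values taken from the mirror crate:

```
use std::pin::Pin;
use std::time::Duration;

#[tokio::main]
async fn main() {
    // The timer always exists, as with TxTracker::send_time after this
    // patch, so no caller has to handle a None case. The initial
    // 15-second deadline covers the window after startup when the peer
    // manager silently drops routed transactions.
    let mut send_time: Pin<Box<tokio::time::Sleep>> =
        Box::pin(tokio::time::sleep(Duration::from_secs(15)));

    // First batch: wait out the full startup grace period. Pin<Box<Sleep>>
    // is Unpin, so a &mut borrow can be awaited without consuming the timer.
    (&mut send_time).await;
    println!("send first batch");

    // Later batches: reset the same timer rather than allocating a new
    // Sleep. This delay is an illustrative stand-in for the value the
    // mirror derives from recent target-chain block timestamps.
    let block_delay = Duration::from_millis(1300);
    send_time.as_mut().reset(tokio::time::Instant::now() + block_delay);
    (&mut send_time).await;
    println!("send next batch");
}
```

Making the timer non-optional is what removes the `None` branches from
`next_batch_time()` and `next_batch()` above, and `reset()` reuses the
same pinned allocation across batches. `Sleep` itself is `!Unpin`,
which is why it is held as `Pin<Box<_>>` and awaited through `&mut`.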