fix(state-parts-dump-check): bound process_part by timeout (#10215)
Problem:
Occasionally, the monitoring gets stuck. What happens is that process_part() is called for a few
parts but never finishes, i.e. it is neither retried, nor successful, nor failed.
My guess is that the get_part().await somehow waits forever, and since there was no timeout bound
on process_part(), the tokio task that runs process_part can hang forever, which leaves the
monitoring app stuck.
I tried making the task for part_id = 100 sleep for 1000000 seconds: the program finished all
other parts and then hung on this one, which matches what I see on the monitoring node.
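
For illustration, a minimal standalone reproduction of that hang, with a long sleep standing in
for the stuck get_part().await (the 1000000-second figure mirrors the experiment above; the rest
is a hypothetical sketch, assuming the tokio crate):

    use std::time::Duration;

    #[tokio::main]
    async fn main() {
        // Stand-in for a process_part() call whose get_part().await never resolves.
        let stuck = tokio::spawn(async {
            tokio::time::sleep(Duration::from_secs(1_000_000)).await;
        });
        // With no timeout bound, this await keeps the program hanging for ~11.5 days.
        let _ = stuck.await;
        println!("unreachable in practice");
    }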

Solution:
Bound process_part() by a timeout, and initiate a retry once the timeout passes. That way, no
matter why the task hangs, it will retry itself.

Note: the timeout is set to 10 minutes because the timer starts counting as soon as the task is
spawned, and it measures real (wall-clock) time rather than the task's CPU time, i.e. time keeps
elapsing even while the task is not running. So we need to make sure that all parts can be
processed within timeout * MAX_RETRIES.
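
A minimal sketch of the resulting retry loop, with a hypothetical do_work() standing in for
process_part(); the names and error types are placeholders, not the actual signatures in cli.rs:

    use std::time::Duration;
    use tokio::time::timeout;

    const MAX_RETRIES: u32 = 5;
    // Wall-clock budget per attempt; it keeps elapsing even while the task is not scheduled.
    const PER_ATTEMPT_TIMEOUT: Duration = Duration::from_secs(600);

    // Hypothetical stand-in for process_part().
    async fn do_work(part_id: u64) -> Result<(), String> {
        let _ = part_id;
        Ok(())
    }

    async fn do_work_with_retries(part_id: u64) -> Result<(), String> {
        let mut retries = 0;
        loop {
            // timeout() returns Ok(inner_result) if do_work finished in time, Err(Elapsed) otherwise.
            match timeout(PER_ATTEMPT_TIMEOUT, do_work(part_id)).await {
                Ok(Ok(())) => return Ok(()),
                // Either the attempt failed or it timed out: retry until the budget is spent.
                _ if retries < MAX_RETRIES => {
                    retries += 1;
                    tokio::time::sleep(Duration::from_secs(5)).await;
                }
                Ok(Err(e)) => return Err(e),
                Err(_) => return Err("timed out on every attempt".to_string()),
            }
        }
    }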
ppca authored Nov 20, 2023
1 parent 1102b37 commit debc26a
Showing 1 changed file with 24 additions and 14 deletions.
38 changes: 24 additions & 14 deletions tools/state-parts-dump-check/src/cli.rs
@@ -19,8 +19,10 @@ use std::path::PathBuf;
 use std::sync::Arc;
 use std::thread::sleep;
 use std::time::{Duration, Instant};
+use tokio::time::timeout;
 
-const MAX_RETRIES: u32 = 3;
+// This isn't set to a huge number because if we keep retrying for too long, we will miss the next epoch's check
+const MAX_RETRIES: u32 = 5;
 
 #[derive(clap::Parser)]
 pub struct StatePartsDumpCheckCommand {
@@ -566,28 +568,36 @@ async fn process_part_with_3_retries(
         let chain_id = chain_id.clone();
         let epoch_id = epoch_id.clone();
         let external = external.clone();
-        res = process_part(
-            part_id,
-            chain_id,
-            epoch_id,
-            epoch_height,
-            shard_id,
-            state_root,
-            num_parts,
-            external,
+        // timeout is needed to deal with edge cases where process_part awaits forever, i.e. the get_part().await somehow waits forever
+        // this is set to a long duration because the timer for each task, i.e. process_part, starts when the task is started, i.e. tokio::spawn is called,
+        // and counts actual time instead of CPU time.
+        // The bottom line is this timeout * MAX_RETRIES > time it takes for all parts to be processed, which is typically < 30 mins on monitoring nodes
+        let timeout_duration = tokio::time::Duration::from_secs(600);
+        res = timeout(
+            timeout_duration,
+            process_part(
+                part_id,
+                chain_id,
+                epoch_id,
+                epoch_height,
+                shard_id,
+                state_root,
+                num_parts,
+                external,
+            ),
         )
         .await;
         match res {
-            Ok(_) => {
+            Ok(Ok(_)) => {
                 tracing::info!(shard_id, epoch_height, part_id, "process_part success.",);
                 break;
             }
-            Err(_) if retries < MAX_RETRIES => {
+            _ if retries < MAX_RETRIES => {
                 tracing::info!(shard_id, epoch_height, part_id, "process_part failed. Will retry.",);
                 retries += 1;
                 tokio::time::sleep(Duration::from_secs(5)).await;
             }
-            Err(_) => {
+            _ => {
                 tracing::info!(
                     shard_id,
                     epoch_height,
@@ -598,7 +608,7 @@
             }
         }
     }
-    res
+    res?
 }
 
 async fn process_part(
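
Side note on the Ok(Ok(_)) pattern and the trailing res? in the diff above: tokio::time::timeout
wraps the future's output in an outer Result whose error is tokio::time::error::Elapsed, so a
fallible future ends up two Results deep, and ? applied to the outer layer both propagates a
timeout error and unwraps to the inner result. A minimal sketch, assuming an anyhow::Result
return type (the actual signature of process_part_with_3_retries is not shown in this excerpt):

    use std::time::Duration;
    use tokio::time::timeout;

    // Hypothetical fallible future standing in for process_part().
    async fn do_work() -> anyhow::Result<u32> {
        Ok(42)
    }

    // `timeout(..).await` yields Result<anyhow::Result<u32>, Elapsed>.
    // `?` on the outer layer converts an Elapsed error into anyhow::Error,
    // leaving the inner anyhow::Result<u32> as the function's return value.
    async fn bounded() -> anyhow::Result<u32> {
        let res = timeout(Duration::from_secs(1), do_work()).await;
        res?
    }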
