feat: [NODE-1435] Add backoff and jitter to HostOS upgrades (#395)
Bownairo authored Jul 17, 2024
1 parent fe231b3 commit f5491f4
Showing 5 changed files with 38 additions and 12 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions rs/orchestrator/BUILD.bazel
@@ -43,6 +43,7 @@ rust_library(
         "//rs/sys",
         "//rs/types/management_canister_types",
         "//rs/types/types",
+        "@crate_index//:backoff",
         "@crate_index//:candid",
         "@crate_index//:clap_3_2_25",
         "@crate_index//:exec",
1 change: 1 addition & 0 deletions rs/orchestrator/Cargo.toml
@@ -8,6 +8,7 @@ documentation.workspace = true

 [dependencies]
 async-trait = { workspace = true }
+backoff = { workspace = true }
 candid = { workspace = true }
 clap = { version = "3.2.25", features = ["derive"] }
 exec = "0.3.1"
19 changes: 14 additions & 5 deletions rs/orchestrator/src/hostos_upgrade.rs
@@ -2,6 +2,7 @@ use crate::{
     error::{OrchestratorError, OrchestratorResult},
     registry_helper::RegistryHelper,
 };
+use backoff::{backoff::Backoff, ExponentialBackoff};
 use ic_logger::{info, warn, ReplicaLogger};
 use ic_protobuf::registry::hostos_version::v1::HostosVersionRecord;
 use ic_sys::utility_command::UtilityCommand;
@@ -39,16 +40,24 @@ impl HostosUpgrader {
     pub async fn upgrade_loop(
         &mut self,
         mut exit_signal: Receiver<bool>,
-        interval: Duration,
-        timeout: Duration,
+        mut backoff: ExponentialBackoff,
+        liveness_timeout: Duration,
     ) {
         while !*exit_signal.borrow() {
-            match tokio::time::timeout(timeout, self.check_for_upgrade()).await {
-                Ok(Ok(())) => {}
+            match tokio::time::timeout(liveness_timeout, self.check_for_upgrade()).await {
+                Ok(Ok(())) => backoff.reset(),
                 e => warn!(&self.logger, "Check for HostOS upgrade failed: {:?}", e),
             }
+
+            // NOTE: We currently do not and should not set `max_elapsed_time`,
+            // so that we never run out of backoffs. If `max_elapsed_time` _is_
+            // ever set, repeat the `max_interval` instead. This is technically
+            // not the same behavior as if `max_elapsed_time` was unset, because
+            // we will not be including jitter, but it should be close enough,
+            // and safe.
+            let safe_backoff = backoff.next_backoff().unwrap_or(backoff.max_interval);
             tokio::select! {
-                _ = tokio::time::sleep(interval) => {}
+                _ = tokio::time::sleep(safe_backoff) => {}
                 _ = exit_signal.changed() => {}
             };
         }
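As a side note on the `unwrap_or(backoff.max_interval)` guard above: with the `backoff` crate, `next_backoff()` yields `None` only once `max_elapsed_time` has elapsed, at which point the guard falls back to the jitter-free `max_interval`. A minimal sketch, not part of this commit — the zero `max_elapsed_time`, the short sleep, and the `main` harness are artificial, chosen only to make the `None` case reachable:

use std::time::Duration;

use backoff::{backoff::Backoff, ExponentialBackoffBuilder};

fn main() {
    // Artificially small time budget so that `next_backoff()` runs out
    // immediately; the orchestrator itself passes `None` here and so
    // never exhausts its backoffs.
    let mut backoff = ExponentialBackoffBuilder::new()
        .with_initial_interval(Duration::from_secs(60))
        .with_multiplier(1.75)
        .with_max_interval(Duration::from_secs(2 * 60 * 60))
        .with_max_elapsed_time(Some(Duration::ZERO))
        .build();

    // Ensure some wall-clock time has passed since `build()`.
    std::thread::sleep(Duration::from_millis(10));

    // `next_backoff()` now returns `None`, so the guard falls back to
    // the (jitter-free) `max_interval` of two hours.
    let delay = backoff.next_backoff().unwrap_or(backoff.max_interval);
    assert_eq!(delay, Duration::from_secs(2 * 60 * 60));
}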
28 changes: 21 additions & 7 deletions rs/orchestrator/src/orchestrator.rs
@@ -13,6 +13,7 @@ use crate::{
     ssh_access_manager::SshAccessManager,
     upgrade::Upgrade,
 };
+use backoff::ExponentialBackoffBuilder;
 use get_if_addrs::get_if_addrs;
 use ic_config::metrics::{Config as MetricsConfig, Exporter};
 use ic_crypto::CryptoComponent;
@@ -400,14 +401,27 @@
         // registry some time to catch up, after starting.
         tokio::time::sleep(Duration::from_secs(60)).await;

-        // Run the HostOS upgrade loop with a 15 minute timeout, waiting 1
-        // minute between checks. This timeout is a last resort trying to
-        // revive the upgrade monitoring in case it gets stuck in an
-        // unexpected situation.
-        let interval = Duration::from_secs(60);
-        let timeout = Duration::from_secs(60 * 15);
+        // Run the HostOS upgrade loop with an exponential backoff. A 15
+        // minute liveness timeout will restart the loop if no progress is
+        // made, to ensure the upgrade loop does not get stuck.
+        //
+        // The exponential backoff between retries starts at 1 minute, and
+        // increases by a factor of 1.75, maxing out at two hours,
+        // e.g. (roughly) 1, 1.75, 3, 5.25, 9.5, 16.5, 28.75, 50.25, 88, 120, 120.
+        //
+        // Additionally, there's a random ±50% range added to each delay, for jitter.
+        let backoff = ExponentialBackoffBuilder::new()
+            .with_initial_interval(Duration::from_secs(60))
+            .with_randomization_factor(0.5)
+            .with_multiplier(1.75)
+            .with_max_interval(Duration::from_secs(2 * 60 * 60))
+            .with_max_elapsed_time(None)
+            .build();
+        let liveness_timeout = Duration::from_secs(15 * 60);

-        upgrade.upgrade_loop(exit_signal, interval, timeout).await;
+        upgrade
+            .upgrade_loop(exit_signal, backoff, liveness_timeout)
+            .await;
         info!(log, "Shut down the HostOS upgrade loop");
     }
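To make the resulting schedule concrete, here is a small sketch, not from the commit, that builds the same backoff but with jitter disabled (`randomization_factor = 0.0`) so the underlying sequence is visible; the orchestrator's `0.5` factor widens each of these delays into a uniform ±50% band:

use std::time::Duration;

use backoff::{backoff::Backoff, ExponentialBackoffBuilder};

fn main() {
    // Same configuration as the orchestrator, minus the jitter.
    let mut backoff = ExponentialBackoffBuilder::new()
        .with_initial_interval(Duration::from_secs(60))
        .with_randomization_factor(0.0)
        .with_multiplier(1.75)
        .with_max_interval(Duration::from_secs(2 * 60 * 60))
        .with_max_elapsed_time(None)
        .build();

    // Print the first eleven delays, in minutes.
    for _ in 0..11 {
        if let Some(delay) = backoff.next_backoff() {
            println!("{:.2} min", delay.as_secs_f64() / 60.0);
        }
    }
    // Prints roughly: 1.00, 1.75, 3.06, 5.36, 9.38, 16.41, 28.72,
    // 50.27, 87.96, 120.00, 120.00 — matching the rough figures in
    // the code comment above.
}

With `max_elapsed_time` left at `None`, this sequence never terminates on its own, which is exactly the property the NOTE in `hostos_upgrade.rs` relies on.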
