diff --git a/src/cart/README.env b/src/cart/README.env index 6457ea19467..6bad14ad41e 100644 --- a/src/cart/README.env +++ b/src/cart/README.env @@ -211,3 +211,5 @@ This file lists the environment variables used in CaRT. traffic congestion. Available options are: "unspec" (default), "best_effort", "low_latency", "bulk_data". + . CRT_CXI_INIT_RETRY + Client-side only: number of times to retry HG_Init_opt2() when initializing the CXI provider fails (default = 3; 0 disables retries). diff --git a/src/cart/crt_hg.c b/src/cart/crt_hg.c index 8f2395a44ff..f1dd6a3f2c3 100644 --- a/src/cart/crt_hg.c +++ b/src/cart/crt_hg.c @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -834,6 +835,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ char addr_str[CRT_ADDR_STR_MAX_LEN] = {'\0'}; size_t str_size = CRT_ADDR_STR_MAX_LEN; struct crt_prov_gdata *prov_data; + uint32_t retry_count = 0; int rc = DER_SUCCESS; prov_data = crt_get_prov_gdata(primary, provider); @@ -869,9 +871,17 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_ init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc; if (thread_mode_single) init_info.na_init_info.thread_mode = NA_THREAD_MODE_SINGLE; - +retry: hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info); if (hg_class == NULL) { + /** workaround for DAOS-16990, DAOS-17011 - retry a few times on init */ + if (provider == CRT_PROV_OFI_CXI && !crt_is_service() && + retry_count < crt_gdata.cg_hg_init_retry_cnt) { + retry_count++; + D_WARN("Could not initialize HG class; retrying (%d)\n", retry_count); + sleep(retry_count * 5); + goto retry; + } D_ERROR("Could not initialize HG class.\n"); D_GOTO(out, rc = -DER_HG); } diff --git a/src/cart/crt_init.c b/src/cart/crt_init.c index be43b8d1f1a..d66d99cd65c 100644 --- a/src/cart/crt_init.c +++ b/src/cart/crt_init.c @@ -1,5 +1,6 
@@ /* * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -281,11 +282,17 @@ data_init(int server, crt_init_options_t *opt) if (mem_pin_enable == 1) mem_pin_workaround(); } else { + int retry_count = 3; + /* * Client-side envariable to indicate that the cluster * is running using a secondary provider */ crt_env_get(CRT_SECONDARY_PROVIDER, &is_secondary); + + /** Client side env for hg_init() retries */ + crt_env_get(CRT_CXI_INIT_RETRY, &retry_count); + crt_gdata.cg_hg_init_retry_cnt = retry_count; } crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1; diff --git a/src/cart/crt_internal_types.h b/src/cart/crt_internal_types.h index f10bf38d7c8..d35148c2bfe 100644 --- a/src/cart/crt_internal_types.h +++ b/src/cart/crt_internal_types.h @@ -1,5 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -170,6 +171,8 @@ struct crt_gdata { long cg_num_cores; /** Inflight rpc quota limit */ uint32_t cg_rpc_quota; + /** Retry count of HG_Init_opt2() on failure when using CXI provider */ + uint32_t cg_hg_init_retry_cnt; }; extern struct crt_gdata crt_gdata; @@ -197,6 +200,7 @@ struct crt_event_cb_priv { ENV_STR(CRT_ATTACH_INFO_PATH) \ ENV(CRT_CREDIT_EP_CTX) \ ENV(CRT_CTX_NUM) \ + ENV(CRT_CXI_INIT_RETRY) \ ENV(CRT_ENABLE_MEM_PIN) \ ENV_STR(CRT_L_GRP_CFG) \ ENV(CRT_L_RANK) \ diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 4ebe4aecd10..2718cbbc84a 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1792,8 +1792,9 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid)); retry: - rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0); + rc = 
ds_pool_thread_collective(pool_uuid, + PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT, + cont_open_one, &arg, DSS_ULT_DEEP_STACK); if (rc != 0) { if (rc == -DER_AGAIN) { dss_sleep(50); @@ -2105,9 +2106,9 @@ ds_cont_tgt_snapshots_update(uuid_t pool_uuid, uuid_t cont_uuid, * the up targets in this scenario. The target property will be updated * upon initiating container aggregation. */ - return ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | - PO_COMP_ST_DOWNOUT | PO_COMP_ST_UP, - cont_snap_update_one, &args, 0); + return ds_pool_thread_collective( + pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_UP, + cont_snap_update_one, &args, DSS_ULT_DEEP_STACK); } void diff --git a/src/pool/srv_target.c b/src/pool/srv_target.c index bd6ec4f9c11..3dccac41e1b 100644 --- a/src/pool/srv_target.c +++ b/src/pool/srv_target.c @@ -2256,8 +2256,9 @@ ds_pool_tgt_prop_update(struct ds_pool *pool, struct pool_iv_prop *iv_prop) arg.uvp_checkpoint_props_changed = 1; } - ret = ds_pool_thread_collective(pool->sp_uuid, PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | - PO_COMP_ST_NEW, update_vos_prop_on_targets, &arg, 0); + ret = ds_pool_thread_collective(pool->sp_uuid, + PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW, + update_vos_prop_on_targets, &arg, DSS_ULT_DEEP_STACK); if (ret != 0) return ret; diff --git a/utils/build.config b/utils/build.config index fd34f7441e9..e90e4c8c7c6 100644 --- a/utils/build.config +++ b/utils/build.config @@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git [patch_versions] spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff -mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch 
+mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch,https://raw.githubusercontent.com/daos-stack/mercury/48df263212604336b2dbe0430dcab4482eb43437/na_ucx_ep_flush.patch pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff argobots=https://github.com/pmodels/argobots/pull/397/commits/411e5b344642ebc82190fd8b125db512e5b449d1.diff,https://github.com/pmodels/argobots/commit/bb0c908abfac4bfe37852eee621930634183c6aa.diff