Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into ckochhof/fix/master…
Browse files Browse the repository at this point in the history
…/daos-16557-v2

Quick-Functional: true
Test-tag: NvmeEnospace
Required-githooks: true
  • Loading branch information
kanard38 authored and knard38 committed Feb 5, 2025
2 parents 115170b + 8fe77b9 commit 43d6fd0
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 9 deletions.
2 changes: 2 additions & 0 deletions src/cart/README.env
Original file line number Diff line number Diff line change
Expand Up @@ -211,3 +211,5 @@ This file lists the environment variables used in CaRT.
traffic congestion. Available options are: "unspec" (default), "best_effort",
"low_latency", "bulk_data".

. CRT_CXI_INIT_RETRY
Client-side only. Number of times HG_Init_opt2() is retried when it fails while
initializing the CXI provider (default = 3). The client sleeps 5 * N seconds
before the N-th retry.
12 changes: 11 additions & 1 deletion src/cart/crt_hg.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -834,6 +835,7 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
char addr_str[CRT_ADDR_STR_MAX_LEN] = {'\0'};
size_t str_size = CRT_ADDR_STR_MAX_LEN;
struct crt_prov_gdata *prov_data;
uint32_t retry_count = 0;
int rc = DER_SUCCESS;

prov_data = crt_get_prov_gdata(primary, provider);
Expand Down Expand Up @@ -869,9 +871,17 @@ crt_hg_class_init(crt_provider_t provider, int ctx_idx, bool primary, int iface_
init_info.traffic_class = (enum na_traffic_class)crt_gdata.cg_swim_tc;
if (thread_mode_single)
init_info.na_init_info.thread_mode = NA_THREAD_MODE_SINGLE;

retry:
hg_class = HG_Init_opt2(info_string, crt_is_service(), HG_VERSION(2, 4), &init_info);
if (hg_class == NULL) {
/** workaround for DAOS-16990, DAOS-17011 - retry a few times on init */
if (provider == CRT_PROV_OFI_CXI && !crt_is_service() &&
retry_count < crt_gdata.cg_hg_init_retry_cnt) {
retry_count++;
D_WARN("Could not initialize HG class; retrying (%d)\n", retry_count);
sleep(retry_count * 5);
goto retry;
}
D_ERROR("Could not initialize HG class.\n");
D_GOTO(out, rc = -DER_HG);
}
Expand Down
7 changes: 7 additions & 0 deletions src/cart/crt_init.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -281,11 +282,17 @@ data_init(int server, crt_init_options_t *opt)
if (mem_pin_enable == 1)
mem_pin_workaround();
} else {
int retry_count = 3;

/*
* Client-side environment variable indicating that the
* cluster is running with a secondary provider
*/
crt_env_get(CRT_SECONDARY_PROVIDER, &is_secondary);

/** Client-side environment variable: HG_Init_opt2() retry count for the CXI provider */
crt_env_get(CRT_CXI_INIT_RETRY, &retry_count);
crt_gdata.cg_hg_init_retry_cnt = retry_count;
}
crt_gdata.cg_provider_is_primary = (is_secondary) ? 0 : 1;

Expand Down
4 changes: 4 additions & 0 deletions src/cart/crt_internal_types.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/*
* (C) Copyright 2016-2024 Intel Corporation.
* (C) Copyright 2025 Hewlett Packard Enterprise Development LP
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -170,6 +171,8 @@ struct crt_gdata {
long cg_num_cores;
/** Inflight rpc quota limit */
uint32_t cg_rpc_quota;
/** Retry count of HG_Init_opt2() on failure when using CXI provider */
uint32_t cg_hg_init_retry_cnt;
};

extern struct crt_gdata crt_gdata;
Expand Down Expand Up @@ -197,6 +200,7 @@ struct crt_event_cb_priv {
ENV_STR(CRT_ATTACH_INFO_PATH) \
ENV(CRT_CREDIT_EP_CTX) \
ENV(CRT_CTX_NUM) \
ENV(CRT_CXI_INIT_RETRY) \
ENV(CRT_ENABLE_MEM_PIN) \
ENV_STR(CRT_L_GRP_CFG) \
ENV(CRT_L_RANK) \
Expand Down
11 changes: 6 additions & 5 deletions src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1792,8 +1792,9 @@ ds_cont_tgt_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid,
DP_UUID(pool_uuid), DP_UUID(cont_uuid), DP_UUID(cont_hdl_uuid));

retry:
rc = ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
PO_COMP_ST_DOWNOUT, cont_open_one, &arg, 0);
rc = ds_pool_thread_collective(pool_uuid,
PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT,
cont_open_one, &arg, DSS_ULT_DEEP_STACK);
if (rc != 0) {
if (rc == -DER_AGAIN) {
dss_sleep(50);
Expand Down Expand Up @@ -2105,9 +2106,9 @@ ds_cont_tgt_snapshots_update(uuid_t pool_uuid, uuid_t cont_uuid,
* the up targets in this scenario. The target property will be updated
* upon initiating container aggregation.
*/
return ds_pool_thread_collective(pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN |
PO_COMP_ST_DOWNOUT | PO_COMP_ST_UP,
cont_snap_update_one, &args, 0);
return ds_pool_thread_collective(
pool_uuid, PO_COMP_ST_NEW | PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_UP,
cont_snap_update_one, &args, DSS_ULT_DEEP_STACK);
}

void
Expand Down
5 changes: 3 additions & 2 deletions src/pool/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -2256,8 +2256,9 @@ ds_pool_tgt_prop_update(struct ds_pool *pool, struct pool_iv_prop *iv_prop)
arg.uvp_checkpoint_props_changed = 1;
}

ret = ds_pool_thread_collective(pool->sp_uuid, PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT |
PO_COMP_ST_NEW, update_vos_prop_on_targets, &arg, 0);
ret = ds_pool_thread_collective(pool->sp_uuid,
PO_COMP_ST_DOWN | PO_COMP_ST_DOWNOUT | PO_COMP_ST_NEW,
update_vos_prop_on_targets, &arg, DSS_ULT_DEEP_STACK);
if (ret != 0)
return ret;

Expand Down
2 changes: 1 addition & 1 deletion utils/build.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@ ucx=https://github.com/openucx/ucx.git

[patch_versions]
spdk=https://github.com/spdk/spdk/commit/b0aba3fcd5aceceea530a702922153bc75664978.diff,https://github.com/spdk/spdk/commit/445a4c808badbad3942696ecf16fa60e8129a747.diff
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch
mercury=https://raw.githubusercontent.com/daos-stack/mercury/f3dc286fb40ec1a3a38a2e17c45497bc2aa6290d/na_ucx.patch,https://raw.githubusercontent.com/daos-stack/mercury/48df263212604336b2dbe0430dcab4482eb43437/na_ucx_ep_flush.patch
pmdk=https://github.com/pmem/pmdk/commit/2abe15ac0b4eed894b6768cd82a3b0a7c4336284.diff
argobots=https://github.com/pmodels/argobots/pull/397/commits/411e5b344642ebc82190fd8b125db512e5b449d1.diff,https://github.com/pmodels/argobots/commit/bb0c908abfac4bfe37852eee621930634183c6aa.diff

0 comments on commit 43d6fd0

Please sign in to comment.