From 57971d26429b6b524553d79e9e4081ce74bcee6e Mon Sep 17 00:00:00 2001
From: Matthew Whitlock
Date: Thu, 24 Aug 2023 12:40:49 -0700
Subject: [PATCH] Add Fenix_Process_detect_failures

---
 include/fenix.h                  |  2 ++
 include/fenix_ext.h              |  3 +++
 include/fenix_process_recovery.h |  2 ++
 src/fenix.c                      |  4 ++++
 src/fenix_process_recovery.c     | 26 ++++++++++++++++++++++++++
 5 files changed, 37 insertions(+)

diff --git a/include/fenix.h b/include/fenix.h
index 3ecb938..77d573b 100644
--- a/include/fenix.h
+++ b/include/fenix.h
@@ -231,6 +231,8 @@ int Fenix_Process_fail_list(int** fail_list);
 
 int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);
 
+int Fenix_Process_detect_failures(int do_recovery);
+
 #if defined(c_plusplus) || defined(__cplusplus)
 }
 #endif
diff --git a/include/fenix_ext.h b/include/fenix_ext.h
index fd4b1a6..ef4dcc4 100644
--- a/include/fenix_ext.h
+++ b/include/fenix_ext.h
@@ -96,6 +96,9 @@ typedef struct {
     //Manage state of the comms. Necessary when failures happen rapidly, mussing up state
     int new_world_exists, user_world_exists;
 
+    int dummy_recv_buffer;
+    MPI_Request check_failures_req;
+
     MPI_Op   agree_op;           // This is reserved for the global agreement call for Fenix data recovery API
 
 
diff --git a/include/fenix_process_recovery.h b/include/fenix_process_recovery.h
index 5243ae4..9b85e04 100644
--- a/include/fenix_process_recovery.h
+++ b/include/fenix_process_recovery.h
@@ -118,6 +118,8 @@ void __fenix_set_rank_role(int FenixRankRole);
 
 void __fenix_postinit(int *);
 
+int __fenix_detect_failures(int do_recovery);
+
 void __fenix_finalize();
 
 void __fenix_finalize_spare();
diff --git a/src/fenix.c b/src/fenix.c
index a8e5e28..70c55d5 100644
--- a/src/fenix.c
+++ b/src/fenix.c
@@ -209,3 +209,7 @@ int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status){
     //Request was (potentially) cancelled if ret is MPI_ERR_PROC_FAILED
     return ret == MPI_ERR_PROC_FAILED || ret == MPI_ERR_REVOKED;
 }
+
+int Fenix_Process_detect_failures(int do_recovery){
+    return __fenix_detect_failures(do_recovery);
+}
diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c
index 52b57ff..8d746d4 100644
--- a/src/fenix_process_recovery.c
+++ b/src/fenix_process_recovery.c
@@ -686,6 +686,11 @@ void __fenix_postinit(int *error)
     //            fenix.role);
     //}
 
+    if(fenix.new_world_exists){
+        //Set up dummy irecv to use for checking for failures.
+        MPI_Irecv(&fenix.dummy_recv_buffer, 1, MPI_INT, MPI_ANY_SOURCE,
+                34095347, fenix.new_world, &fenix.check_failures_req);
+    }
 
     if (fenix.repair_result != 0) {
         *error = fenix.repair_result;
@@ -707,6 +712,27 @@ void __fenix_postinit(int *error)
     }
 }
 
+//Polls the dummy receive posted in __fenix_postinit with MPI_Test and returns
+//the resulting code. fenix.ignore_errs is temporarily set to !do_recovery for
+//the duration of the call and restored to its previous value before returning.
+int __fenix_detect_failures(int do_recovery){
+    if(!fenix.new_world_exists) return FENIX_ERROR_UNINITIALIZED;
+
+    int old_ignore_errs = fenix.ignore_errs;
+    fenix.ignore_errs = !do_recovery;
+
+    //MPI_Test may leave the flag unwritten when it returns an error, so start
+    //from "not completed" instead of reading an indeterminate value.
+    int req_completed = 0;
+    int ret = MPI_Test(&fenix.check_failures_req, &req_completed, MPI_STATUS_IGNORE);
+
+    //The dummy receive is not expected to complete; report it as an internal error.
+    if(req_completed) ret = FENIX_ERROR_INTERN;
+
+    fenix.ignore_errs = old_ignore_errs;
+    return ret;
+}
+
 void __fenix_finalize()
 {
     int location = FENIX_FINALIZE_LOC;