From 0a5b8f669cd5ed1cf25101a05cbb0906f2b30e8c Mon Sep 17 00:00:00 2001 From: Matthew Whitlock Date: Thu, 28 Apr 2022 16:48:31 -0400 Subject: [PATCH] Update ULFM, improve localized restore functionality, bugfixes (#61) * Update run command to newest recommended flags * Update instructions to latest ULFM/OpenMPI recommended version * Repair files from revert * Add lrestore function for local-data-only restoring * Fix bug in recovery when spare ranks fail * Improve safety when using commit_barrier Co-authored-by: Matthew Whitlock --- README.md | 2 +- examples/01_hello_world/fenix/CMakeLists.txt | 4 +- examples/02_send_recv/fenix/CMakeLists.txt | 2 +- examples/05_subset_create/CMakeLists.txt | 2 +- examples/06_subset_createv/CMakeLists.txt | 2 +- include/fenix.h | 3 + include/fenix_comm_list.h | 2 +- include/fenix_data_group.h | 4 + include/fenix_data_member.h | 2 +- include/fenix_data_packet.h | 2 +- include/fenix_data_recovery.h | 3 +- include/fenix_f.h | 2 +- include/fenix_opt.h | 2 +- include/fenix_process_recovery.h | 2 +- include/fenix_process_recovery_global.h | 2 +- include/fenix_util.h | 4 +- src/fenix.c | 4 + src/fenix_callbacks.c | 2 +- src/fenix_comm_list.c | 2 +- src/fenix_data_policy.c | 3 +- src/fenix_data_policy_in_memory_raid.c | 222 +++++++++++++++---- src/fenix_data_recovery.c | 74 ++++++- src/fenix_mpi_override.c | 2 +- src/fenix_opt.c | 2 +- src/fenix_process_recovery.c | 32 ++- src/fenix_util.c | 4 +- src/globals.c | 2 +- test/failed_spares/CMakeLists.txt | 15 ++ test/failed_spares/fenix_failed_spares.c | 147 ++++++++++++ test/issend/CMakeLists.txt | 2 +- test/issend/fenix_issend_test.c | 1 - test/no_jump/CMakeLists.txt | 2 +- test/request_cancelled/CMakeLists.txt | 2 +- 33 files changed, 480 insertions(+), 78 deletions(-) create mode 100644 test/failed_spares/CMakeLists.txt create mode 100644 test/failed_spares/fenix_failed_spares.c diff --git a/README.md b/README.md index f2a2fa3..09efb60 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ 
These instructions assume you are in your home directory. * For example: ` git clone
` 2. Create a build directory. * For example: ` mkdir -p ~/build/fenix/ && cd ~/build/fenix/ ` -3. Specify the MPI C compiler to use. [ULFM2 Open MPI](https://bitbucket.org/icldistcomp/ulfm2) is the required version. +3. Specify the MPI C compiler to use. [Open MPI 5+](https://github.com/open-mpi/ompi/tree/v5.0.x) is the required version. * To manually indicate which compiler `cmake` should use, set the `MPICC` variable to point to it. * For example: ` export MPICC=~/install/mpi-ulfm/bin/mpicc ` * If the `MPICC` environment variable is not there, `cmake` will try to guess where the MPI implementation is. To help, make sure you include the installation directory of MPI in your `PATH`. diff --git a/examples/01_hello_world/fenix/CMakeLists.txt b/examples/01_hello_world/fenix/CMakeLists.txt index df8d7a1..2dad662 100644 --- a/examples/01_hello_world/fenix/CMakeLists.txt +++ b/examples/01_hello_world/fenix/CMakeLists.txt @@ -12,9 +12,9 @@ add_executable(fenix_hello_world fenix_hello_world.c) target_link_libraries(fenix_hello_world fenix ${MPI_C_LIBRARIES}) if(BUILD_TESTING) - set(CMAKE_BUILD_TYPE Debug) + #set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_hello_world-debug fenix_hello_world.c) target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME hello_world - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1") + COMMAND mpirun --with-ft mpi -n 3 fenix_hello_world-debug "1") endif() diff --git a/examples/02_send_recv/fenix/CMakeLists.txt b/examples/02_send_recv/fenix/CMakeLists.txt index 78b07d5..aa5dc65 100644 --- a/examples/02_send_recv/fenix/CMakeLists.txt +++ b/examples/02_send_recv/fenix/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_ring-debug fenix_ring.c) target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME ring - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2) + COMMAND mpirun --with-ft mpi -np 5 fenix_ring-debug 1 2) 
set_tests_properties(ring PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/05_subset_create/CMakeLists.txt b/examples/05_subset_create/CMakeLists.txt index 10d9864..bf2da45 100644 --- a/examples/05_subset_create/CMakeLists.txt +++ b/examples/05_subset_create/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_create-debug subset_create.c) target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_create - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1) + COMMAND mpirun --with-ft mpi -np 5 fenix_subset_create-debug 1) set_tests_properties(subset_create PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/examples/06_subset_createv/CMakeLists.txt b/examples/06_subset_createv/CMakeLists.txt index 72112eb..3a935a7 100644 --- a/examples/06_subset_createv/CMakeLists.txt +++ b/examples/06_subset_createv/CMakeLists.txt @@ -16,7 +16,7 @@ if(BUILD_TESTING) add_executable(fenix_subset_createv-debug subset_createv.c) target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES}) add_test(NAME subset_createv - COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1) + COMMAND mpirun --with-ft mpi -np 5 fenix_subset_createv-debug 1) set_tests_properties(subset_createv PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE") endif() diff --git a/include/fenix.h b/include/fenix.h index 7a1e382..4d7ca67 100644 --- a/include/fenix.h +++ b/include/fenix.h @@ -181,6 +181,9 @@ int Fenix_Data_barrier(int group_id); int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* found_data); +int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, + int max_count, int time_stamp, Fenix_Data_subset* found_data); + int Fenix_Data_member_restore_from_rank(int member_id, void *data, int max_count, int time_stamp, int group_id, int 
source_rank); diff --git a/include/fenix_comm_list.h b/include/fenix_comm_list.h index 1bd6210..c84f5c7 100644 --- a/include/fenix_comm_list.h +++ b/include/fenix_comm_list.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_data_group.h b/include/fenix_data_group.h index cb37e25..67cb079 100644 --- a/include/fenix_data_group.h +++ b/include/fenix_data_group.h @@ -101,6 +101,10 @@ typedef struct __fenix_group_vtbl { void* target_buffer, int max_count, int time_stamp, Fenix_Data_subset* data_found); + int (*member_lrestore)(fenix_group_t* group, int member_id, + void* target_buffer, int max_count, int time_stamp, + Fenix_Data_subset* data_found); + int (*member_restore_from_rank)(fenix_group_t* group, int member_id, void* target_buffer, int max_count, int time_stamp, int source_rank); diff --git a/include/fenix_data_member.h b/include/fenix_data_member.h index 6be2196..b37c652 100644 --- a/include/fenix_data_member.h +++ b/include/fenix_data_member.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_data_packet.h b/include/fenix_data_packet.h index 018e9bc..372f58a 100644 --- a/include/fenix_data_packet.h +++ b/include/fenix_data_packet.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_data_recovery.h b/include/fenix_data_recovery.h index 79d9553..856dbe5 100644 --- a/include/fenix_data_recovery.h +++ b/include/fenix_data_recovery.h @@ -107,8 +107,6 @@ typedef struct __data_entry_packet { } fenix_data_entry_packet_t; -int store_counter; - int __fenix_group_create(int, MPI_Comm, int, int, int, void*, int*); int __fenix_group_get_redundancy_policy(int, int*, int*, int*); int __fenix_member_create(int, int, void *, int, MPI_Datatype); @@ -122,6 +120,7 @@ int __fenix_data_commit(int, int *); int __fenix_data_commit_barrier(int, int *); int __fenix_data_barrier(int); int __fenix_member_restore(int, int, void *, int, int, Fenix_Data_subset*); +int __fenix_member_lrestore(int, int, void *, int, int, Fenix_Data_subset*); int __fenix_member_restore_from_rank(int, int, void *, int, int, int); int __fenix_get_number_of_members(int, int *); int __fenix_get_member_at_position(int, int *, int); diff --git a/include/fenix_f.h b/include/fenix_f.h index 69b84a6..a8f06c0 100644 --- a/include/fenix_f.h +++ b/include/fenix_f.h @@ -45,7 +45,7 @@ !// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. !// !// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -!// and Michael Heroux +!// Michael Heroux, and Matthew Whitlock !// !// Questions? Contact Keita Teranishi (knteran@sandia.gov) and !// Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_opt.h b/include/fenix_opt.h index 521f885..b032b02 100644 --- a/include/fenix_opt.h +++ b/include/fenix_opt.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_process_recovery.h b/include/fenix_process_recovery.h index 90f2075..bb9d63a 100644 --- a/include/fenix_process_recovery.h +++ b/include/fenix_process_recovery.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_process_recovery_global.h b/include/fenix_process_recovery_global.h index 4b7d0b5..bd06cc8 100644 --- a/include/fenix_process_recovery_global.h +++ b/include/fenix_process_recovery_global.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/include/fenix_util.h b/include/fenix_util.h index 1a99ca1..8f76275 100644 --- a/include/fenix_util.h +++ b/include/fenix_util.h @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) @@ -75,7 +75,7 @@ #include #include -char *logname; +extern char *logname; #define LDEBUG(f...) 
{LLIND("debug",f);} #define LLIND(t,f...) {fprintf(stderr,"%s - %s (%i): %s: \n",logname,__PRETTY_FUNCTION__,getpid(),t); fprintf(stderr,f);} diff --git a/src/fenix.c b/src/fenix.c index 3590297..93f29f9 100644 --- a/src/fenix.c +++ b/src/fenix.c @@ -130,6 +130,10 @@ int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer, return __fenix_member_restore(group_id, member_id, target_buffer, max_count, time_stamp, data_found); } +int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* data_found) { + return __fenix_member_lrestore(group_id, member_id, target_buffer, max_count, time_stamp, data_found); +} + int Fenix_Data_member_resore_from_rank(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, int source_rank) { return 0; } diff --git a/src/fenix_callbacks.c b/src/fenix_callbacks.c index f693080..885058d 100644 --- a/src/fenix_callbacks.c +++ b/src/fenix_callbacks.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/src/fenix_comm_list.c b/src/fenix_comm_list.c index f9fe0cf..d1b56d2 100644 --- a/src/fenix_comm_list.c +++ b/src/fenix_comm_list.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? 
Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/src/fenix_data_policy.c b/src/fenix_data_policy.c index b368223..603aff1 100644 --- a/src/fenix_data_policy.c +++ b/src/fenix_data_policy.c @@ -44,7 +44,8 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Author Matthew Whitlock +// Authors Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// and Matthew Whitloc // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/src/fenix_data_policy_in_memory_raid.c b/src/fenix_data_policy_in_memory_raid.c index d9af83f..40b265d 100644 --- a/src/fenix_data_policy_in_memory_raid.c +++ b/src/fenix_data_policy_in_memory_raid.c @@ -88,6 +88,9 @@ int __imr_barrier(fenix_group_t* group); int __imr_member_restore(fenix_group_t* group, int member_id, void* target_buffer, int max_count, int time_stamp, Fenix_Data_subset* data_found); +int __imr_member_lrestore(fenix_group_t* group, int member_id, + void* target_buffer, int max_count, int time_stamp, + Fenix_Data_subset* data_found); int __imr_member_restore_from_rank(fenix_group_t* group, int member_id, void* target_buffer, int max_count, int time_stamp, int source_rank); @@ -138,6 +141,7 @@ void __fenix_policy_in_memory_raid_get_group(fenix_group_t** group, MPI_Comm com new_group->base.vtbl.snapshot_delete = *__imr_snapshot_delete; new_group->base.vtbl.barrier = *__imr_barrier; new_group->base.vtbl.member_restore = *__imr_member_restore; + new_group->base.vtbl.member_lrestore = *__imr_member_lrestore; new_group->base.vtbl.member_restore_from_rank = *__imr_member_restore_from_rank; new_group->base.vtbl.member_get_attribute = *__imr_member_get_attribute; new_group->base.vtbl.member_set_attribute = *__imr_member_set_attribute; @@ -154,16 +158,82 @@ void __fenix_policy_in_memory_raid_get_group(fenix_group_t** group, 
MPI_Comm com MPI_Comm_rank(comm, &my_rank); if(new_group->raid_mode == 1){ + //Set up the person who's data I am storing as partner 0 + //Set up the person who is storing my data as partner 1 new_group->partners = (int*) malloc(sizeof(int) * 2); + + //odd-sized groups take some extra handling. + bool isOdd = ((comm_size%2) != 0); + + int remaining_size = comm_size; + if(isOdd) remaining_size -= 3; + + //We want to form groups of rank_separation*2 to pair within + int n_full_groups = remaining_size / (new_group->rank_separation*2); + + //We don't always get what we want though, one group may need to be smaller. + int mini_group_size = (remaining_size - n_full_groups*new_group->rank_separation*2)/2; - //Set up the person who's data I am storing - //We need to add comm size to the value since otherwise we might be modding a negative number, - // which is implementation-dependent behavior. - new_group->partners[0] = (comm_size + my_rank - new_group->rank_separation)%comm_size; - //Set up the person who is storing my data - new_group->partners[1] = (my_rank + new_group->rank_separation)%comm_size; - + int start_rank = mini_group_size + (isOdd?1:0); + int mid_rank = comm_size/2; //Only used when isOdd + + int end_mini_group_start = comm_size-mini_group_size-(isOdd?1:0); + int start_mini_group_start = (isOdd?1:0); + bool in_start_mini=false, in_end_mini=false; + + if(my_rank >= start_mini_group_start && my_rank < start_mini_group_start+mini_group_size){ + in_start_mini = true; + } else if(my_rank >= end_mini_group_start && my_rank < comm_size-(isOdd?1:0)){ + in_end_mini = true; + } + + //Allocate the "normal" ranks + if(my_rank >= start_rank && my_rank < end_mini_group_start && (!isOdd || my_rank != mid_rank)){ + //"effective" rank for determining which group I'm in and if I look forward or backward for a partner. 
+ int e_rank = my_rank - start_rank; + if(isOdd && my_rank > mid_rank) --e_rank; //We skip the middle rank when isOdd + + int my_partner; + if(((e_rank/new_group->rank_separation)%2) == 0){ + //Look forward for partner. + my_partner = my_rank + new_group->rank_separation; + if(isOdd && my_rank < mid_rank && my_partner >= mid_rank) ++my_partner; + } else { + my_partner = my_rank - new_group->rank_separation; + if(isOdd && my_rank > mid_rank && my_partner <= mid_rank) --my_partner; + } + + new_group->partners[0] = my_partner; + new_group->partners[1] = my_partner; + } else if(in_start_mini) { + int e_rank = my_rank - start_mini_group_start; + int partner = end_mini_group_start + e_rank; + new_group->partners[0] = partner; + new_group->partners[1] = partner; + } else if(in_end_mini) { + int e_rank = my_rank - end_mini_group_start; + int partner = start_mini_group_start + e_rank; + new_group->partners[0] = partner; + new_group->partners[1] = partner; + } else { //Only things left are the three ranks that must be paired to handle odd-sized comms + if(my_rank == 0){ + new_group->partners[0] = comm_size-1; + new_group->partners[1] = mid_rank; + } else if(my_rank == mid_rank){ + new_group->partners[0] = 0; + new_group->partners[1] = comm_size-1; + } else if(my_rank == comm_size-1){ + new_group->partners[0] = mid_rank; + new_group->partners[1] = 0; + } else { + fprintf(stderr, "FENIX_IMR Fatal error: Rank <%d> no partner assigned, this is a bug in IMR!\n", my_rank); + *flag = FENIX_ERROR_GROUP_CREATE; + return; + } + } + + } else if(new_group->raid_mode == 5){ new_group->set_size = policy_vals[2]; new_group->partners = (int*) malloc(sizeof(int) * new_group->set_size); @@ -642,7 +712,7 @@ int __imr_member_restore(fenix_group_t* g, int member_id, int my_data_found, partner_data_found; //We need to know if both partners found their data. - //First send to partner 1 and recv from partner 0, then flip. + //First send to partner 0 and recv from partner 1, then flip. 
MPI_Sendrecv(&found_member, 1, MPI_INT, group->partners[0], PARTNER_STATUS_TAG, &my_data_found, 1, MPI_INT, group->partners[1], PARTNER_STATUS_TAG, group->base.comm, NULL); @@ -650,53 +720,63 @@ int __imr_member_restore(fenix_group_t* g, int member_id, &partner_data_found, 1, MPI_INT, group->partners[0], PARTNER_STATUS_TAG, group->base.comm, NULL); - if(found_member && partner_data_found){ + if(found_member && partner_data_found && my_data_found){ //I have my data, and the person who's data I am backing up has theirs. We're good to go. retval = FENIX_SUCCESS; - } else if (!found_member && !my_data_found) { - //I lost my data, and my partner 1 doesn't have a copy for me to restore from. - debug_print("ERROR Fenix_Data_member_restore: member_id <%d> does not exist at <%d> or partner <%d>\n", - member_id, group->base.current_rank, group->partners[0]); + } else if (!found_member && (!my_data_found || !partner_data_found)){ + //I lost my data, and my partner doesn't have a copy for me to restore from. + debug_print("ERROR Fenix_Data_member_restore: member_id <%d> does not exist at <%d> or partner(s) <%d> <%d>\n", + member_id, group->base.current_rank, group->partners[0], group->partners[1]); retval = FENIX_ERROR_INVALID_MEMBERID; - } else if(found_member && !partner_data_found){ - //My partner needs info on this member. This policy does nothing special w/ extra input params, so + } else if(found_member){ + //My partner(s) need info on this member. This policy does nothing special w/ extra input params, so //I can just send the basic member metadata. - __fenix_data_member_send_metadata(group->base.groupid, member_id, group->partners[0]); + if(!partner_data_found) + __fenix_data_member_send_metadata(group->base.groupid, member_id, group->partners[0]); //Now my partner will need all of the entries. First they'll need to know how many snapshots //to expect. 
- MPI_Send((void*) &(group->num_snapshots), 1, MPI_INT, group->partners[0], - RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + if(!partner_data_found) + MPI_Send((void*) &(group->num_snapshots), 1, MPI_INT, group->partners[0], + RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); //They also need the timestamps for each snapshot, as well as the value for the next. - MPI_Send((void*)mentry->timestamp, group->num_snapshots+1, MPI_INT, group->partners[0], - RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + if(!partner_data_found) + MPI_Send((void*)mentry->timestamp, group->num_snapshots+1, MPI_INT, group->partners[0], + RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); for(int snapshot = 0; snapshot < group->num_snapshots; snapshot++){ //send data region info next - __fenix_data_subset_send(mentry->data_regions + snapshot, group->partners[0], - __IMR_RECOVER_DATA_REGION_TAG ^ group->base.groupid, group->base.comm); + if(!partner_data_found) + __fenix_data_subset_send(mentry->data_regions + snapshot, group->partners[0], + __IMR_RECOVER_DATA_REGION_TAG ^ group->base.groupid, group->base.comm); - //send my data, to maintain resiliency on my data size_t size; - void* toSend = __fenix_data_subset_serialize(mentry->data_regions+snapshot, - mentry->data[snapshot], member_data.datatype_size, member_data.current_count, - &size); - MPI_Send(toSend, member_data.datatype_size*size, MPI_BYTE, group->partners[0], - RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + void* toSend; + //send my data, to maintain resiliency on my data + if(!my_data_found){ + toSend = __fenix_data_subset_serialize(mentry->data_regions+snapshot, + mentry->data[snapshot], member_data.datatype_size, member_data.current_count, + &size); + MPI_Send(toSend, member_data.datatype_size*size, MPI_BYTE, group->partners[1], + RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + free(toSend); + } //send their data - toSend = 
__fenix_data_subset_serialize(mentry->data_regions+snapshot, - ((char*)mentry->data[snapshot]) + member_data.datatype_size*member_data.current_count, - member_data.datatype_size, member_data.current_count, &size); - MPI_Send(toSend, member_data.datatype_size*size, MPI_BYTE, group->partners[0], - RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + if(!partner_data_found){ + toSend = __fenix_data_subset_serialize(mentry->data_regions+snapshot, + ((char*)mentry->data[snapshot]) + member_data.datatype_size*member_data.current_count, + member_data.datatype_size, member_data.current_count, &size); + MPI_Send(toSend, member_data.datatype_size*size, MPI_BYTE, group->partners[0], + RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm); + free(toSend); + } - free(toSend); } - } else if(!found_member && partner_data_found) { + } else if(!found_member) { //I need info on this member. fenix_member_entry_packet_t packet; __fenix_data_member_recv_metadata(group->base.groupid, group->partners[1], &packet); @@ -731,13 +811,13 @@ int __imr_member_restore(fenix_group_t* g, int member_id, if(recv_size > 0){ void* recv_buf = malloc(member_data.datatype_size * recv_size); //first recieve their data, so store in the resiliency section. - MPI_Recv(recv_buf, recv_size*member_data.datatype_size, MPI_BYTE, group->partners[1], + MPI_Recv(recv_buf, recv_size*member_data.datatype_size, MPI_BYTE, group->partners[0], RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm, NULL); __fenix_data_subset_deserialize(mentry->data_regions + snapshot, recv_buf, ((char*)mentry->data[snapshot]) + member_data.current_count*member_data.datatype_size, member_data.current_count, member_data.datatype_size); - //first recieve their data, so store in the resiliency section. + //Now receive my data. 
MPI_Recv(recv_buf, recv_size*member_data.datatype_size, MPI_BYTE, group->partners[1], RECOVER_MEMBER_ENTRY_TAG^group->base.groupid, group->base.comm, NULL); __fenix_data_subset_deserialize(mentry->data_regions + snapshot, recv_buf, @@ -749,7 +829,7 @@ int __imr_member_restore(fenix_group_t* g, int member_id, } - recovery_locally_possible = found_member || my_data_found; + recovery_locally_possible = found_member || (my_data_found && partner_data_found); } else if (group->raid_mode == 5){ int* set_results = malloc(sizeof(int) * group->set_size); @@ -912,7 +992,7 @@ int __imr_member_restore(fenix_group_t* g, int member_id, __fenix_data_subset_init(1, data_found); //Don't try to restore if we weren't able to get the relevant data. - if(recovery_locally_possible){ + if(recovery_locally_possible && target_buffer != NULL){ data_found->specifier = __FENIX_SUBSET_EMPTY; int oldest_snapshot; @@ -956,6 +1036,70 @@ int __imr_member_restore(fenix_group_t* g, int member_id, return retval; } +int __imr_member_lrestore(fenix_group_t* g, int member_id, + void* target_buffer, int max_count, int time_stamp, Fenix_Data_subset* data_found){ + int retval = -1; + + fenix_imr_group_t* group = (fenix_imr_group_t*)g; + + fenix_imr_mentry_t* mentry; + //find_mentry returns the error status. We found the member (and corresponding data) if there are no errors. 
+ int found_member = !(__imr_find_mentry(group, member_id, &mentry)); + + if(!found_member){ + return FENIX_ERROR_INVALID_MEMBERID; + } + + int member_data_index = __fenix_search_memberid(group->base.member, member_id); + fenix_member_entry_t member_data = group->base.member->member_entry[member_data_index]; + + + + int return_found_data; + if(data_found == NULL){ + data_found = (Fenix_Data_subset*) malloc(sizeof(Fenix_Data_subset)); + return_found_data = 0; + } else { + return_found_data = 1; + } + __fenix_data_subset_init(1, data_found); + + data_found->specifier = __FENIX_SUBSET_EMPTY; + + + int oldest_snapshot; + for(oldest_snapshot = (mentry->current_head - 1); oldest_snapshot >= 0; oldest_snapshot--){ + __fenix_data_subset_merge_inplace(data_found, mentry->data_regions + oldest_snapshot); + + if(__fenix_data_subset_is_full(data_found, member_data.current_count)){ + //The snapshots have formed a full set of data, not need to add older snapshots. + break; + } + } + + //If there isn't a full set of data, don't try to pull from nonexistent snapshot. 
+ if(oldest_snapshot == -1){ + oldest_snapshot = 0; + } + + for(int i = oldest_snapshot; i < mentry->current_head; i++){ + __fenix_data_subset_copy_data(&mentry->data_regions[i], target_buffer, + mentry->data[i], member_data.datatype_size, member_data.current_count); + } + + if(__fenix_data_subset_is_full(data_found, member_data.current_count)){ + retval = FENIX_SUCCESS; + } else { + retval = FENIX_WARNING_PARTIAL_RESTORE; + } + + //Dont forget to clear the commit buffer + mentry->data_regions[mentry->current_head].specifier = __FENIX_SUBSET_EMPTY; + + return retval; + +} + int __imr_member_restore_from_rank(fenix_group_t* group, int member_id, void* target_buffer, int max_count, int time_stamp, diff --git a/src/fenix_data_recovery.c b/src/fenix_data_recovery.c index 7c1c706..da87c30 100644 --- a/src/fenix_data_recovery.c +++ b/src/fenix_data_recovery.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Michael Heroux, and Matthew Whitloc +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) @@ -63,6 +63,7 @@ #include "fenix_util.h" #include "fenix_ext.h" +#include /** * @brief create new group or recover group data for lost processes @@ -583,12 +584,41 @@ int __fenix_data_commit_barrier(int groupid, int *timestamp) { retval = FENIX_ERROR_INVALID_GROUPID; } else { fenix_group_t *group = (fenix.data_recovery->group[group_index]); + + + //We want to make sure there aren't any revocations and also do a barrier. + //Start by disabling Fenix error handling so we don't generate any new revokations here. 
+ int old_failure_handling = fenix.ignore_errs; + fenix.ignore_errs = 1; + + //We'll use comm_agree as a resilient barrier, which should also give time for + //any revocations to propogate + int tmp_throwaway = 1; + MPIX_Comm_agree(group->comm, &tmp_throwaway); + //Now use iprobe to check for revocations. + MPI_Status status; + int ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, group->comm, + &tmp_throwaway, &status); + + fenix.ignore_errs = old_failure_handling; + + + if(ret != MPI_ERR_REVOKED){ + retval = group->vtbl.commit(group); + } - retval = group->vtbl.commit(group); - - int min_timestamp; - MPI_Allreduce( &(group->timestamp), &min_timestamp, 1, MPI_INT, MPI_MIN, group->comm ); + //Now that we've (hopefully) commited, we want to handle any errors we've + //learned about w.r.t failures or revocations. No reason to put handling those off. + if(ret != MPI_SUCCESS){ + retval = ret; + //Just re-calling should have Fenix handle things according to whatever method + //has been assigned. 
+ MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, group->comm, + &tmp_throwaway, &status); + } + + if (timestamp != NULL) { *timestamp = group->timestamp; } @@ -631,6 +661,40 @@ int __fenix_member_restore(int groupid, int memberid, void *data, int maxcount, return retval; } +/** + * @brief + * @param group_id + * @param member_id + * @param data + * @param max_count + * @param time_stamp + */ +int __fenix_member_lrestore(int groupid, int memberid, void *data, int maxcount, int timestamp, Fenix_Data_subset* data_found) { + + int retval = FENIX_SUCCESS; + int group_index = __fenix_search_groupid(groupid, fenix.data_recovery); + int member_index = -1; + + if(group_index != -1) member_index = __fenix_search_memberid(fenix.data_recovery->group[group_index]->member, memberid); + + + if (fenix.options.verbose == 25) { + verbose_print("c-rank: %d, role: %d, group_index: %d, member_index: %d\n", + __fenix_get_current_rank(fenix.new_world), fenix.role, group_index, + member_index); + } + + if (group_index == -1) { + debug_print("ERROR Fenix_Data_member_lrestore: group_id <%d> does not exist\n", + groupid); + retval = FENIX_ERROR_INVALID_GROUPID; + } else { + fenix_group_t *group = (fenix.data_recovery->group[group_index]); + retval = group->vtbl.member_lrestore(group, memberid, data, maxcount, timestamp, data_found); + } + return retval; +} + /** * @brief * @param group_id diff --git a/src/fenix_mpi_override.c b/src/fenix_mpi_override.c index a3592a7..3761348 100644 --- a/src/fenix_mpi_override.c +++ b/src/fenix_mpi_override.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? 
Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/src/fenix_opt.c b/src/fenix_opt.c index 03e07f2..8d5bfcb 100644 --- a/src/fenix_opt.c +++ b/src/fenix_opt.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/src/fenix_process_recovery.c b/src/fenix_process_recovery.c index 6cc42de..5609326 100644 --- a/src/fenix_process_recovery.c +++ b/src/fenix_process_recovery.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, -// Rob Van der Wijngaart, and Michael Heroux +// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) @@ -67,6 +67,8 @@ #include #include +#include + int __fenix_preinit(int *role, MPI_Comm comm, MPI_Comm *new_comm, int *argc, char ***argv, int spare_ranks, int spawn, @@ -260,7 +262,12 @@ int __fenix_create_new_world() } ret = PMPI_Comm_split(fenix.world, 0, current_rank, &fenix.new_world); - if (ret != MPI_SUCCESS) { debug_print("MPI_Comm_split: %d\n", ret); } + if (ret != MPI_SUCCESS){ + int len; + char errstr[MPI_MAX_ERROR_STRING]; + MPI_Error_string(ret, errstr, &len); + debug_print("MPI_Comm_split: %s\n", errstr); + } } return ret; @@ -402,7 +409,6 @@ int __fenix_repair_ranks() if(fenix.role != FENIX_ROLE_INITIAL_RANK){ free(fenix.fail_world); } - fenix.fail_world = (int *) s_malloc(fenix.fail_world_size * sizeof(int)); fenix.fail_world = __fenix_get_fail_ranks(survivor_world, survivor_world_size, fenix.fail_world_size); @@ -426,6 +432,11 @@ int __fenix_repair_ranks() /* Assign new rank for 
reordering */ if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); + + for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){ + if(fenix.fail_world[fail_i] > current_rank) rank_offset--; + } + if (rank_offset < fenix.fail_world_size) { if (fenix.options.verbose == 11) { verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", @@ -513,10 +524,15 @@ int __fenix_repair_ranks() if (current_rank >= active_ranks) { // reorder ranks int rank_offset = ((world_size - 1) - current_rank); + + for(int fail_i = 0; fail_i < fenix.fail_world_size; fail_i++){ + if(fenix.fail_world[fail_i] > current_rank) rank_offset--; + } + if (rank_offset < fenix.fail_world_size) { if (fenix.options.verbose == 2) { - verbose_print("reorder ranks; current_rank: %d -> new_rank: %d\n", - current_rank, fenix.fail_world[rank_offset]); + verbose_print("reorder ranks; current_rank: %d -> new_rank: %d (offset %d)\n", + current_rank, fenix.fail_world[rank_offset], rank_offset); } current_rank = fenix.fail_world[rank_offset]; } @@ -587,9 +603,11 @@ int* __fenix_get_fail_ranks(int *survivor_world, int survivor_world_size, int fa { qsort(survivor_world, survivor_world_size, sizeof(int), __fenix_comparator); int failed_pos = 0; + int *fail_ranks = calloc(fail_world_size, sizeof(int)); + int i; - for (i = 0; i < survivor_world_size; i++) { + for (i = 0; i < survivor_world_size + fail_world_size; i++) { if (__fenix_binary_search(survivor_world, survivor_world_size, i) != 1) { if (fenix.options.verbose == 14) { verbose_print("fail_rank: %d, fail_ranks[%d]: %d\n", i, failed_pos, @@ -738,6 +756,7 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...) } switch (ret) { + case MPI_ERR_PROC_FAILED_PENDING: case MPI_ERR_PROC_FAILED: MPIX_Comm_revoke(fenix.world); MPIX_Comm_revoke(fenix.new_world); @@ -773,6 +792,7 @@ void __fenix_test_MPI(MPI_Comm *pcomm, int *pret, ...) 
#endif } + fenix.role = FENIX_ROLE_SURVIVOR_RANK; if(!fenix.finalized) { switch(fenix.resume_mode) { diff --git a/src/fenix_util.c b/src/fenix_util.c index 3b40933..b56d237 100644 --- a/src/fenix_util.c +++ b/src/fenix_util.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) @@ -58,6 +58,8 @@ #include "fenix_process_recovery.h" #include "fenix_util.h" +char* logname; + /** * @brief * @param invec diff --git a/src/globals.c b/src/globals.c index e834a97..e812a08 100644 --- a/src/globals.c +++ b/src/globals.c @@ -45,7 +45,7 @@ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar -// and Michael Heroux +// Michael Heroux, and Matthew Whitlock // // Questions? Contact Keita Teranishi (knteran@sandia.gov) and // Marc Gamell (mgamell@cac.rutgers.edu) diff --git a/test/failed_spares/CMakeLists.txt b/test/failed_spares/CMakeLists.txt new file mode 100644 index 0000000..96827f3 --- /dev/null +++ b/test/failed_spares/CMakeLists.txt @@ -0,0 +1,15 @@ +# +# This file is part of Fenix +# Copyright (c) 2016 Rutgers University and Sandia Corporation. +# This software is distributed under the BSD License. +# Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +# the U.S. Government retains certain rights in this software. +# For more information, see the LICENSE file in the top Fenix +# directory. 
+# + +#set(CMAKE_BUILD_TYPE Debug) +add_executable(fenix_failed_spares fenix_failed_spares.c) +target_link_libraries(fenix_failed_spares fenix ${MPI_C_LIBRARIES}) +add_test(NAME failed_spares + COMMAND mpirun --with-ft mpi -n 6 fenix_failed_spares 3 1 3 4 ) diff --git a/test/failed_spares/fenix_failed_spares.c b/test/failed_spares/fenix_failed_spares.c new file mode 100644 index 0000000..02d18a4 --- /dev/null +++ b/test/failed_spares/fenix_failed_spares.c @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// +// _|_|_|_| _|_|_|_| _| _| _|_|_| _| _| +// _| _| _|_| _| _| _| _| +// _|_|_| _|_|_| _| _| _| _| _| +// _| _| _| _|_| _| _| _| +// _| _|_|_|_| _| _| _|_|_| _| _| +// +// +// +// +// Copyright (C) 2016 Rutgers University and Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar, +// Michael Heroux, and Matthew Whitlock +// +// Questions? Contact Keita Teranishi (knteran@sandia.gov) and +// Marc Gamell (mgamell@cac.rutgers.edu) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include +#include +#include +#include + +const int kKillID = 1; + +void* exitThread(void* should_exit){ + sleep(1); + if( ((int)should_exit) == 1){ + pid_t pid = getpid(); + kill(pid, SIGTERM); + } + return NULL; +} + +int main(int argc, char **argv) { + + if (argc < 3) { + printf("Usage: %s <# spare ranks> ... 
\n", *argv); + exit(0); + } + + int old_world_size, new_world_size = - 1; + int old_rank = 1, new_rank = - 1; + int spare_ranks = atoi(argv[1]); + + MPI_Init(&argc, &argv); + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Comm world_comm; + MPI_Comm_dup(MPI_COMM_WORLD, &world_comm); + MPI_Comm_size(world_comm, &old_world_size); + MPI_Comm_rank(world_comm, &old_rank); + + int should_cancel = 0; + for(int i = 2; i < argc; i++){ + if(atoi(argv[i]) == old_rank) should_cancel = 1; + } + pthread_t thread_id; + pthread_create(&thread_id, NULL, exitThread, (void*)should_cancel); + + int fenix_status; + int recovered = 0; + MPI_Comm new_comm; + int error; + Fenix_Init(&fenix_status, world_comm, &new_comm, &argc, &argv, spare_ranks, 0, MPI_INFO_NULL, &error); + + if (fenix_status != FENIX_ROLE_INITIAL_RANK) { + MPI_Comm_size(new_comm, &new_world_size); + MPI_Comm_rank(new_comm, &new_rank); + recovered = 1; + } + + if (recovered == 0) { + //Give time for exit thread to work (which needed to give time for fenix init) + sleep(2); + } + + MPI_Barrier(new_comm); + + char processor_name[MPI_MAX_PROCESSOR_NAME]; + int name_len; + MPI_Get_processor_name(processor_name, &name_len); + + printf("hello world: %s, old rank (MPI_COMM_WORLD): %d, new rank: %d, active ranks: %d, ranks before process failure: %d\n", + processor_name, old_rank, new_rank, new_world_size, old_world_size); + + int *fails, num_fails; + num_fails = Fenix_Process_fail_list(&fails); + + char fails_str[100]; //The whole report is assembled here so it is emitted by a single printf call + int fails_len = snprintf(fails_str, sizeof(fails_str), "Rank %d sees failed processes [", new_rank); + for(int i = 0; i < num_fails && fails_len < (int)sizeof(fails_str); i++){ //Append at the tracked offset: using fails_str as both destination and %s source of sprintf is undefined behavior + fails_len += snprintf(fails_str + fails_len, sizeof(fails_str) - fails_len, "%s%d", (i==0 ? 
"" : ", "), fails[i]); + } + if(fails_len < (int)sizeof(fails_str)) snprintf(fails_str + fails_len, sizeof(fails_str) - fails_len, "]\n"); //Skipped when the buffer is already full (output is truncated instead of overflowing) + printf("%s", fails_str); //Never pass a runtime-built buffer as the format string + + + + Fenix_Finalize(); + pthread_join(thread_id, NULL); + + MPI_Finalize(); + + return 0; +} diff --git a/test/issend/CMakeLists.txt b/test/issend/CMakeLists.txt index 1375b5d..c4f6918 100644 --- a/test/issend/CMakeLists.txt +++ b/test/issend/CMakeLists.txt @@ -12,4 +12,4 @@ set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_issend_test fenix_issend_test.c) target_link_libraries(fenix_issend_test fenix ${MPI_C_LIBRARIES}) -add_test(NAME issend COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_issend_test "1") +add_test(NAME issend COMMAND mpirun --with-ft mpi -np 5 fenix_issend_test "1") diff --git a/test/issend/fenix_issend_test.c b/test/issend/fenix_issend_test.c index 7e45e5c..0159297 100644 --- a/test/issend/fenix_issend_test.c +++ b/test/issend/fenix_issend_test.c @@ -66,7 +66,6 @@ const int kKillID = 1; int main(int argc, char **argv) { -#warning "It's a good idea to complain when not enough parameters! Should add this code to other examples too." 
if (argc < 2) { printf("Usage: %s <# spare ranks> \n", *argv); exit(0); diff --git a/test/no_jump/CMakeLists.txt b/test/no_jump/CMakeLists.txt index fb830f5..b3258dd 100644 --- a/test/no_jump/CMakeLists.txt +++ b/test/no_jump/CMakeLists.txt @@ -12,4 +12,4 @@ set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_no_jump_test fenix_no_jump_test.c) target_link_libraries(fenix_no_jump_test fenix ${MPI_C_LIBRARIES}) -add_test(NAME no_jump COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_no_jump_test "1") +add_test(NAME no_jump COMMAND mpirun --with-ft mpi -np 5 fenix_no_jump_test "1") diff --git a/test/request_cancelled/CMakeLists.txt b/test/request_cancelled/CMakeLists.txt index 88af22b..a59af59 100644 --- a/test/request_cancelled/CMakeLists.txt +++ b/test/request_cancelled/CMakeLists.txt @@ -12,4 +12,4 @@ set(CMAKE_BUILD_TYPE Debug) add_executable(fenix_request_cancelled_test fenix_req_cancelled_test.c) target_link_libraries(fenix_request_cancelled_test fenix ${MPI_C_LIBRARIES}) -add_test(NAME request_cancelled COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_request_cancelled_test "1") +add_test(NAME request_cancelled COMMAND mpirun --with-ft mpi -np 5 fenix_request_cancelled_test "1")