Skip to content

Commit

Permalink
Update ULFM, improve localized restore functionality, bugfixes (#61)
Browse files Browse the repository at this point in the history
* Update run command to newest recommended flags
* Update instructions to latest ULFM/OpenMPI recommended version
* Repair files from revert
* Add lrestore function for local-data-only restoring
* Fix bug in recovery when spare ranks fail
* Improve safety when using commit_barrier

Co-authored-by: Matthew Whitlock <[email protected]>
  • Loading branch information
Matthew-Whitlock and Matthew-Whitlock authored Apr 28, 2022
1 parent 571f488 commit 0a5b8f6
Show file tree
Hide file tree
Showing 33 changed files with 480 additions and 78 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ These instructions assume you are in your home directory.
* For example: ` git clone <address of this repo> `
2. Create a build directory.
* For example: ` mkdir -p ~/build/fenix/ && cd ~/build/fenix/ `
3. Specify the MPI C compiler to use. [ULFM2 Open MPI](https://bitbucket.org/icldistcomp/ulfm2) is the required version.
3. Specify the MPI C compiler to use. [Open MPI 5+](https://github.com/open-mpi/ompi/tree/v5.0.x) is the required version.
* To manually indicate which compiler `cmake` should use, set the `MPICC` variable to point to it.
* For example: ` export MPICC=~/install/mpi-ulfm/bin/mpicc `
* If the `MPICC` environment variable is not there, `cmake` will try to guess where the MPI implementation is. To help, make sure you include the installation directory of MPI in your `PATH`.
Expand Down
4 changes: 2 additions & 2 deletions examples/01_hello_world/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@ add_executable(fenix_hello_world fenix_hello_world.c)
target_link_libraries(fenix_hello_world fenix ${MPI_C_LIBRARIES})

if(BUILD_TESTING)
set(CMAKE_BUILD_TYPE Debug)
#set(CMAKE_BUILD_TYPE Debug)
add_executable(fenix_hello_world-debug fenix_hello_world.c)
target_link_libraries(fenix_hello_world-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME hello_world
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 3 fenix_hello_world-debug "1")
COMMAND mpirun --with-ft mpi -n 3 fenix_hello_world-debug "1")
endif()
2 changes: 1 addition & 1 deletion examples/02_send_recv/fenix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_ring-debug fenix_ring.c)
target_link_libraries(fenix_ring-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME ring
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_ring-debug 1 2)
COMMAND mpirun --with-ft mpi -np 5 fenix_ring-debug 1 2)
set_tests_properties(ring PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/05_subset_create/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_create-debug subset_create.c)
target_link_libraries(fenix_subset_create-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_create
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_create-debug 1)
COMMAND mpirun --with-ft mpi -np 5 fenix_subset_create-debug 1)
set_tests_properties(subset_create PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
2 changes: 1 addition & 1 deletion examples/06_subset_createv/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ if(BUILD_TESTING)
add_executable(fenix_subset_createv-debug subset_createv.c)
target_link_libraries(fenix_subset_createv-debug fenix ${MPI_C_LIBRARIES})
add_test(NAME subset_createv
COMMAND mpirun -mca mpi_ft_detector_timeout 1 -np 5 fenix_subset_createv-debug 1)
COMMAND mpirun --with-ft mpi -np 5 fenix_subset_createv-debug 1)
set_tests_properties(subset_createv PROPERTIES
FAIL_REGULAR_EXPRESSION "FAILURE")
endif()
3 changes: 3 additions & 0 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,9 @@ int Fenix_Data_barrier(int group_id);
int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer,
int max_count, int time_stamp, Fenix_Data_subset* found_data);

int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer,
int max_count, int time_stamp, Fenix_Data_subset* found_data);

int Fenix_Data_member_restore_from_rank(int member_id, void *data, int max_count,
int time_stamp, int group_id,
int source_rank);
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_comm_list.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, and Michael Heroux
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
4 changes: 4 additions & 0 deletions include/fenix_data_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ typedef struct __fenix_group_vtbl {
void* target_buffer, int max_count, int time_stamp,
Fenix_Data_subset* data_found);

int (*member_lrestore)(fenix_group_t* group, int member_id,
void* target_buffer, int max_count, int time_stamp,
Fenix_Data_subset* data_found);

int (*member_restore_from_rank)(fenix_group_t* group, int member_id,
void* target_buffer, int max_count, int time_stamp,
int source_rank);
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_data_member.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// and Michael Heroux
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_data_packet.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// and Michael Heroux
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
3 changes: 1 addition & 2 deletions include/fenix_data_recovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,6 @@ typedef struct __data_entry_packet {
} fenix_data_entry_packet_t;


int store_counter;

int __fenix_group_create(int, MPI_Comm, int, int, int, void*, int*);
int __fenix_group_get_redundancy_policy(int, int*, int*, int*);
int __fenix_member_create(int, int, void *, int, MPI_Datatype);
Expand All @@ -122,6 +120,7 @@ int __fenix_data_commit(int, int *);
int __fenix_data_commit_barrier(int, int *);
int __fenix_data_barrier(int);
int __fenix_member_restore(int, int, void *, int, int, Fenix_Data_subset*);
int __fenix_member_lrestore(int, int, void *, int, int, Fenix_Data_subset*);
int __fenix_member_restore_from_rank(int, int, void *, int, int, int);
int __fenix_get_number_of_members(int, int *);
int __fenix_get_member_at_position(int, int *, int);
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_f.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
!// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
!//
!// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
!// and Michael Heroux
!// Michael Heroux, and Matthew Whitlock
!//
!// Questions? Contact Keita Teranishi ([email protected]) and
!// Marc Gamell ([email protected])
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_opt.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// and Michael Heroux
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_process_recovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, and Michael Heroux
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
2 changes: 1 addition & 1 deletion include/fenix_process_recovery_global.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, and Michael Heroux
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
4 changes: 2 additions & 2 deletions include/fenix_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// and Michael Heroux
// Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down Expand Up @@ -75,7 +75,7 @@
#include <signal.h>
#include <libgen.h>

char *logname;
extern char *logname;

#define LDEBUG(f...) {LLIND("debug",f);}
#define LLIND(t,f...) {fprintf(stderr,"%s - %s (%i): %s: \n",logname,__PRETTY_FUNCTION__,getpid(),t); fprintf(stderr,f);}
Expand Down
4 changes: 4 additions & 0 deletions src/fenix.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,10 @@ int Fenix_Data_member_restore(int group_id, int member_id, void *target_buffer,
return __fenix_member_restore(group_id, member_id, target_buffer, max_count, time_stamp, data_found);
}

// Public entry point for the "lrestore" variant of member restore (the commit
// message describes it as local-data-only restoring).
// Passes all six arguments, unchanged and in the same order, to
// __fenix_member_lrestore and returns that call's result.
int Fenix_Data_member_lrestore(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, Fenix_Data_subset* data_found) {
return __fenix_member_lrestore(group_id, member_id, target_buffer, max_count, time_stamp, data_found);
}

// Stub: ignores every argument and always returns 0.
// NOTE(review): the name is spelled "resore", while the declaration shown for
// include/fenix.h is Fenix_Data_member_restore_from_rank with a different
// parameter order (member_id, data, max_count, time_stamp, group_id,
// source_rank). Confirm whether this is meant to be that function's definition.
int Fenix_Data_member_resore_from_rank(int group_id, int member_id, void *target_buffer, int max_count, int time_stamp, int source_rank) {
return 0;
}
Expand Down
2 changes: 1 addition & 1 deletion src/fenix_callbacks.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, and Michael Heroux
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
2 changes: 1 addition & 1 deletion src/fenix_comm_list.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// Rob Van der Wijngaart, and Michael Heroux
// Rob Van der Wijngaart, Michael Heroux, and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
3 changes: 2 additions & 1 deletion src/fenix_data_policy.c
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author Matthew Whitlock
// Authors Marc Gamell, Eric Valenzuela, Keita Teranishi, Manish Parashar,
// and Matthew Whitlock
//
// Questions? Contact Keita Teranishi ([email protected]) and
// Marc Gamell ([email protected])
Expand Down
Loading

0 comments on commit 0a5b8f6

Please sign in to comment.