Skip to content

Commit

Permalink
fix barrier for no async write
Browse files: browse the repository at this point in the history
  • Loading branch information
chinthysl committed Jun 27, 2024
1 parent b063b9b commit 35d60fe
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions train_gpt2.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,16 @@ void write_checkpoint(const char* output_log_dir, int step, GPT2* model, DataLoa
// all ranks write their state file
snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, rank);
save_state(filename_buffer, step, model, train_loader, async_write);

if (async_write == 0) {
// DONE file is a signal that this checkpoint as a whole is complete
multi_gpu_barrier(multi_gpu_config);
if (rank == 0) {
snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step);
FILE* done_file = fopenCheck(filename_buffer, "w");
fcloseCheck(done_file);
}
}
}

void delete_checkpoint(const char* output_log_dir, int step, MultiGpuConfig* multi_gpu_config) {
Expand Down

0 comments on commit 35d60fe

Please sign in to comment.