diff --git a/profile_gpt2.cu b/profile_gpt2.cu index 62f93541d..f53de88cc 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) { // do a training step gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model, x, true); + gpt2_backward_and_reduce(&model, x, true); gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings diff --git a/test_gpt2.cu b/test_gpt2.cu index 45964d5b6..6b78a0050 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -218,7 +218,7 @@ int main(int argc, char *argv[]) { clock_gettime(CLOCK_MONOTONIC, &start); gpt2_forward(&model, x, y, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model, x, true); + gpt2_backward_and_reduce(&model, x, true); clock_gettime(CLOCK_MONOTONIC, &end); double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; @@ -334,7 +334,7 @@ int main(int argc, char *argv[]) { dataloader_next_batch(&loader); gpt2_forward(&model, loader.inputs, loader.targets, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model, loader.inputs, true); + gpt2_backward_and_reduce(&model, loader.inputs, true); gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config); losses[step] = model.mean_loss; tokens[step] = loader.inputs[0]; @@ -349,7 +349,7 @@ int main(int argc, char *argv[]) { dataloader_next_batch(&loader); gpt2_forward(&model, loader.inputs, loader.targets, B, T); gpt2_zero_grad(&model); - gpt2_backward(&model, loader.inputs, true); + gpt2_backward_and_reduce(&model, loader.inputs, true); gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config); if(loader.inputs[0] != tokens[step]) { diff --git a/train_gpt2.cu b/train_gpt2.cu index 43f561723..301cbea82 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -723,7 +723,7 @@ void gpt2_zero_grad(GPT2 *model) { cudaCheck(cudaDeviceSynchronize()); } -void gpt2_backward(GPT2 *model, int* inputs, bool last_step) { +void gpt2_backward_and_reduce(GPT2 *model, int* inputs, bool last_step) { NVTX_RANGE_FN(); // double check we forwarded previously, with targets if (model->mean_loss == -1.0f) { @@ -1722,7 +1722,7 @@ int main(int argc, char *argv[]) { gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward // backward pass. all model params accumulate gradients with += inside this inner loop - gpt2_backward(&model, train_loader.inputs, micro_step == grad_accum_steps - 1); + gpt2_backward_and_reduce(&model, train_loader.inputs, micro_step == grad_accum_steps - 1); } // override the mean loss, accounting for the gradient accumulation loop // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced