diff --git a/profile_gpt2.cu b/profile_gpt2.cu
index 62f93541d..f53de88cc 100644
--- a/profile_gpt2.cu
+++ b/profile_gpt2.cu
@@ -61,7 +61,7 @@ int main(int argc, char *argv[]) {
     // do a training step
     gpt2_forward(&model, x, y, B, T);
     gpt2_zero_grad(&model);
-    gpt2_backward(&model, x, true);
+    gpt2_backward_and_reduce(&model, x, true);
     gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config);
     cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings
 
diff --git a/test_gpt2.cu b/test_gpt2.cu
index 45964d5b6..6b78a0050 100644
--- a/test_gpt2.cu
+++ b/test_gpt2.cu
@@ -218,7 +218,7 @@ int main(int argc, char *argv[]) {
         clock_gettime(CLOCK_MONOTONIC, &start);
         gpt2_forward(&model, x, y, B, T);
         gpt2_zero_grad(&model);
-        gpt2_backward(&model, x, true);
+        gpt2_backward_and_reduce(&model, x, true);
         clock_gettime(CLOCK_MONOTONIC, &end);
         double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
 
@@ -334,7 +334,7 @@ int main(int argc, char *argv[]) {
         dataloader_next_batch(&loader);
         gpt2_forward(&model, loader.inputs, loader.targets, B, T);
         gpt2_zero_grad(&model);
-        gpt2_backward(&model, loader.inputs, true);
+        gpt2_backward_and_reduce(&model, loader.inputs, true);
         gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config);
         losses[step] = model.mean_loss;
         tokens[step] = loader.inputs[0];
@@ -349,7 +349,7 @@ int main(int argc, char *argv[]) {
         dataloader_next_batch(&loader);
         gpt2_forward(&model, loader.inputs, loader.targets, B, T);
         gpt2_zero_grad(&model);
-        gpt2_backward(&model, loader.inputs, true);
+        gpt2_backward_and_reduce(&model, loader.inputs, true);
         gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config);
 
         if(loader.inputs[0] != tokens[step]) {
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 43f561723..301cbea82 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -723,7 +723,7 @@ void gpt2_zero_grad(GPT2 *model) {
     cudaCheck(cudaDeviceSynchronize());
 }
 
-void gpt2_backward(GPT2 *model, int* inputs, bool last_step) {
+void gpt2_backward_and_reduce(GPT2 *model, int* inputs, bool last_step) {
     NVTX_RANGE_FN();
     // double check we forwarded previously, with targets
     if (model->mean_loss == -1.0f) {
@@ -1722,7 +1722,7 @@ int main(int argc, char *argv[]) {
             gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps);
             lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward
             // backward pass. all model params accumulate gradients with += inside this inner loop
-            gpt2_backward(&model, train_loader.inputs, micro_step == grad_accum_steps - 1);
+            gpt2_backward_and_reduce(&model, train_loader.inputs, micro_step == grad_accum_steps - 1);
         }
         // override the mean loss, accounting for the gradient accumulation loop
         // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced