diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h
index 2d973e456..5f031cb5f 100644
--- a/llmc/cuda_common.h
+++ b/llmc/cuda_common.h
@@ -118,6 +118,9 @@ class NvtxRange {
 };
 #define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__)
 
+// ----------------------------------------------------------------------------
+// Utilities to Read & Write between CUDA memory <-> files
+
 // copy num_bytes from device pointer src into file dest, using double buffering running on the given stream.
 inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) {
     // allocate pinned buffer for faster, async transfer
diff --git a/train_gpt2.cu b/train_gpt2.cu
index 4460e7403..574c95c9f 100644
--- a/train_gpt2.cu
+++ b/train_gpt2.cu
@@ -387,7 +387,7 @@ void gpt2_write_to_checkpoint(GPT2 *model, const char* checkpoint_path) {
     fwriteCheck(model_header, sizeof(int), 256, model_file);
     // write the parameters
     device_to_file(model_file, model->params_memory, model->num_parameters_bytes,
-                   1024*1024*32, main_stream);
+                   IO_BUF_SIZE, main_stream);
     // close file, we're done
     fcloseCheck(model_file);
 }
@@ -1224,19 +1224,16 @@ void load_state(int* step, GPT2* model, DataLoader* loader, const char* filename
         printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
         cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float)));
     }
-
     if(use_master_weights == 1 && !model->use_master_weights) {
         printf0("Warning: Master weights are present in state, but not enabled for current run.");
     } else if (use_master_weights == 0 && model->use_master_weights) {
         printf0("Error: Master weights requested, but not present in state file.");
         exit(EXIT_FAILURE);
     }
-
     if (model->master_weights == NULL && use_master_weights == 1) {
         printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
         cudaCheck(cudaMalloc((void**)&model->master_weights, shard_num_parameters * sizeof(float)));
     }
-
     file_to_device(model->m_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
     file_to_device(model->v_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream);
     if(model->use_master_weights) {
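
For context, below is a minimal sketch of the double-buffering scheme that the device_to_file comment above refers to: a pinned host buffer is split into two halves, and while one half is being written to the file, the next chunk is copied asynchronously from the device into the other half on the given stream. This is an illustrative sketch only, not the repo's actual device_to_file body; the name device_to_file_sketch is hypothetical, and error checking (e.g. the repo's cudaCheck/fwriteCheck wrappers) is omitted for brevity.

#include <stdio.h>
#include <cuda_runtime.h>

// sketch: copy num_bytes from device pointer src into file dest with double buffering
static inline void device_to_file_sketch(FILE* dest, void* src, size_t num_bytes,
                                         size_t buffer_size, cudaStream_t stream) {
    // pinned memory lets the async device->host copy overlap with the fwrite below
    char* buffer = NULL;
    cudaMallocHost((void**)&buffer, 2 * buffer_size);
    char* read_buf  = buffer;                 // half currently being filled from the device
    char* write_buf = buffer + buffer_size;   // half currently being written to the file

    char* src_ptr = (char*)src;
    size_t remaining = num_bytes;

    // prime the pipeline with the first chunk
    size_t copy_amount = remaining < buffer_size ? remaining : buffer_size;
    cudaMemcpyAsync(read_buf, src_ptr, copy_amount, cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);
    src_ptr += copy_amount;
    remaining -= copy_amount;

    while (copy_amount > 0) {
        // swap roles: the freshly copied half becomes the write half
        char* tmp = read_buf; read_buf = write_buf; write_buf = tmp;
        size_t write_amount = copy_amount;

        // kick off the next device->host copy before writing the previous chunk
        copy_amount = remaining < buffer_size ? remaining : buffer_size;
        if (copy_amount > 0) {
            cudaMemcpyAsync(read_buf, src_ptr, copy_amount, cudaMemcpyDeviceToHost, stream);
            src_ptr += copy_amount;
            remaining -= copy_amount;
        }

        // overlap: the fwrite of the old chunk runs while the async copy proceeds
        fwrite(write_buf, 1, write_amount, dest);
        cudaStreamSynchronize(stream);
    }
    cudaFreeHost(buffer);
}

The buffer_size argument is what the train_gpt2.cu change above standardizes on IO_BUF_SIZE instead of a hard-coded 1024*1024*32; with double buffering, the total pinned allocation is twice that value.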