From 70f85903f4f386d414498090e5e0892752b1e67d Mon Sep 17 00:00:00 2001
From: Chinthaka Gamanayakege <chinthaka.gamanayakege@ahrefs.com>
Date: Thu, 13 Jun 2024 10:16:29 +0000
Subject: [PATCH] fix out-of-bounds issue when n_proc is large

---
 llmc/dataloader.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/llmc/dataloader.h b/llmc/dataloader.h
index 73073872e..30e80f603 100644
--- a/llmc/dataloader.h
+++ b/llmc/dataloader.h
@@ -204,8 +204,9 @@ Copy pasting the section on the eval datafile format, from data_common.py:
 
 // for now, could relax later
 #define ASSUMED_NUM_COMPLETIONS 4
-// helper macro for ceildiv
+// helper macros for ceildiv and floordiv
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+#define FLOOR_DIV(M, N) ((M) / (N))
 
 typedef struct {
     // variables related to distributed training
@@ -236,16 +237,17 @@ void evalloader_reset(EvalLoader *loader) {
     // For example if there are N examples in the file and 4 processes,
     // then process 0 should start at 0, process 1 at N/4, process 2 at N/2, etc.
     // determine how much work there is for all processes
-    int examples_per_process = CEIL_DIV(loader->num_examples, loader->num_processes);
-    int can_fit_examples = loader->B / ASSUMED_NUM_COMPLETIONS;
-    loader->num_batches = CEIL_DIV(examples_per_process, can_fit_examples);
+    int examples_per_process = FLOOR_DIV(loader->num_examples, loader->num_processes);
     // determine the start and end example indices for this process
     loader->start_example_index = examples_per_process * loader->process_rank;
     loader->end_example_index = examples_per_process * (loader->process_rank + 1);
-    // crop the end example index to the total number of examples
-    if (loader->end_example_index > loader->num_examples) {
+    // the last process absorbs the remainder: extend its end example index to the total number of examples
+    if (loader->process_rank == loader->num_processes - 1) {
         loader->end_example_index = loader->num_examples;
+        examples_per_process = loader->end_example_index - loader->start_example_index;
     }
+    int can_fit_examples = loader->B / ASSUMED_NUM_COMPLETIONS;
+    loader->num_batches = CEIL_DIV(examples_per_process, can_fit_examples);
     // now seek through the file to the start of that example
     // utilize <EXAMPLE_BYTES> for efficiency
     int64_t header_bytes = HEADER_SIZE * sizeof(int);