slight docs tweaks to dev/cuda
karpathy committed Apr 27, 2024
1 parent a5d23e7 commit 12da2c1
Showing 2 changed files with 43 additions and 16 deletions.
dev/cuda/Makefile: 27 changes (14 additions & 13 deletions)
# Makefile for building dev/cuda kernels
# Collects all the make commands in one file but each file also
# has the compile and run commands in the header comments section.

# Find nvcc (NVIDIA CUDA compiler)
NVCC := $(shell which nvcc 2>/dev/null)
ifeq ($(NVCC),)
    $(error nvcc not found.)
endif

# Compiler flags
CFLAGS = -O3 --use_fast_math

MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/

# Default rule for our CUDA files
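# e.g. `make gelu_forward` runs: nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward -lcublas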
%: %.cu
    $(NVCC) $(CFLAGS) $< -o $@ -lcublas

# Build all targets
TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward

all: $(TARGETS)

# Individual targets: forward pass
attention_forward: attention_forward.cu
classifier_fused: classifier_fused.cu
crossentropy_forward: crossentropy_forward.cu
encoder_forward: encoder_forward.cu
gelu_forward: gelu_forward.cu
layernorm_forward: layernorm_forward.cu
residual_forward: residual_forward.cu
softmax_forward: softmax_forward.cu
trimat_forward: trimat_forward.cu
# matmul fwd/bwd also uses OpenMP (optionally) and cuBLASLt libs
matmul_forward: matmul_forward.cu
    $(NVCC) $(CFLAGS) -Xcompiler -fopenmp matmul_forward.cu -o matmul_forward -lcublas -lcublasLt

# Individual targets: backward pass
attention_backward: attention_backward.cu
crossentropy_softmax_backward: crossentropy_softmax_backward.cu
encoder_backward: encoder_backward.cu
matmul_backward: matmul_backward.cu
    $(NVCC) $(CFLAGS) -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas

# Update kernels
adamw: adamw.cu

# NCCL communication kernels
nccl_all_reduce: nccl_all_reduce.cu
    $(NVCC) -lmpi -lnccl $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce

# Run all targets
run_all: all
    @for target in $(TARGETS); do \
      echo "\n========================================"; \
      ./$$target; \
    done

# Clean up
clean:
    rm -f $(TARGETS)
dev/cuda/README.md: 32 changes (29 additions & 3 deletions)
# dev/cuda

This directory is scratch space for developing various versions of the needed CUDA kernels. Each file develops a kernel, and usually multiple versions of that kernel that differ in running time and in code or time complexity.

See the top of each file for how to compile and run the kernel. Alternatively, the commands are also all grouped in the `Makefile` in this directory for convenience.

For example, we can look at the top of `layernorm_forward.cu` to build the forward pass kernels for the LayerNorm:

```bash
nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward
```

or simply

```bash
make layernorm_forward
```

The comments at the top then document the different versions of the kernel that are available, usually in increasing complexity and decreasing running times. For example, following the comments at the top of the file, we can run the most naive kernel as:

```bash
./layernorm_forward 1
```

You'll see that this first runs the reference code on the CPU, then runs kernel 1 on the GPU and compares the results to check for correctness, and then times the kernel under a number of launch configurations (most often and most notably varying the block size).
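
Schematically, each of these harness files follows roughly the same pattern. Below is a minimal sketch of that structure, using a made-up trivial `scale` kernel in place of the real LayerNorm; the actual files contain more kernel versions, a command-line kernel selector, and proper CUDA error checking:

```cuda
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cuda_runtime.h>

// CPU reference implementation (stand-in for the real kernel's reference code)
void scale_cpu(const float* x, float* out, float s, int n) {
    for (int i = 0; i < n; i++) { out[i] = s * x[i]; }
}

// GPU kernel, version 1 (the naive one): one thread per element
__global__ void scale_kernel1(const float* x, float* out, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { out[i] = s * x[i]; }
}

int main() {
    int n = 1 << 20;
    size_t bytes = n * sizeof(float);
    float *x = (float*)malloc(bytes), *ref = (float*)malloc(bytes), *res = (float*)malloc(bytes);
    for (int i = 0; i < n; i++) { x[i] = (float)rand() / RAND_MAX; }

    // 1) run the reference on the CPU
    scale_cpu(x, ref, 2.0f, n);

    // 2) run the GPU kernel once and check correctness against the CPU
    float *d_x, *d_out;
    cudaMalloc(&d_x, bytes);
    cudaMalloc(&d_out, bytes);
    cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice);
    scale_kernel1<<<(n + 255) / 256, 256>>>(d_x, d_out, 2.0f, n);
    cudaMemcpy(res, d_out, bytes, cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++) {
        if (fabsf(res[i] - ref[i]) > 1e-5f) { printf("mismatch at %d\n", i); return 1; }
    }
    printf("results match the CPU reference\n");

    // 3) time the kernel across a sweep of block sizes (launch configurations)
    int block_sizes[] = {32, 64, 128, 256, 512, 1024};
    for (int j = 0; j < 6; j++) {
        int bs = block_sizes[j];
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start);
        scale_kernel1<<<(n + bs - 1) / bs, bs>>>(d_x, d_out, 2.0f, n);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        printf("block_size %4d | time %.4f ms\n", bs, ms);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    cudaFree(d_x); cudaFree(d_out);
    free(x); free(ref); free(res);
    return 0;
}
```

We can then run one of the faster kernels (kernel 4) instead: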

```bash
./layernorm_forward 4
```

You'll see that this matches all the CPU results but runs much, much faster. The typical process from here on is that we copy-paste the kernel that ran fastest, adjust it manually (e.g. to hardcode the best block size), and drop it into the training code file, e.g. `train_gpt2.cu`, as sketched below.
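
For instance, the pasted-in result might look like this (a hypothetical sketch with a simple residual kernel, not the actual `train_gpt2.cu` code):

```cuda
#include <cuda_runtime.h>

// Hypothetical example of a kernel after it graduates from dev/cuda:
// the launch configuration is hardcoded to the fastest value found in the sweep.
__global__ void residual_forward_kernel(float* out, const float* inp1, const float* inp2, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) { out[i] = inp1[i] + inp2[i]; }
}

void residual_forward(float* out, const float* inp1, const float* inp2, int n) {
    const int block_size = 256;  // hardcoded: best block size found with the dev/cuda harness
    const int grid_size = (n + block_size - 1) / block_size;
    residual_forward_kernel<<<grid_size, block_size>>>(out, inp1, inp2, n);
}
```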

To add a new version of a kernel, add the kernel to the corresponding file and adjust the docs. To add a new kernel, add the new file and adjust the Makefile. Run `make clean` to clean up binaries from your directory.
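
For the first case, the kernel files typically follow a dispatch pattern in which the integer command-line argument selects the version, so adding a version amounts to one more `__global__` kernel and one more `case`. A simplified sketch with illustrative GELU-style names (see any existing file for the real structure):

```cuda
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// version 1: naive elementwise GELU, one thread per element (tanh approximation)
__global__ void gelu_forward_kernel1(float* out, const float* inp, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float x = inp[i];
        float cube = 0.044715f * x * x * x;
        out[i] = 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + cube)));  // 0.79788... ~= sqrt(2/pi)
    }
}

// version 2: same math, but each thread processes two elements (just to illustrate a variant)
__global__ void gelu_forward_kernel2(float* out, const float* inp, int n) {
    int i = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    for (int k = i; k < i + 2 && k < n; k++) {
        float x = inp[k];
        float cube = 0.044715f * x * x * x;
        out[k] = 0.5f * x * (1.0f + tanhf(0.7978845608f * (x + cube)));
    }
}

// dispatcher: the integer from the command line picks the version to run
void gelu_forward(int kernel_num, float* out, const float* inp, int n, int block_size) {
    switch (kernel_num) {
        case 1:
            gelu_forward_kernel1<<<(n + block_size - 1) / block_size, block_size>>>(out, inp, n);
            break;
        case 2: {
            int n2 = (n + 1) / 2;  // half as many threads, two elements each
            gelu_forward_kernel2<<<(n2 + block_size - 1) / block_size, block_size>>>(out, inp, n);
            break;
        }
        default:
            printf("Invalid kernel number %d\n", kernel_num);
            exit(1);
    }
}
```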
