Commit 1cfe899: Merge branch 'master' of github.com:karpathy/llm.c

karpathy committed May 11, 2024
2 parents fb86c12 + 5dbda7e
Showing 18 changed files with 20 additions and 20 deletions.
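
All 18 files receive the same one-line edit: the in-comment compile examples gain the cuBLAS and cuBLASLt link flags. A plausible motivation (an assumption; the code change that requires it is outside this diff) is that the shared dev/cuda helper code now calls into cuBLAS/cuBLASLt, so every standalone kernel build must link both libraries. The pattern, shown on a hypothetical kernel.cu:

# before
nvcc -O3 --use_fast_math kernel.cu -o kernel
# after: link cuBLAS and cuBLASLt explicitly
nvcc -O3 --use_fast_math -lcublas -lcublasLt kernel.cu -o kernel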
4 changes: 2 additions & 2 deletions dev/cuda/adamw.cu
@@ -6,8 +6,8 @@ References:
 * https://github.com/nvidia/apex/blob/master/csrc/multi_tensor_adam.cu
 Compile example:
-nvcc adamw.cu -o adamw
-nvcc -O3 --use_fast_math adamw.cu -o adamw
+nvcc -lcublas -lcublasLt adamw.cu -o adamw
+nvcc -O3 --use_fast_math -lcublas -lcublasLt adamw.cu -o adamw
 ./adamw
2 changes: 1 addition & 1 deletion dev/cuda/attention_backward.cu
@@ -2,7 +2,7 @@
 Kernels for attention backward pass.
 Compile example:
-nvcc -O3 --use_fast_math attention_backward.cu -o attention_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_backward.cu -o attention_backward
 version 1 is a naive first version
 OMP_NUM_THREADS=32 ./attention_backward 1
4 changes: 2 additions & 2 deletions dev/cuda/attention_forward.cu
@@ -6,10 +6,10 @@ If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels
 See the README for cuDNN install instructions
 Compile example with cuDNN:
-nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward
+nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcublasLt -lcudnn attention_forward.cu -o attention_forward
 Compile example without cuDNN:
-nvcc -O3 --use_fast_math -lcublas attention_forward.cu -o attention_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_forward.cu -o attention_forward
 version 1 is naive port from CPU code to kernel, parallelize over batch, time, heads only
 ./attention_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/crossentropy_forward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy forward pass.
 Compile example:
-nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward
 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/crossentropy_softmax_backward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy softmax backward pass.
 Compile example:
-nvcc -O3 --use_fast_math crossentropy_softmax_backward.cu -o crossentropy_softmax_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward
 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_softmax_backward 1
2 changes: 1 addition & 1 deletion dev/cuda/encoder_backward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder backward pass in GPT-2.
 Compile example:
-nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_backward.cu -o encoder_backward
 version 1 is naive port from CPU code to kernel
 parallelizes over B,T,C, uses atomics to add to dwte, dwpe
2 changes: 1 addition & 1 deletion dev/cuda/encoder_forward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder forward pass in GPT-2.
 Compile example:
-nvcc -O3 --use_fast_math encoder_forward.cu -o encoder_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_forward.cu -o encoder_forward
 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./encoder_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/fused_residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass fused with layernorm
 Compile example:
-nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt fused_residual_forward.cu -o fused_residual_forward
 version 1 is naive port from CPU code to kernel
 ./fused_residual_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/gelu_backward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu backward pass.
 Compile example:
-nvcc -O3 --use_fast_math gelu_backward.cu -o gelu_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_backward.cu -o gelu_backward
 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:
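
(The lines referred to above sit below the visible hunk; the usual fix for a missing M_PI definition, an assumption rather than part of this diff, is:)

#define _USE_MATH_DEFINES
#include <math.h>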
2 changes: 1 addition & 1 deletion dev/cuda/gelu_forward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu forward pass.
 Compile example:
-nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward
 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:
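
(As with gelu_backward.cu above, the referenced lines are below the visible hunk; presumably the same standard M_PI fix:)

#define _USE_MATH_DEFINES
#include <math.h>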
2 changes: 1 addition & 1 deletion dev/cuda/layernorm_backward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm backward pass.
 Compile example:
-nvcc -O3 --use_fast_math layernorm_backward.cu -o layernorm_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_backward.cu -o layernorm_backward
 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_backward 1
2 changes: 1 addition & 1 deletion dev/cuda/layernorm_forward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm forward pass.
 Compile example:
-nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward
 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/matmul_backward.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass.
 Compile example:
-nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward
 OMP_NUM_THREADS=32 ./matmul_backward 1
 */
2 changes: 1 addition & 1 deletion dev/cuda/matmul_backward_bias.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass bias only.
 Compile example:
-nvcc -O3 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias
+nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias
 ./matmul_backward_bias 1
 ./matmul_backward_bias 2
2 changes: 1 addition & 1 deletion dev/cuda/nccl_all_reduce.cu
@@ -5,7 +5,7 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc.
 Then aggregates the values in the resulting vectors.
 Compile example:
-nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ nccl_all_reduce.cu -o nccl_all_reduce
+nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce
 Run on 2 local GPUs (set -np to a different value to change GPU count):
 mpirun -np 2 ./nccl_all_reduce
2 changes: 1 addition & 1 deletion dev/cuda/residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass.
 Compile example:
-nvcc -O3 --use_fast_math residual_forward.cu -o residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward
 version 1 is naive port from CPU code to kernel
 ./residual_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/softmax_forward.cu
@@ -2,7 +2,7 @@
 Kernels for softmax forward pass.
 Compile example:
-nvcc -O3 --use_fast_math softmax_forward.cu -o softmax_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt softmax_forward.cu -o softmax_forward
 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./softmax_forward 1
2 changes: 1 addition & 1 deletion dev/cuda/trimat_forward.cu
@@ -3,7 +3,7 @@ Triangular matrix multiplication as in autoregressive attention. A short story.
 by @ngc92
 Compile:
-nvcc -O3 --use_fast_math trimat_forward.cu -o trimat_forward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt trimat_forward.cu -o trimat_forward
 Run:
