From 5c90845f7a3e9e81d480edbc7db4fd58278d19fb Mon Sep 17 00:00:00 2001
From: lancer
Date: Thu, 9 May 2024 22:04:02 -0700
Subject: [PATCH] update the -lcublas -lcublasLt flag in the comment

---
 dev/cuda/adamw.cu                         | 4 ++--
 dev/cuda/attention_backward.cu            | 2 +-
 dev/cuda/attention_forward.cu             | 4 ++--
 dev/cuda/crossentropy_forward.cu          | 2 +-
 dev/cuda/crossentropy_softmax_backward.cu | 2 +-
 dev/cuda/encoder_backward.cu              | 2 +-
 dev/cuda/encoder_forward.cu               | 2 +-
 dev/cuda/fused_residual_forward.cu        | 2 +-
 dev/cuda/gelu_backward.cu                 | 2 +-
 dev/cuda/gelu_forward.cu                  | 2 +-
 dev/cuda/layernorm_backward.cu            | 2 +-
 dev/cuda/layernorm_forward.cu             | 2 +-
 dev/cuda/matmul_backward.cu               | 2 +-
 dev/cuda/matmul_backward_bias.cu          | 2 +-
 dev/cuda/nccl_all_reduce.cu               | 2 +-
 dev/cuda/residual_forward.cu              | 2 +-
 dev/cuda/softmax_forward.cu               | 2 +-
 dev/cuda/trimat_forward.cu                | 2 +-
 18 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/dev/cuda/adamw.cu b/dev/cuda/adamw.cu
index 23770b2c3..20a6560dd 100644
--- a/dev/cuda/adamw.cu
+++ b/dev/cuda/adamw.cu
@@ -6,8 +6,8 @@ References:
 * https://github.com/nvidia/apex/blob/master/csrc/multi_tensor_adam.cu

 Compile example:
-nvcc adamw.cu -o adamw
-nvcc -O3 --use_fast_math adamw.cu -o adamw
+nvcc -lcublas -lcublasLt adamw.cu -o adamw
+nvcc -O3 --use_fast_math -lcublas -lcublasLt adamw.cu -o adamw

 ./adamw

diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu
index 8e673d79f..c97dbeee8 100644
--- a/dev/cuda/attention_backward.cu
+++ b/dev/cuda/attention_backward.cu
@@ -2,7 +2,7 @@
 Kernels for attention backward pass.

 Compile example:
-nvcc -O3 --use_fast_math attention_backward.cu -o attention_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_backward.cu -o attention_backward

 version 1 is a naive first version
 OMP_NUM_THREADS=32 ./attention_backward 1
diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu
index a7b6fff34..b632b4a66 100644
--- a/dev/cuda/attention_forward.cu
+++ b/dev/cuda/attention_forward.cu
@@ -6,10 +6,10 @@ If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels
 See the README for cuDNN install instructions

 Compile example with cuDNN:
-nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward
+nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcublasLt -lcudnn attention_forward.cu -o attention_forward

 Compile example without cuDNN:
-nvcc -O3 --use_fast_math -lcublas attention_forward.cu -o attention_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_forward.cu -o attention_forward

 version 1 is naive port from CPU code to kernel, parallelize over batch, time, heads only
 ./attention_forward 1
diff --git a/dev/cuda/crossentropy_forward.cu b/dev/cuda/crossentropy_forward.cu
index 2385a6c4f..ca312ba36 100644
--- a/dev/cuda/crossentropy_forward.cu
+++ b/dev/cuda/crossentropy_forward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy forward pass.

 Compile example:
-nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward

 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_forward 1
diff --git a/dev/cuda/crossentropy_softmax_backward.cu b/dev/cuda/crossentropy_softmax_backward.cu
index 164bceddf..27521bf60 100644
--- a/dev/cuda/crossentropy_softmax_backward.cu
+++ b/dev/cuda/crossentropy_softmax_backward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy forward pass.

 Compile example:
-nvcc -O3 --use_fast_math crossentropy_softmax_backward.cu -o crossentropy_softmax_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward

 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_softmax_backward 1
diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu
index 8c96eaf46..53221878e 100644
--- a/dev/cuda/encoder_backward.cu
+++ b/dev/cuda/encoder_backward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder forward pass in GPT-2.

 Compile example:
-nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_backward.cu -o encoder_backward

 version 1 is naive port from CPU code to kernel
 parallelizes over B,T,C, uses atomics to add to dwte, dwpe
diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu
index e901fd654..39d5f0fa3 100644
--- a/dev/cuda/encoder_forward.cu
+++ b/dev/cuda/encoder_forward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder forward pass in GPT-2.

 Compile example:
-nvcc -O3 --use_fast_math encoder_forward.cu -o encoder_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_forward.cu -o encoder_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./encoder_forward 1
diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu
index f228503af..b98a67c4b 100644
--- a/dev/cuda/fused_residual_forward.cu
+++ b/dev/cuda/fused_residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass fused with layernorm

 Compile example:
-nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt fused_residual_forward.cu -o fused_residual_forward

 version 1 is naive port from CPU code to kernel
 ./fused_residual_forward 1
diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu
index bbd81c4bc..3d12dd864 100644
--- a/dev/cuda/gelu_backward.cu
+++ b/dev/cuda/gelu_backward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu backward pass.

 Compile example:
-nvcc -O3 --use_fast_math gelu_backward.cu -o gelu_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_backward.cu -o gelu_backward

 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:

diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu
index e07ad663a..01abfe2b5 100644
--- a/dev/cuda/gelu_forward.cu
+++ b/dev/cuda/gelu_forward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu forward pass.

 Compile example:
-nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward

 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:

diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu
index 904a57e0c..575e0a962 100644
--- a/dev/cuda/layernorm_backward.cu
+++ b/dev/cuda/layernorm_backward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm backward pass.

 Compile example:
-nvcc -O3 --use_fast_math layernorm_backward.cu -o layernorm_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_backward.cu -o layernorm_backward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_backward 1
diff --git a/dev/cuda/layernorm_forward.cu b/dev/cuda/layernorm_forward.cu
index 5cefd408e..3e948289a 100644
--- a/dev/cuda/layernorm_forward.cu
+++ b/dev/cuda/layernorm_forward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm forward pass.

 Compile example:
-nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_forward 1
diff --git a/dev/cuda/matmul_backward.cu b/dev/cuda/matmul_backward.cu
index 9d3763930..dece1f6dc 100644
--- a/dev/cuda/matmul_backward.cu
+++ b/dev/cuda/matmul_backward.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass.

 Compile example:
-nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward

 OMP_NUM_THREADS=32 ./matmul_backward 1
 */
diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu
index 7aef54547..65b331699 100644
--- a/dev/cuda/matmul_backward_bias.cu
+++ b/dev/cuda/matmul_backward_bias.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass bias only.

 Compile example:
-nvcc -O3 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias
+nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias

 ./matmul_backward_bias 1
 ./matmul_backward_bias 2
diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu
index 3bc9564f1..260ba02ba 100644
--- a/dev/cuda/nccl_all_reduce.cu
+++ b/dev/cuda/nccl_all_reduce.cu
@@ -5,7 +5,7 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc.
 Then aggregates the values in the resulting vectors.

 Compile example:
-nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ nccl_all_reduce.cu -o nccl_all_reduce
+nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce

 Run on 2 local GPUs (set -np to a different value to change GPU count):
 mpirun -np 2 ./nccl_all_reduce
diff --git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu
index f07871a29..fd7d1fb8e 100644
--- a/dev/cuda/residual_forward.cu
+++ b/dev/cuda/residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass.

 Compile example:
-nvcc -O3 --use_fast_math residual_forward.cu -o residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward

 version 1 is naive port from CPU code to kernel
 ./residual_forward 1
diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu
index d0d38850d..279549b28 100644
--- a/dev/cuda/softmax_forward.cu
+++ b/dev/cuda/softmax_forward.cu
@@ -2,7 +2,7 @@
 Kernels for softmax forward pass.

 Compile example:
-nvcc -O3 --use_fast_math softmax_forward.cu -o softmax_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt softmax_forward.cu -o softmax_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./softmax_forward 1
diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu
index 133ced16f..1c093e2a1 100644
--- a/dev/cuda/trimat_forward.cu
+++ b/dev/cuda/trimat_forward.cu
@@ -3,7 +3,7 @@ Triangular matrix multiplication as in autoregressive attention. A short story.
 by @ngc92

 Compile:
-nvcc -O3 --use_fast_math trimat_forward.cu -o trimat_forward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt trimat_forward.cu -o trimat_forward

 Run:
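
Note: a quick way to verify that a compile command links both libraries is a
minimal probe program that calls one entry point from each; the link step then
fails with undefined references unless both -lcublas and -lcublasLt are on the
nvcc line. A sketch (the file name check_cublas_link.cu is hypothetical and
not part of this patch):

// check_cublas_link.cu -- references one symbol from libcublas and one
// from libcublasLt, so it only links with -lcublas -lcublasLt present.
#include <cstdio>
#include <cublas_v2.h>
#include <cublasLt.h>

int main() {
    cublasHandle_t handle;   // provided by libcublas
    cublasLtHandle_t lt;     // provided by libcublasLt
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        printf("cublasCreate failed\n");
        return 1;
    }
    if (cublasLtCreate(&lt) != CUBLAS_STATUS_SUCCESS) {
        printf("cublasLtCreate failed\n");
        cublasDestroy(handle);
        return 1;
    }
    printf("cuBLAS and cuBLASLt linked OK\n");
    cublasLtDestroy(lt);
    cublasDestroy(handle);
    return 0;
}

Compile and run:
nvcc -lcublas -lcublasLt check_cublas_link.cu -o check_cublas_link
./check_cublas_link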