From 5c90845f7a3e9e81d480edbc7db4fd58278d19fb Mon Sep 17 00:00:00 2001
From: lancer
Date: Thu, 9 May 2024 22:04:02 -0700
Subject: [PATCH] update the -lcublas -lcublasLt flag in the comment

---
 dev/cuda/adamw.cu                         | 4 ++--
 dev/cuda/attention_backward.cu            | 2 +-
 dev/cuda/attention_forward.cu             | 4 ++--
 dev/cuda/crossentropy_forward.cu          | 2 +-
 dev/cuda/crossentropy_softmax_backward.cu | 2 +-
 dev/cuda/encoder_backward.cu              | 2 +-
 dev/cuda/encoder_forward.cu               | 2 +-
 dev/cuda/fused_residual_forward.cu        | 2 +-
 dev/cuda/gelu_backward.cu                 | 2 +-
 dev/cuda/gelu_forward.cu                  | 2 +-
 dev/cuda/layernorm_backward.cu            | 2 +-
 dev/cuda/layernorm_forward.cu             | 2 +-
 dev/cuda/matmul_backward.cu               | 2 +-
 dev/cuda/matmul_backward_bias.cu          | 2 +-
 dev/cuda/nccl_all_reduce.cu               | 2 +-
 dev/cuda/residual_forward.cu              | 2 +-
 dev/cuda/softmax_forward.cu               | 2 +-
 dev/cuda/trimat_forward.cu                | 2 +-
 18 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/dev/cuda/adamw.cu b/dev/cuda/adamw.cu
index 23770b2c3..20a6560dd 100644
--- a/dev/cuda/adamw.cu
+++ b/dev/cuda/adamw.cu
@@ -6,8 +6,8 @@ References:
 * https://github.com/nvidia/apex/blob/master/csrc/multi_tensor_adam.cu

 Compile example:
-nvcc adamw.cu -o adamw
-nvcc -O3 --use_fast_math adamw.cu -o adamw
+nvcc -lcublas -lcublasLt adamw.cu -o adamw
+nvcc -O3 --use_fast_math -lcublas -lcublasLt adamw.cu -o adamw

 ./adamw

diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu
index 8e673d79f..c97dbeee8 100644
--- a/dev/cuda/attention_backward.cu
+++ b/dev/cuda/attention_backward.cu
@@ -2,7 +2,7 @@
 Kernels for attention backward pass.

 Compile example:
-nvcc -O3 --use_fast_math attention_backward.cu -o attention_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_backward.cu -o attention_backward

 version 1 is a naive first version
 OMP_NUM_THREADS=32 ./attention_backward 1
diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu
index a7b6fff34..b632b4a66 100644
--- a/dev/cuda/attention_forward.cu
+++ b/dev/cuda/attention_forward.cu
@@ -6,10 +6,10 @@ If you do not have CUDNN, you can remove ENABLE_CUDNN to run the other kernels
 See the README for cuDNN install instructions

 Compile example with cuDNN:
-nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcudnn attention_forward.cu -o attention_forward
+nvcc -I/PATH/TO/cudnn-frontend/include -DENABLE_CUDNN -O3 --use_fast_math -lcublas -lcublasLt -lcudnn attention_forward.cu -o attention_forward

 Compile example without cuDNN:
-nvcc -O3 --use_fast_math -lcublas attention_forward.cu -o attention_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt attention_forward.cu -o attention_forward

 version 1 is naive port from CPU code to kernel, parallelize over batch, time, heads only
 ./attention_forward 1
diff --git a/dev/cuda/crossentropy_forward.cu b/dev/cuda/crossentropy_forward.cu
index 2385a6c4f..ca312ba36 100644
--- a/dev/cuda/crossentropy_forward.cu
+++ b/dev/cuda/crossentropy_forward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy forward pass.

 Compile example:
-nvcc -O3 --use_fast_math crossentropy_forward.cu -o crossentropy_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_forward.cu -o crossentropy_forward

 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_forward 1
diff --git a/dev/cuda/crossentropy_softmax_backward.cu b/dev/cuda/crossentropy_softmax_backward.cu
index 164bceddf..27521bf60 100644
--- a/dev/cuda/crossentropy_softmax_backward.cu
+++ b/dev/cuda/crossentropy_softmax_backward.cu
@@ -2,7 +2,7 @@
 Kernels for crossentropy forward pass.

 Compile example:
-nvcc -O3 --use_fast_math crossentropy_softmax_backward.cu -o crossentropy_softmax_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt crossentropy_softmax_backward.cu -o crossentropy_softmax_backward

 version 1 is a straight-forward port from CPU code to kernel, parallel over B,T
 ./crossentropy_softmax_backward 1
diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu
index 8c96eaf46..53221878e 100644
--- a/dev/cuda/encoder_backward.cu
+++ b/dev/cuda/encoder_backward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder forward pass in GPT-2.

 Compile example:
-nvcc -O3 --use_fast_math encoder_backward.cu -o encoder_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_backward.cu -o encoder_backward

 version 1 is naive port from CPU code to kernel
 parallelizes over B,T,C, uses atomics to add to dwte, dwpe
diff --git a/dev/cuda/encoder_forward.cu b/dev/cuda/encoder_forward.cu
index e901fd654..39d5f0fa3 100644
--- a/dev/cuda/encoder_forward.cu
+++ b/dev/cuda/encoder_forward.cu
@@ -2,7 +2,7 @@
 Kernels for the positional encoder forward pass in GPT-2.

 Compile example:
-nvcc -O3 --use_fast_math encoder_forward.cu -o encoder_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt encoder_forward.cu -o encoder_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./encoder_forward 1
diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu
index f228503af..b98a67c4b 100644
--- a/dev/cuda/fused_residual_forward.cu
+++ b/dev/cuda/fused_residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass fused with layernorm

 Compile example:
-nvcc -O3 --use_fast_math fused_residual_forward.cu -o fused_residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt fused_residual_forward.cu -o fused_residual_forward

 version 1 is naive port from CPU code to kernel
 ./fused_residual_forward 1
diff --git a/dev/cuda/gelu_backward.cu b/dev/cuda/gelu_backward.cu
index bbd81c4bc..3d12dd864 100644
--- a/dev/cuda/gelu_backward.cu
+++ b/dev/cuda/gelu_backward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu backward pass.

 Compile example:
-nvcc -O3 --use_fast_math gelu_backward.cu -o gelu_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_backward.cu -o gelu_backward

 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:

diff --git a/dev/cuda/gelu_forward.cu b/dev/cuda/gelu_forward.cu
index e07ad663a..01abfe2b5 100644
--- a/dev/cuda/gelu_forward.cu
+++ b/dev/cuda/gelu_forward.cu
@@ -2,7 +2,7 @@
 Kernels for gelu forward pass.

 Compile example:
-nvcc -O3 --use_fast_math gelu_forward.cu -o gelu_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt gelu_forward.cu -o gelu_forward

 If encountering "error: identifier "M_PI" is undefined", add the following lines to the top of the file:

diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu
index 904a57e0c..575e0a962 100644
--- a/dev/cuda/layernorm_backward.cu
+++ b/dev/cuda/layernorm_backward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm backward pass.

 Compile example:
-nvcc -O3 --use_fast_math layernorm_backward.cu -o layernorm_backward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_backward.cu -o layernorm_backward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_backward 1
diff --git a/dev/cuda/layernorm_forward.cu b/dev/cuda/layernorm_forward.cu
index 5cefd408e..3e948289a 100644
--- a/dev/cuda/layernorm_forward.cu
+++ b/dev/cuda/layernorm_forward.cu
@@ -2,7 +2,7 @@
 Kernels for layernorm forward pass.

 Compile example:
-nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./layernorm_forward 1
diff --git a/dev/cuda/matmul_backward.cu b/dev/cuda/matmul_backward.cu
index 9d3763930..dece1f6dc 100644
--- a/dev/cuda/matmul_backward.cu
+++ b/dev/cuda/matmul_backward.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass.

 Compile example:
-nvcc -O3 --use_fast_math -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt -Xcompiler -fopenmp matmul_backward.cu -o matmul_backward

 OMP_NUM_THREADS=32 ./matmul_backward 1
 */
diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu
index 7aef54547..65b331699 100644
--- a/dev/cuda/matmul_backward_bias.cu
+++ b/dev/cuda/matmul_backward_bias.cu
@@ -2,7 +2,7 @@
 Kernels for matmul backward pass bias only.

 Compile example:
-nvcc -O3 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias
+nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias

 ./matmul_backward_bias 1
 ./matmul_backward_bias 2
diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu
index 3bc9564f1..260ba02ba 100644
--- a/dev/cuda/nccl_all_reduce.cu
+++ b/dev/cuda/nccl_all_reduce.cu
@@ -5,7 +5,7 @@ Fills a vector with 1s on the first GPU, 2s on the second, etc.
 Then aggregates the values in the resulting vectors.

 Compile example:
-nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ nccl_all_reduce.cu -o nccl_all_reduce
+nvcc -lmpi -lnccl -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -lcublas -lcublasLt nccl_all_reduce.cu -o nccl_all_reduce

 Run on 2 local GPUs (set -np to a different value to change GPU count):
 mpirun -np 2 ./nccl_all_reduce
diff --git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu
index f07871a29..fd7d1fb8e 100644
--- a/dev/cuda/residual_forward.cu
+++ b/dev/cuda/residual_forward.cu
@@ -2,7 +2,7 @@
 Kernels for residual forward pass.

 Compile example:
-nvcc -O3 --use_fast_math residual_forward.cu -o residual_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt residual_forward.cu -o residual_forward

 version 1 is naive port from CPU code to kernel
 ./residual_forward 1
diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu
index d0d38850d..279549b28 100644
--- a/dev/cuda/softmax_forward.cu
+++ b/dev/cuda/softmax_forward.cu
@@ -2,7 +2,7 @@
 Kernels for softmax forward pass.

 Compile example:
-nvcc -O3 --use_fast_math softmax_forward.cu -o softmax_forward
+nvcc -O3 --use_fast_math -lcublas -lcublasLt softmax_forward.cu -o softmax_forward

 version 1 is naive port from CPU code to kernel: parallelizes over B,T, loops over C
 ./softmax_forward 1
diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu
index 133ced16f..1c093e2a1 100644
--- a/dev/cuda/trimat_forward.cu
+++ b/dev/cuda/trimat_forward.cu
@@ -3,7 +3,7 @@ Triangular matrix multiplication as in autoregressive attention. A short story.
 by @ngc92

 Compile:
-nvcc -O3 --use_fast_math trimat_forward.cu -o trimat_forward -lcublas
+nvcc -O3 --use_fast_math -lcublas -lcublasLt trimat_forward.cu -o trimat_forward

 Run:
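
Note: a quick way to verify that a compile command links both libraries is a
minimal probe program that calls one entry point from each; the link step then
fails with undefined references unless both -lcublas and -lcublasLt are on the
nvcc line. A sketch (the file name check_cublas_link.cu is hypothetical and
not part of this patch):

// check_cublas_link.cu -- references one symbol from libcublas and one
// from libcublasLt, so it only links with -lcublas -lcublasLt present.
#include <cstdio>
#include <cublas_v2.h>
#include <cublasLt.h>

int main() {
    cublasHandle_t handle;   // provided by libcublas
    cublasLtHandle_t lt;     // provided by libcublasLt
    if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
        printf("cublasCreate failed\n");
        return 1;
    }
    if (cublasLtCreate(&lt) != CUBLAS_STATUS_SUCCESS) {
        printf("cublasLtCreate failed\n");
        cublasDestroy(handle);
        return 1;
    }
    printf("cuBLAS and cuBLASLt linked OK\n");
    cublasLtDestroy(lt);
    cublasDestroy(handle);
    return 0;
}

Compile and run:
nvcc -lcublas -lcublasLt check_cublas_link.cu -o check_cublas_link
./check_cublas_link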