diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 81458a995..52715eb9c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -49,8 +49,8 @@ jobs:
       - name: Execute testing program (No OpenMP)
         run: ./test_gpt2
 
-  build-with-cuda-fp32:
-    runs-on: ubuntu-latest  # Host OS, Docker will run on top of this
+  build-cuda-fp32:
+    runs-on: ubuntu-latest
     container:
       image: nvidia/cuda:12.4.1-devel-ubuntu22.04
 
@@ -64,8 +64,8 @@ jobs:
       - name: Build FP32 precision
         run: PRECISION=FP32 make train_gpt2cu test_gpt2cu profile_gpt2cu
 
-  build-with-cuda-bf16:
-    runs-on: ubuntu-latest  # Host OS, Docker will run on top of this
+  build-cuda-bf16:
+    runs-on: ubuntu-latest
     container:
       image: nvidia/cuda:12.4.1-devel-ubuntu22.04
 
@@ -76,8 +76,8 @@ jobs:
       - name: Build project
         run: PRECISION=BF16 make test_gpt2cu train_gpt2cu profile_gpt2cu
 
-  build-with-cuda-fp16:
-    runs-on: ubuntu-latest  # Host OS, Docker will run on top of this
+  build-cuda-fp16:
+    runs-on: ubuntu-latest
     container:
       image: nvidia/cuda:12.4.1-devel-ubuntu22.04
 
@@ -87,3 +87,18 @@ jobs:
 
       - name: Build project
         run: PRECISION=FP16 make test_gpt2cu train_gpt2cu profile_gpt2cu
+
+  build-cuda-kernels:  # Builds the default make target in dev/cuda
+    runs-on: ubuntu-latest
+    container:  # Steps below run inside this image rather than directly on the runner
+      image: nvidia/cuda:12.4.1-devel-ubuntu22.04
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Install OpenMP and OpenMPI  # NOTE(review): presumably needed by the dev/cuda build - confirm
+        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev
+
+      - name: Build project  # -j4: up to 4 parallel make jobs; -C: run make from dev/cuda
+        run: make -j4 -C dev/cuda