feat: parse Cuda compute cap from env (#1066)
* feat: add support for multiple compute caps

* Revert to one compute cap

* fmt

* fix
OlivierDehaene authored Oct 16, 2023
1 parent 0106b0b commit 7562998
Showing 4 changed files with 165 additions and 125 deletions.
3 changes: 3 additions & 0 deletions candle-book/src/guide/installation.md
@@ -12,6 +12,9 @@ compute_cap
8.9
```

You can also compile the Cuda kernels for a specific compute cap using the
`CUDA_COMPUTE_CAP=<compute cap>` environment variable.
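As a rough illustration (the value here is just an example, not part of this change), pinning the compute cap at build time could look like:

```bash
# Illustrative only: compile kernels for compute cap 7.5 (e.g. a T4)
# instead of querying nvidia-smi; assumes a crate with candle's CUDA support enabled.
CUDA_COMPUTE_CAP=75 cargo build --release
```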

If any of the above commands errors out, please make sure to update your Cuda version.

2. Create a new app and add [`candle-core`](https://github.com/huggingface/candle/tree/main/candle-core) with Cuda support.
88 changes: 52 additions & 36 deletions candle-flash-attn/build.rs
@@ -84,12 +84,19 @@ fn main() -> Result<()> {
(kernel_dir.join(f), obj_file)
})
.collect();
let out_modified: Result<_, _> = out_file.metadata().and_then(|m| m.modified());
let should_compile = if out_file.exists() {
cu_files.iter().any(|(cu_file, _)| {
let out_modified = out_file.metadata().unwrap().modified().unwrap();
let in_modified = cu_file.metadata().unwrap().modified().unwrap();
in_modified.duration_since(out_modified).is_ok()
})
kernel_dir
.read_dir()
.expect("kernels folder should exist")
.any(|entry| {
if let (Ok(entry), Ok(out_modified)) = (entry, &out_modified) {
let in_modified = entry.metadata().unwrap().modified().unwrap();
in_modified.duration_since(*out_modified).is_ok()
} else {
true
}
})
} else {
true
};
@@ -100,12 +107,19 @@ fn main() -> Result<()> {
let mut command = std::process::Command::new("nvcc");
command
.arg("-std=c++17")
.arg("-O3")
.arg("-U__CUDA_NO_HALF_OPERATORS__")
.arg("-U__CUDA_NO_HALF_CONVERSIONS__")
.arg("-U__CUDA_NO_HALF2_OPERATORS__")
.arg("-U__CUDA_NO_BFLOAT16_CONVERSIONS__")
.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("-c")
.args(["-o", obj_file.to_str().unwrap()])
.args(["--default-stream", "per-thread"])
.arg("-Icutlass/include")
.arg("--expt-relaxed-constexpr")
.arg("--expt-extended-lambda")
.arg("--use_fast_math")
.arg("--verbose");
if let Ok(ccbin_path) = &ccbin_env {
command
@@ -203,13 +217,21 @@ fn set_cuda_include_dir() -> Result<()> {

#[allow(unused)]
fn compute_cap() -> Result<usize> {
// Grab compute code from nvidia-smi
let mut compute_cap = {
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");

// Try to parse compute caps from env
let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
compute_cap_str
.parse::<usize>()
.context("Could not parse compute cap")?
} else {
// Use nvidia-smi to get the current compute cap
let out = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
assert_eq!(
@@ -220,16 +242,19 @@ fn compute_cap() -> Result<usize> {
.next()
.context("missing line in stdout")?
.replace('.', "");
cap.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?
let cap = cap
.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?;
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
cap
};

// Grab available GPU codes from nvcc and select the highest one
let max_nvcc_code = {
let (supported_nvcc_codes, max_nvcc_code) = {
let out = std::process::Command::new("nvcc")
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();

let out = out.lines().collect::<Vec<&str>>();
@@ -243,30 +268,21 @@ fn compute_cap() -> Result<usize> {
}
}
codes.sort();
if !codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}."
);
}
*codes.last().unwrap()
let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
(codes, max_nvcc_code)
};

// If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
// then choose the highest gpu code in nvcc
// Check that nvcc supports the asked compute caps
if !supported_nvcc_codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
);
}
if compute_cap > max_nvcc_code {
println!(
"cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
anyhow::bail!(
"CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
);
compute_cap = max_nvcc_code;
}

println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
compute_cap = compute_cap_str
.parse::<usize>()
.with_context(|| format!("cannot parse as usize '{compute_cap_str}'"))?;
println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
}
println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");
Ok(compute_cap)
}
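Both build scripts also export the selected value through `cargo:rustc-env=CUDA_COMPUTE_CAP=...`. As a minimal sketch (not taken from this diff), a crate compiled with such a build script could read that value at compile time via the `env!` macro:

```rust
// Minimal sketch: `cargo:rustc-env=CUDA_COMPUTE_CAP=<value>` in build.rs makes the
// variable visible to rustc, so it can be read at compile time with `env!`.
const CUDA_COMPUTE_CAP: &str = env!("CUDA_COMPUTE_CAP");

fn main() {
    println!("kernels were compiled for compute cap {CUDA_COMPUTE_CAP}");
}
```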
3 changes: 2 additions & 1 deletion candle-kernels/Cargo.toml
@@ -12,5 +12,6 @@ license = "MIT OR Apache-2.0"
[dependencies]

[build-dependencies]
anyhow = { version = "1", features = ["backtrace"] }
glob = "0.3.1"
rayon = "1.7.0"
rayon = "1.7.0"
196 changes: 108 additions & 88 deletions candle-kernels/build.rs
@@ -1,4 +1,5 @@
use std::io::Write;

fn main() {
println!("cargo:rerun-if-changed=build.rs");

@@ -23,6 +24,8 @@ fn main() {
}

mod cuda {
use anyhow::{Context, Result};

pub fn set_include_dir() {
use std::path::PathBuf;
// NOTE: copied from cudarc build.rs.
@@ -100,34 +103,112 @@ mod cuda {
include_directories.sort();
include_directories.dedup();

let compute_cap = compute_cap().expect("Could not get Cuda compute cap");

#[allow(unused)]
let include_options: Vec<String> = include_directories
.into_iter()
.map(|s| "-I".to_string() + &s.into_os_string().into_string().unwrap())
.collect::<Vec<_>>();

// let start = std::time::Instant::now();
let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
let children = kernel_paths
.par_iter()
.flat_map(|p| {
let mut output = p.clone();
output.set_extension("ptx");
let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());

let ignore = if output_filename.exists() {
let out_modified = output_filename.metadata().unwrap().modified().unwrap();
let in_modified = p.metadata().unwrap().modified().unwrap();
out_modified.duration_since(in_modified).is_ok()
} else {
false
};
if ignore {
None
} else {
let mut command = std::process::Command::new("nvcc");
command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("--ptx")
.args(["--default-stream", "per-thread"])
.args(["--output-directory", &out_dir])
// Flash attention only
// .arg("--expt-relaxed-constexpr")
.args(&include_options);
if let Ok(ccbin_path) = &ccbin_env {
command
.arg("-allow-unsupported-compiler")
.args(["-ccbin", ccbin_path]);
}
command.arg(p);
Some((p, command.spawn()
.expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
}
})
.collect::<Vec<_>>();

let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
.unwrap()
.map(|p| p.unwrap())
.collect();
// We should rewrite `src/lib.rs` only if there are some newly compiled kernels, or removed
// some old ones
let write = !children.is_empty() || kernel_paths.len() < ptx_paths.len();
for (kernel_path, child) in children {
let output = child.expect("nvcc failed to run. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
assert!(
output.status.success(),
"nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
);
}
(write, kernel_paths)
}

#[allow(unused)]
fn compute_cap() -> Result<usize> {
println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");

// Grab compute code from nvidia-smi
let mut compute_cap = {
// Try to parse compute caps from env
let mut compute_cap = if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={compute_cap_str}");
compute_cap_str
.parse::<usize>()
.context("Could not parse code")?
} else {
// Use nvidia-smi to get the current compute cap
let out = std::process::Command::new("nvidia-smi")
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.expect("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();
.arg("--query-gpu=compute_cap")
.arg("--format=csv")
.output()
.context("`nvidia-smi` failed. Ensure that you have CUDA installed and that `nvidia-smi` is in your PATH.")?;
let out = std::str::from_utf8(&out.stdout).context("stdout is not a utf8 string")?;
let mut lines = out.lines();
assert_eq!(lines.next().unwrap(), "compute_cap");
let cap = lines.next().unwrap().replace('.', "");
cap.parse::<usize>().unwrap()
assert_eq!(
lines.next().context("missing line in stdout")?,
"compute_cap"
);
let cap = lines
.next()
.context("missing line in stdout")?
.replace('.', "");
let cap = cap
.parse::<usize>()
.with_context(|| format!("cannot parse as int {cap}"))?;
println!("cargo:rustc-env=CUDA_COMPUTE_CAP={cap}");
cap
};

// Grab available GPU codes from nvcc and select the highest one
let max_nvcc_code = {
let (supported_nvcc_codes, max_nvcc_code) = {
let out = std::process::Command::new("nvcc")
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
.arg("--list-gpu-code")
.output()
.expect("`nvcc` failed. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
let out = std::str::from_utf8(&out.stdout).unwrap();

let out = out.lines().collect::<Vec<&str>>();
@@ -141,83 +222,22 @@ mod cuda {
}
}
codes.sort();
if !codes.contains(&compute_cap) {
panic!("nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {codes:?}.");
}
*codes.last().unwrap()
let max_nvcc_code = *codes.last().context("no gpu codes parsed from nvcc")?;
(codes, max_nvcc_code)
};

// If nvidia-smi compute_cap is higher than the highest gpu code from nvcc,
// then choose the highest gpu code in nvcc
if compute_cap > max_nvcc_code {
println!(
"cargo:warning=Lowering gpu arch {compute_cap} to max nvcc target {max_nvcc_code}."
);
compute_cap = max_nvcc_code;
// Check that nvcc supports the asked compute caps
if !supported_nvcc_codes.contains(&compute_cap) {
anyhow::bail!(
"nvcc cannot target gpu arch {compute_cap}. Available nvcc targets are {supported_nvcc_codes:?}."
);
}

println!("cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP");
if let Ok(compute_cap_str) = std::env::var("CUDA_COMPUTE_CAP") {
compute_cap = compute_cap_str.parse::<usize>().unwrap();
println!("cargo:warning=Using gpu arch {compute_cap} from $CUDA_COMPUTE_CAP");
if compute_cap > max_nvcc_code {
anyhow::bail!(
"CUDA compute cap {compute_cap} is higher than the highest gpu code from nvcc {max_nvcc_code}"
);
}

println!("cargo:rustc-env=CUDA_COMPUTE_CAP=sm_{compute_cap}");

let ccbin_env = std::env::var("CANDLE_NVCC_CCBIN");
println!("cargo:rerun-if-env-changed=CANDLE_NVCC_CCBIN");
let children = kernel_paths
.par_iter()
.flat_map(|p| {
let mut output = p.clone();
output.set_extension("ptx");
let output_filename = std::path::Path::new(&out_dir).to_path_buf().join("out").with_file_name(output.file_name().unwrap());

let ignore = if output_filename.exists() {
let out_modified = output_filename.metadata().unwrap().modified().unwrap();
let in_modified = p.metadata().unwrap().modified().unwrap();
out_modified.duration_since(in_modified).is_ok()
}else{
false
};
if ignore{
None
}else{
let mut command = std::process::Command::new("nvcc");
command.arg(format!("--gpu-architecture=sm_{compute_cap}"))
.arg("--ptx")
.args(["--default-stream", "per-thread"])
.args(["--output-directory", &out_dir])
// Flash attention only
// .arg("--expt-relaxed-constexpr")
.args(&include_options);
if let Ok(ccbin_path) = &ccbin_env {
command
.arg("-allow-unsupported-compiler")
.args(["-ccbin", ccbin_path]);
}
command.arg(p);
Some((p, command.spawn()
.expect("nvcc failed to start. Ensure that you have CUDA installed and that `nvcc` is in your PATH.").wait_with_output()))
}})
.collect::<Vec<_>>();

let ptx_paths: Vec<PathBuf> = glob::glob(&format!("{out_dir}/**/*.ptx"))
.unwrap()
.map(|p| p.unwrap())
.collect();
// We should rewrite `src/lib.rs` only if there are some newly compiled kernels, or removed
// some old ones
let write = !children.is_empty() || kernel_paths.len() < ptx_paths.len();
for (kernel_path, child) in children {
let output = child.expect("nvcc failed to run. Ensure that you have CUDA installed and that `nvcc` is in your PATH.");
assert!(
output.status.success(),
"nvcc error while compiling {kernel_path:?}:\n\n# stdout\n{:#}\n\n# stderr\n{:#}",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
);
}
(write, kernel_paths)
Ok(compute_cap)
}
}
