Merge remote-tracking branch 'upstream/main'

EricLBuehler · Oct 27, 2024 · 41324ef · 41324ef
2 parents 522531d + 0e2c8c1
commit 41324ef
Show file tree

Hide file tree

Showing 63 changed files with 5,969 additions and 289 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -46,7 +46,7 @@ criterion = { version = "0.5.1", default-features=false }
 cudarc = { version = "0.12.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 fancy-regex = "0.13.0"
 gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
-hf-hub = "0.3.0"
+hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
 half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
 float8 = { version = "0.1.0", features = ["num-traits", "rand_distr"] }
 hound = "3.5.1"
@@ -71,6 +71,9 @@ tokenizers = { version = "0.19.1", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
+ug = "0.0.2"
+ug-cuda = "0.0.2"
+ug-metal = "0.0.2"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "1.1.1", default-features = false }
 metal = { version = "0.27.0", features = ["mps"]}

diff --git a/README.md b/README.md
@@ -2,7 +2,8 @@
 [![discord server](https://dcbadge.vercel.app/api/server/hugging-face-879548962464493619)](https://discord.gg/hugging-face-879548962464493619)
 [![Latest version](https://img.shields.io/crates/v/candle-core.svg)](https://crates.io/crates/candle-core)
 [![Documentation](https://docs.rs/candle-core/badge.svg)](https://docs.rs/candle-core)
-![License](https://img.shields.io/crates/l/candle-core.svg)
+[![License](https://img.shields.io/github/license/base-org/node?color=blue)](https://github.com/huggingface/candle/blob/main/LICENSE-MIT)
+[![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE)
 
 **This is an optimized implementation by Eric Buehler.**
 
@@ -189,6 +190,7 @@ And then head over to
 - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle.
 - [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem.
 - [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library.
+- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI API compatible.
 
 If you have an addition to this list, please submit a pull request.
 

diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md
@@ -11,8 +11,8 @@ Then let's start by downloading the [model file](https://huggingface.co/bert-bas
 
 ```rust
 # extern crate candle_core;
-# extern crate hf_hub;
-use hf_hub::api::sync::Api;
+# extern crate candle_hf_hub;
+use candle_hf_hub::api::sync::Api;
 use candle_core::Device;
 
 let api = Api::new().unwrap();
@@ -50,8 +50,8 @@ Now that we have our weights, we can use them in our bert architecture:
 ```rust
 # extern crate candle_core;
 # extern crate candle_nn;
-# extern crate hf_hub;
-# use hf_hub::api::sync::Api;
+# extern crate candle_hf_hub;
+# use candle_hf_hub::api::sync::Api;
 # 
 # let api = Api::new().unwrap();
 # let repo = api.model("bert-base-uncased".to_string());

diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
@@ -29,6 +29,9 @@ rand_distr = { workspace = true }
 rayon = { workspace = true }
 safetensors = { workspace = true }
 thiserror = { workspace = true }
+ug = { workspace = true }
+ug-cuda = { workspace = true, optional = true }
+ug-metal = { workspace = true, optional = true }
 yoke = { workspace = true }
 zip = { workspace = true }
 
@@ -40,11 +43,11 @@ criterion = { workspace = true }
 
 [features]
 default = []
-cuda = ["cudarc", "dep:candle-kernels", "float8/cuda"]
+cuda = ["cudarc", "dep:candle-kernels", "float8/cuda", "dep:ug-cuda"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:metal", "dep:candle-metal-kernels"]
+metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]
 
 [[bench]]
 name = "bench_main"

diff --git a/candle-core/src/cuda_backend/cudnn.rs b/candle-core/src/cuda_backend/cudnn.rs
@@ -26,6 +26,7 @@ impl From<cudarc::driver::DriverError> for crate::Error {
 
 pub(crate) fn launch_conv2d<
     T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType,
+    Y: cudarc::cudnn::CudnnDataType,
 >(
     src: &CudaView<T>,
     src_l: &crate::Layout,
@@ -48,7 +49,7 @@ pub(crate) fn launch_conv2d<
         }
         c
     })?;
-    let conv = cudnn.create_conv2d::<T>(
+    let conv = cudnn.create_conv2d::<Y>(
         /* pad */ [params.padding as i32, params.padding as i32],
         /* stride */ [params.stride as i32, params.stride as i32],
         /* dilation */ [params.dilation as i32, params.dilation as i32],
@@ -62,18 +63,18 @@ pub(crate) fn launch_conv2d<
     ];
     // Note that `src` already starts at the proper offset.
     let x = if src_l.is_contiguous() {
-        cudnn.create_4d_tensor(
+        cudnn.create_4d_tensor::<T>(
             cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
             x_shape,
         )?
     } else {
         let s = src_l.stride();
-        cudnn.create_4d_tensor_ex(
+        cudnn.create_4d_tensor_ex::<T>(
             x_shape,
             [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32],
         )?
     };
-    let w = cudnn.create_4d_filter(
+    let w = cudnn.create_4d_filter::<T>(
         cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
         [
             params.c_out as i32,
@@ -83,7 +84,7 @@ pub(crate) fn launch_conv2d<
         ],
     )?;
     let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32);
-    let y = cudnn.create_4d_tensor(
+    let y = cudnn.create_4d_tensor::<T>(
         cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
         [params.b_size as i32, params.c_out as i32, h_out, w_out],
     )?;

diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs
@@ -57,6 +57,27 @@ impl CudaDevice {
         self.device.clone()
     }
 
+    /// Generates CUDA source for `kernel` with `ug_cuda`, compiles it to PTX through NVRTC
+    /// (fast-math enabled), loads it on this device under the module name `"ug"` and returns
+    /// the function registered as `func_name`.
+    ///
+    /// Errors are propagated from code generation, UTF-8 conversion, NVRTC compilation and
+    /// PTX loading; an error is also returned if the function cannot be looked up afterwards.
+    pub fn compile(
+        &self,
+        func_name: &'static str,
+        kernel: ug::lang::ssa::Kernel,
+    ) -> Result<CudaFunction> {
+        // Render the kernel to CUDA source text.
+        let mut source_bytes = Vec::new();
+        ug_cuda::code_gen::gen(&mut source_bytes, func_name, &kernel)?;
+        let source = String::from_utf8(source_bytes)?;
+
+        // Compile the source to PTX and register it on the device.
+        let compile_opts = cudarc::nvrtc::CompileOptions {
+            use_fast_math: Some(true),
+            ..Default::default()
+        };
+        let ptx = cudarc::nvrtc::safe::compile_ptx_with_opts(source, compile_opts).w()?;
+        self.device.load_ptx(ptx, "ug", &[func_name]).w()?;
+
+        match self.device.get_func("ug", func_name) {
+            Some(func) => Ok(func),
+            None => crate::bail!("unknown function ug::{func_name}"),
+        }
+    }
+
     pub fn id(&self) -> DeviceId {
         self.id
     }
@@ -174,6 +195,20 @@ impl CudaDevice {
     }
 }
 
+impl CudaDevice {
+    /// Builds a device for GPU `ordinal` through cudarc's `CudaDevice::new_with_stream`,
+    /// together with a cuBLAS handle and a cuRAND generator created on that device.
+    ///
+    /// The random generator is seeded with the fixed value `299792458`.
+    pub fn new_with_stream(ordinal: usize) -> Result<Self> {
+        let cuda_dev = cudarc::driver::CudaDevice::new_with_stream(ordinal).w()?;
+        let cublas = cudarc::cublas::CudaBlas::new(cuda_dev.clone()).w()?;
+        let rng = cudarc::curand::CudaRng::new(299792458, cuda_dev.clone()).w()?;
+        Ok(Self {
+            id: DeviceId::new(),
+            device: cuda_dev,
+            blas: Arc::new(cublas),
+            curand: Arc::new(Mutex::new(CudaRng(rng))),
+        })
+    }
+}
+
 impl BackendDevice for CudaDevice {
     type Storage = CudaStorage;
 

diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs
@@ -1593,39 +1593,42 @@ impl BackendStorage for CudaStorage {
                 let inp = &inp.slice(inp_l.start_offset()..);
                 let k = &k.slice(kernel_l.start_offset()..);
                 let mut out = unsafe { device.alloc::<u8>(dst_el) }.w()?;
-                crate::cudnn::launch_conv2d::<u8>(inp, inp_l, k, &mut out, params, &device)
+                crate::cudnn::launch_conv2d::<u8, u8>(inp, inp_l, k, &mut out, params, &device)
                     .map_err(crate::Error::wrap)?;
                 S::U8(out)
             }
             (S::BF16(inp), S::BF16(k)) => {
                 let inp = &inp.slice(inp_l.start_offset()..);
                 let k = &k.slice(kernel_l.start_offset()..);
                 let mut out = unsafe { device.alloc::<bf16>(dst_el) }.w()?;
-                crate::cudnn::launch_conv2d::<bf16>(inp, inp_l, k, &mut out, params, &device)
+                // Only PSEUDO_BFLOAT16_CONFIG is supported in cudnn, there is no "true bfloat16"
+                // version.
+                // https://docs.nvidia.com/deeplearning/cudnn/latest/api/cudnn-cnn-library.html#id88
+                crate::cudnn::launch_conv2d::<bf16, f32>(inp, inp_l, k, &mut out, params, &device)
                     .map_err(crate::Error::wrap)?;
                 S::BF16(out)
             }
             (S::F16(inp), S::F16(k)) => {
                 let inp = &inp.slice(inp_l.start_offset()..);
                 let k = &k.slice(kernel_l.start_offset()..);
                 let mut out = unsafe { device.alloc::<f16>(dst_el) }.w()?;
-                crate::cudnn::launch_conv2d::<f16>(inp, inp_l, k, &mut out, params, &device)
+                crate::cudnn::launch_conv2d::<f16, f16>(inp, inp_l, k, &mut out, params, &device)
                     .map_err(crate::Error::wrap)?;
                 S::F16(out)
             }
             (S::F32(inp), S::F32(k)) => {
                 let inp = &inp.slice(inp_l.start_offset()..);
                 let k = &k.slice(kernel_l.start_offset()..);
                 let mut out = unsafe { device.alloc::<f32>(dst_el) }.w()?;
-                crate::cudnn::launch_conv2d::<f32>(inp, inp_l, k, &mut out, params, &device)
+                crate::cudnn::launch_conv2d::<f32, f32>(inp, inp_l, k, &mut out, params, &device)
                     .map_err(crate::Error::wrap)?;
                 S::F32(out)
             }
             (S::F64(inp), S::F64(k)) => {
                 let inp = &inp.slice(inp_l.start_offset()..);
                 let k = &k.slice(kernel_l.start_offset()..);
                 let mut out = unsafe { device.alloc::<f64>(dst_el) }.w()?;
-                crate::cudnn::launch_conv2d::<f64>(inp, inp_l, k, &mut out, params, &device)
+                crate::cudnn::launch_conv2d::<f64, f64>(inp, inp_l, k, &mut out, params, &device)
                     .map_err(crate::Error::wrap)?;
                 S::F64(out)
             }

diff --git a/candle-core/src/custom_op.rs b/candle-core/src/custom_op.rs
@@ -375,3 +375,110 @@ impl Tensor {
         )
     }
 }
+
+/// In-place unary op backed by a kernel compiled from a `ug` SSA kernel description.
+///
+/// The `func` field only exists when the `cuda` or `metal` feature is enabled.
+// NOTE(review): with both `cuda` and `metal` enabled, `func` would be declared twice — confirm
+// that this feature combination is never built.
+pub struct UgIOp1 {
+    // Name of the op, returned by `InplaceOp1::name`.
+    name: &'static str,
+    // Compiled CUDA function launched by `cuda_fwd`.
+    #[cfg(feature = "cuda")]
+    func: cudarc::driver::CudaFunction,
+    // Compiled Metal pipeline state used by `metal_fwd`.
+    #[cfg(feature = "metal")]
+    func: metal::ComputePipelineState,
+}
+
+impl UgIOp1 {
+    /// Creates the op by compiling `kernel` under the function name `name` on `device`.
+    ///
+    /// With the `cuda` feature the device must be a CUDA device, with the `metal` feature it
+    /// must be a Metal device; otherwise the device accessor returns an error. Without either
+    /// feature nothing is compiled and only the name is stored.
+    // `kernel` and `device` are unused when neither backend feature is enabled.
+    #[allow(unused)]
+    pub fn new(
+        name: &'static str,
+        kernel: ug::lang::ssa::Kernel,
+        device: &crate::Device,
+    ) -> Result<Self> {
+        // Exactly one of the cfg-gated blocks below is expected to be compiled in; it is the
+        // function's tail expression.
+        #[cfg(feature = "cuda")]
+        {
+            let device = device.as_cuda_device()?;
+            let func = device.compile(name, kernel)?;
+            Ok(Self { name, func })
+        }
+        #[cfg(feature = "metal")]
+        {
+            let device = device.as_metal_device()?;
+            let func = device.compile(name, kernel)?;
+            Ok(Self { name, func })
+        }
+        #[cfg(not(any(feature = "cuda", feature = "metal")))]
+        {
+            Ok(Self { name })
+        }
+    }
+}
+
+impl InplaceOp1 for UgIOp1 {
+    /// Returns the name given to the op at construction time.
+    fn name(&self) -> &'static str {
+        self.name
+    }
+
+    /// Always fails: ug ops have no CPU implementation.
+    fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
+        crate::bail!("ug ops are only supported on metal/cuda at the moment")
+    }
+
+    /// Encodes the compiled pipeline on the device's command buffer, with the storage buffer
+    /// bound as parameter 0 at offset 0.
+    ///
+    /// Only `f32` storage is accepted; any other dtype returns an error.
+    #[cfg(feature = "metal")]
+    fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result<()> {
+        use crate::backend::BackendStorage;
+        use candle_metal_kernels::utils::EncoderProvider;
+
+        let elem_count = layout.shape().elem_count();
+        if sto.dtype() != crate::DType::F32 {
+            // TODO: support more dtypes.
+            crate::bail!("input is not a f32 tensor")
+        }
+        // NOTE(review): unlike `cuda_fwd`, the layout's contiguity and start offset are not
+        // checked here — confirm callers only pass contiguous, zero-offset inputs.
+        let device = sto.device();
+        let command_buffer = device.command_buffer()?;
+        let command_buffer = &command_buffer;
+        let encoder = command_buffer.encoder();
+        let encoder = encoder.as_ref();
+        encoder.set_compute_pipeline_state(&self.func);
+        // Use groups of 32 when the element count divides evenly, otherwise groups of 1.
+        let (g, b) = if elem_count % 32 == 0 {
+            (elem_count / 32, 32)
+        } else {
+            (elem_count, 1)
+        };
+        // NOTE(review): `g` is passed to `dispatch_threads` as the grid size here, while the
+        // CUDA path uses the same value as a block count — confirm the intended thread count.
+        let grid_dims = metal::MTLSize {
+            width: g as u64,
+            height: 1,
+            depth: 1,
+        };
+        let group_dims = candle_metal_kernels::utils::get_block_dims(b as u64, 1, 1);
+        candle_metal_kernels::utils::set_param(encoder, 0, (sto.buffer(), 0usize));
+
+        encoder.use_resource(sto.buffer(), metal::MTLResourceUsage::Write);
+        encoder.dispatch_threads(grid_dims, group_dims);
+
+        Ok(())
+    }
+
+    /// Launches the compiled CUDA function on the contiguous `f32` slice described by `layout`.
+    ///
+    /// Returns an error if the storage is not `f32`, if the layout is not contiguous, or if
+    /// the launch fails.
+    #[cfg(feature = "cuda")]
+    fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<()> {
+        use crate::cuda_backend::WrapErr;
+        use cudarc::driver::LaunchAsync;
+
+        let elem_count = layout.shape().elem_count();
+        // TODO: support more dtypes.
+        let sto = sto.as_cuda_slice::<f32>()?;
+        // Restrict the slice to the range covered by the layout.
+        let sto = match layout.contiguous_offsets() {
+            None => crate::bail!("input has to be contiguous"),
+            Some((o1, o2)) => sto.slice(o1..o2),
+        };
+        let params = (&sto,);
+        // Use blocks of 32 threads when the element count divides evenly, otherwise one
+        // thread per block.
+        let (g, b) = if elem_count % 32 == 0 {
+            (elem_count / 32, 32)
+        } else {
+            (elem_count, 1)
+        };
+        // NOTE(review): `g as u32` truncates silently for very large element counts.
+        let cfg = cudarc::driver::LaunchConfig {
+            grid_dim: (g as u32, 1, 1),
+            block_dim: (b as u32, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        // SAFETY: assumes the compiled kernel takes a single f32 buffer argument matching
+        // `params` — TODO confirm against the ug code generator.
+        unsafe { self.func.clone().launch(cfg, params) }.w()?;
+        Ok(())
+    }
+}
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
@@ -130,6 +130,26 @@ impl Device {
         Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
     }
 
+    pub fn as_cuda_device(&self) -> Result<&crate::CudaDevice> {
+        match self {
+            Self::Cuda(d) => Ok(d),
+            Self::Cpu => crate::bail!("expected a cuda device, got cpu"),
+            Self::Metal(_) => crate::bail!("expected a cuda device, got Metal"),
+        }
+    }
+
+    pub fn as_metal_device(&self) -> Result<&crate::MetalDevice> {
+        match self {
+            Self::Cuda(_) => crate::bail!("expected a metal device, got cuda"),
+            Self::Cpu => crate::bail!("expected a metal device, got cpu"),
+            Self::Metal(d) => Ok(d),
+        }
+    }
+
+    /// Creates a `Device::Cuda` wrapping `CudaDevice::new_with_stream(ordinal)`.
+    pub fn new_cuda_with_stream(ordinal: usize) -> Result<Self> {
+        crate::CudaDevice::new_with_stream(ordinal).map(Self::Cuda)
+    }
+
     pub fn new_metal(ordinal: usize) -> Result<Self> {
         Ok(Self::Metal(crate::MetalDevice::new(ordinal)?))
     }

diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs
@@ -14,6 +14,12 @@ macro_rules! fail {
     };
 }
 
+impl CudaDevice {
+    /// Dummy counterpart of the CUDA backend's `new_with_stream`: always returns
+    /// `Error::NotCompiledWithCudaSupport`, ignoring the ordinal.
+    pub fn new_with_stream(_: usize) -> Result<Self> {
+        Err(Error::NotCompiledWithCudaSupport)
+    }
+}
+
 impl crate::backend::BackendStorage for CudaStorage {
     type Device = CudaDevice;
 

diff --git a/candle-core/src/error.rs b/candle-core/src/error.rs
@@ -186,6 +186,9 @@ pub enum Error {
     #[error("Metal error {0}")]
     Metal(#[from] MetalError),
 
+    #[error(transparent)]
+    Ug(#[from] ug::Error),
+
     #[error(transparent)]
     TryFromIntError(#[from] core::num::TryFromIntError),
 
@@ -200,6 +203,10 @@ pub enum Error {
     #[error(transparent)]
     ParseInt(#[from] std::num::ParseIntError),
 
+    /// Utf8 parse error.
+    #[error(transparent)]
+    FromUtf8(#[from] std::string::FromUtf8Error),
+
     /// I/O error.
     #[error(transparent)]
     Io(#[from] std::io::Error),