From fa4902fe31f81c95ebbe8892f6d8fa34d2ef9208 Mon Sep 17 00:00:00 2001 From: Eric Buehler <65165915+EricLBuehler@users.noreply.github.com> Date: Thu, 17 Oct 2024 07:28:05 -0400 Subject: [PATCH] Add initial f8 e4m3 dtype (#31) * Add initial f8 e4m3 type * Fixes * Update deps * Implement CudaDType * Add some cast kernels * Add copy2d, other impls * Fix zeros impl * Error checking for metal * Use isnanf --- .vscode/settings.json | 3 + Cargo.toml | 1 + candle-core/Cargo.toml | 3 +- candle-core/src/convert.rs | 6 + candle-core/src/cpu_backend/mod.rs | 133 ++++++++++ candle-core/src/cpu_backend/utils.rs | 6 + candle-core/src/cuda_backend/device.rs | 35 ++- candle-core/src/cuda_backend/mod.rs | 49 ++++ candle-core/src/cuda_backend/utils.rs | 8 + candle-core/src/display.rs | 9 + candle-core/src/dtype.rs | 23 +- candle-core/src/metal_backend/mod.rs | 4 + candle-core/src/npy.rs | 10 + candle-core/src/op.rs | 67 +++++ candle-core/src/safetensors.rs | 5 + candle-core/src/sort.rs | 3 + candle-kernels/src/affine.cu | 28 ++- candle-kernels/src/binary.cu | 15 ++ candle-kernels/src/cast.cu | 86 +++++++ candle-kernels/src/compatibility.cuh | 1 + candle-kernels/src/conv.cu | 12 + candle-kernels/src/cuda_utils.cuh | 23 ++ candle-kernels/src/fill.cu | 5 + candle-kernels/src/fused_layer_norm.cu | 329 ------------------------- candle-kernels/src/indexing.cu | 95 +++++++ candle-kernels/src/kvconcat.cu | 1 + candle-kernels/src/lib.rs | 1 - candle-kernels/src/reduce.cu | 8 + candle-kernels/src/sort.cu | 3 + candle-kernels/src/ternary.cu | 6 + candle-kernels/src/unary.cu | 27 ++ candle-pyo3/Cargo.toml | 1 + candle-pyo3/src/lib.rs | 3 + 33 files changed, 662 insertions(+), 347 deletions(-) delete mode 100644 candle-kernels/src/fused_layer_norm.cu diff --git a/.vscode/settings.json b/.vscode/settings.json index 6abf0d3d6d..62b4261af3 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -11,4 +11,7 @@ "rust-analyzer.cargo.features": [ "cuda", ], + "files.associations": { + "cmath": "cpp" + }, } \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index d6cf18614f..3a431ae6ba 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,7 @@ fancy-regex = "0.13.0" gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] } hf-hub = "0.3.0" half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] } +float8 = { version = "0.1.0", features = ["num-traits", "rand_distr"] } hound = "3.5.1" image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] } imageproc = { version = "0.24.0", default-features = false } diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml index cbf8f2007f..6ce7e31e1c 100644 --- a/candle-core/Cargo.toml +++ b/candle-core/Cargo.toml @@ -18,6 +18,7 @@ metal = { workspace = true, optional = true} cudarc = { workspace = true, optional = true } gemm = { workspace = true } half = { workspace = true } +float8 = { workspace = true } intel-mkl-src = { workspace = true, optional = true } libc = { workspace = true, optional = true } memmap2 = { workspace = true } @@ -39,7 +40,7 @@ criterion = { workspace = true } [features] default = [] -cuda = ["cudarc", "dep:candle-kernels"] +cuda = ["cudarc", "dep:candle-kernels", "float8/cuda"] cudnn = ["cuda", "cudarc/cudnn"] mkl = ["dep:libc", "dep:intel-mkl-src"] accelerate = ["dep:libc", "dep:accelerate-src"] diff --git a/candle-core/src/convert.rs b/candle-core/src/convert.rs index 3e19d970c3..173a96d6e6 100644 --- a/candle-core/src/convert.rs +++ b/candle-core/src/convert.rs @@ -1,5 +1,6 
@@ //! Implement conversion traits for tensors use crate::{DType, Device, Error, Tensor, WithDType}; +use float8::F8E4M3; use half::{bf16, f16, slice::HalfFloatSliceExt}; use std::convert::TryFrom; @@ -149,6 +150,11 @@ impl Tensor { let vs = vs.to_vec1::()?; f.write_all(&vs)?; } + DType::F8E4M3 => { + for v in vs.to_vec1::()? { + f.write_u8(v.to_bits())? + } + } } Ok(()) } diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs index 24ce83581c..6ef74c0725 100644 --- a/candle-core/src/cpu_backend/mod.rs +++ b/candle-core/src/cpu_backend/mod.rs @@ -3,6 +3,7 @@ use std::ops::Deref; use crate::backend::{BackendDevice, BackendStorage}; use crate::op::{BinaryOpT, CmpOp, ReduceOp, UnaryOpT}; use crate::{DType, Error, IntDType, Layout, Result, Shape, WithDType}; +use float8::F8E4M3; use half::{bf16, f16}; use rayon::prelude::*; @@ -29,6 +30,7 @@ pub enum CpuStorage { F16(Vec), F32(Vec), F64(Vec), + F8E4M3(Vec), } #[derive(Debug, Clone)] @@ -42,6 +44,7 @@ pub enum CpuStorageRef<'a> { F16(&'a [f16]), F32(&'a [f32]), F64(&'a [f64]), + F8E4M3(&'a [F8E4M3]), } #[derive(Debug, Clone)] @@ -2366,6 +2369,17 @@ impl CpuStorage { .concat(); Self::F64(storages) } + Self::F8E4M3(_) => { + let storages = storages + .iter() + .map(|s| match s { + Self::F8E4M3(s) => Ok(s.as_slice()), + _ => crate::bail!("dtype mismatch"), + }) + .collect::>>()? + .concat(); + Self::F8E4M3(storages) + } }; Ok(s) } @@ -2385,6 +2399,7 @@ impl BackendStorage for CpuStorage { Self::F16(_) => DType::F16, Self::F32(_) => DType::F32, Self::F64(_) => DType::F64, + Self::F8E4M3(_) => DType::F8E4M3, } } @@ -2427,6 +2442,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, bf16::from_f64); Ok(Self::BF16(data)) } + (Self::F8E4M3(storage), DType::BF16) => { + let data = unary_map(storage, layout, |v| bf16::from_f32(v.to_f32())); + Ok(Self::BF16(data)) + } (Self::U8(storage), DType::F16) => { let data = unary_map(storage, layout, |v| f16::from_f32(v as f32)); Ok(Self::F16(data)) @@ -2463,6 +2482,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, f16::from_f64); Ok(Self::F16(data)) } + (Self::F8E4M3(storage), DType::F16) => { + let data = unary_map(storage, layout, |v| f16::from_f32(v.to_f32())); + Ok(Self::F16(data)) + } (Self::U8(storage), DType::F32) => { let data = unary_map(storage, layout, |v| v as f32); Ok(Self::F32(data)) @@ -2499,6 +2522,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as f32); Ok(Self::F32(data)) } + (Self::F8E4M3(storage), DType::F32) => { + let data = unary_map(storage, layout, |v| v.to_f32()); + Ok(Self::F32(data)) + } (Self::U8(storage), DType::U8) => { let data = unary_map(storage, layout, |v| v); Ok(Self::U8(data)) @@ -2535,6 +2562,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as u8); Ok(Self::U8(data)) } + (Self::F8E4M3(storage), DType::U8) => { + let data = unary_map(storage, layout, |v| v.to_f32() as u8); + Ok(Self::U8(data)) + } (Self::U8(storage), DType::U32) => { let data = unary_map(storage, layout, |v| v as u32); Ok(Self::U32(data)) @@ -2571,6 +2602,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as u32); Ok(Self::U32(data)) } + (Self::F8E4M3(storage), DType::U32) => { + let data = unary_map(storage, layout, |v| v.to_f32() as u32); + Ok(Self::U32(data)) + } (Self::U8(storage), DType::I16) => { let data = unary_map(storage, layout, |v| v as i64); Ok(Self::I64(data)) @@ -2607,6 +2642,10 @@ impl 
BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as i16); Ok(Self::I16(data)) } + (Self::F8E4M3(storage), DType::I16) => { + let data = unary_map(storage, layout, |v| v.to_f32() as i16); + Ok(Self::I16(data)) + } (Self::U8(storage), DType::I32) => { let data = unary_map(storage, layout, |v| v as i64); Ok(Self::I64(data)) @@ -2643,6 +2682,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as i32); Ok(Self::I32(data)) } + (Self::F8E4M3(storage), DType::I32) => { + let data = unary_map(storage, layout, |v| v.to_f32() as i32); + Ok(Self::I32(data)) + } (Self::U8(storage), DType::I64) => { let data = unary_map(storage, layout, |v| v as i64); Ok(Self::I64(data)) @@ -2679,6 +2722,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v as i64); Ok(Self::I64(data)) } + (Self::F8E4M3(storage), DType::I64) => { + let data = unary_map(storage, layout, |v| v.to_f32() as i64); + Ok(Self::I64(data)) + } (Self::U8(storage), DType::F64) => { let data = unary_map(storage, layout, |v| v as f64); Ok(Self::F64(data)) @@ -2715,6 +2762,50 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v); Ok(Self::F64(data)) } + (Self::F8E4M3(storage), DType::F64) => { + let data = unary_map(storage, layout, |v| v.to_f64()); + Ok(Self::F64(data)) + } + (Self::U8(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v as f32)); + Ok(Self::F8E4M3(data)) + } + (Self::U32(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v as f32)); + Ok(Self::F8E4M3(data)) + } + (Self::I16(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v as f32)); + Ok(Self::F8E4M3(data)) + } + (Self::I32(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v as f32)); + Ok(Self::F8E4M3(data)) + } + (Self::I64(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v as f32)); + Ok(Self::F8E4M3(data)) + } + (Self::BF16(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from(v.to_f32())); + Ok(Self::F8E4M3(data)) + } + (Self::F16(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| F8E4M3::from_f32(v.to_f32())); + Ok(Self::F8E4M3(data)) + } + (Self::F32(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, F8E4M3::from_f32); + Ok(Self::F8E4M3(data)) + } + (Self::F64(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, F8E4M3::from_f64); + Ok(Self::F8E4M3(data)) + } + (Self::F8E4M3(storage), DType::F8E4M3) => { + let data = unary_map(storage, layout, |v| v); + Ok(Self::F8E4M3(data)) + } } } @@ -2828,6 +2919,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| v.powf(e)); Ok(Self::F64(data)) } + Self::F8E4M3(storage) => { + let data = unary_map(storage, layout, |v| v.powf(F8E4M3::from_f64(e))); + Ok(Self::F8E4M3(data)) + } Self::U8(_) => Err(Error::UnsupportedDTypeForOp(DType::U8, "elu").bt()), Self::U32(_) => Err(Error::UnsupportedDTypeForOp(DType::U32, "elu").bt()), Self::I16(_) => Err(Error::UnsupportedDTypeForOp(DType::I16, "elu").bt()), @@ -2855,6 +2950,10 @@ impl BackendStorage for CpuStorage { let data = unary_map(storage, layout, |v| elu(v, alpha)); Ok(Self::F64(data)) } + Self::F8E4M3(storage) => { + let data = unary_map(storage, layout, |v| elu(v, F8E4M3::from_f64(alpha))); + Ok(Self::F8E4M3(data)) + } Self::U8(_) => 
Err(Error::UnsupportedDTypeForOp(DType::U8, "elu").bt()), Self::U32(_) => Err(Error::UnsupportedDTypeForOp(DType::U32, "elu").bt()), Self::I16(_) => Err(Error::UnsupportedDTypeForOp(DType::I16, "elu").bt()), @@ -2901,6 +3000,15 @@ impl BackendStorage for CpuStorage { Ok(Self::F64(data)) } } + Self::F8E4M3(storage) => { + if B::F8E4M3_VEC { + let data = unary_map_vec(storage, layout, B::f8e4m3, B::f8e4m3_vec); + Ok(Self::F8E4M3(data)) + } else { + let data = unary_map(storage, layout, B::f8e4m3); + Ok(Self::F8E4M3(data)) + } + } Self::U8(storage) => { let data = unary_map(storage, layout, B::u8); Ok(Self::U8(data)) @@ -3455,6 +3563,15 @@ impl BackendDevice for CpuDevice { } Ok(CpuStorage::F16(data)) } + DType::F8E4M3 => { + let mut data = Vec::with_capacity(elem_count); + let uniform = + rand::distributions::Uniform::new(F8E4M3::from_f64(min), F8E4M3::from_f64(max)); + for _i in 0..elem_count { + data.push(rng.sample::(uniform)) + } + Ok(CpuStorage::F8E4M3(data)) + } DType::F32 => { let mut data = Vec::with_capacity(elem_count); let uniform = rand::distributions::Uniform::new(min as f32, max as f32); @@ -3501,6 +3618,15 @@ impl BackendDevice for CpuDevice { } Ok(CpuStorage::F16(data)) } + DType::F8E4M3 => { + let mut data = Vec::with_capacity(elem_count); + let normal = rand_distr::Normal::new(F8E4M3::from_f64(mean), F8E4M3::from_f64(std)) + .map_err(Error::wrap)?; + for _i in 0..elem_count { + data.push(normal.sample(&mut rng)) + } + Ok(CpuStorage::F8E4M3(data)) + } DType::F32 => { let mut data = Vec::with_capacity(elem_count); let normal = @@ -3574,6 +3700,11 @@ impl BackendDevice for CpuDevice { v.set_len(elem_count); CpuStorage::F64(v) } + DType::F8E4M3 => { + let mut v = Vec::with_capacity(elem_count); + v.set_len(elem_count); + CpuStorage::F8E4M3(v) + } }; Ok(storage) } @@ -3588,6 +3719,7 @@ impl BackendDevice for CpuDevice { DType::I64 => CpuStorage::I64(vec![1i64; elem_count]), DType::BF16 => CpuStorage::BF16(vec![bf16::ONE; elem_count]), DType::F16 => CpuStorage::F16(vec![f16::ONE; elem_count]), + DType::F8E4M3 => CpuStorage::F8E4M3(vec![F8E4M3::ONE; elem_count]), DType::F32 => CpuStorage::F32(vec![1f32; elem_count]), DType::F64 => CpuStorage::F64(vec![1f64; elem_count]), }; @@ -3604,6 +3736,7 @@ impl BackendDevice for CpuDevice { DType::I64 => CpuStorage::I64(vec![0i64; elem_count]), DType::BF16 => CpuStorage::BF16(vec![bf16::ZERO; elem_count]), DType::F16 => CpuStorage::F16(vec![f16::ZERO; elem_count]), + DType::F8E4M3 => CpuStorage::F8E4M3(vec![F8E4M3::ZERO; elem_count]), DType::F32 => CpuStorage::F32(vec![0f32; elem_count]), DType::F64 => CpuStorage::F64(vec![0f64; elem_count]), }; diff --git a/candle-core/src/cpu_backend/utils.rs b/candle-core/src/cpu_backend/utils.rs index 20f362e8c4..495fcd660b 100644 --- a/candle-core/src/cpu_backend/utils.rs +++ b/candle-core/src/cpu_backend/utils.rs @@ -17,6 +17,7 @@ pub trait Map1 { C::F16(vs) => Ok(C::F16(self.f(vs, layout)?)), C::F32(vs) => Ok(C::F32(self.f(vs, layout)?)), C::F64(vs) => Ok(C::F64(self.f(vs, layout)?)), + C::F8E4M3(vs) => Ok(C::F8E4M3(self.f(vs, layout)?)), } } } @@ -35,6 +36,7 @@ pub trait Map1Any { C::F16(vs) => Ok(self.f(vs, layout, C::F16)?), C::F32(vs) => Ok(self.f(vs, layout, C::F32)?), C::F64(vs) => Ok(self.f(vs, layout, C::F64)?), + C::F8E4M3(vs) => Ok(self.f(vs, layout, C::F8E4M3)?), } } } @@ -52,6 +54,7 @@ pub trait Map2 { (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2)?)), (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2)?)), (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, 
l2)?)), + (C::F8E4M3(v1), C::F8E4M3(v2)) => Ok(C::F8E4M3(self.f(v1, l1, v2, l2)?)), _ => Err(Error::DTypeMismatchBinaryOp { lhs: v1.dtype(), rhs: v2.dtype(), @@ -96,6 +99,7 @@ pub trait Map3 { (C::F16(v1), C::F16(v2), C::F16(v3)) => Ok(self.f(v1, l1, v2, l2, v3, l3, s)?), (C::F32(v1), C::F32(v2), C::F32(v3)) => Ok(self.f(v1, l1, v2, l2, v3, l3, s)?), (C::F64(v1), C::F64(v2), C::F64(v3)) => Ok(self.f(v1, l1, v2, l2, v3, l3, s)?), + (C::F8E4M3(v1), C::F8E4M3(v2), C::F8E4M3(v3)) => Ok(self.f(v1, l1, v2, l2, v3, l3, s)?), _ => Err(Error::DTypeMismatchBinaryOp3 { lhs: v1.dtype(), rhs: v2.dtype(), @@ -129,6 +133,7 @@ pub trait Map2Alpha { (C::F16(v1), C::F16(v2)) => Ok(C::F16(self.f(v1, l1, v2, l2, s)?)), (C::F32(v1), C::F32(v2)) => Ok(C::F32(self.f(v1, l1, v2, l2, s)?)), (C::F64(v1), C::F64(v2)) => Ok(C::F64(self.f(v1, l1, v2, l2, s)?)), + (C::F8E4M3(v1), C::F8E4M3(v2)) => Ok(C::F8E4M3(self.f(v1, l1, v2, l2, s)?)), _ => Err(Error::DTypeMismatchBinaryOp { lhs: v1.dtype(), rhs: v2.dtype(), @@ -152,6 +157,7 @@ pub trait Map2U8 { (C::F16(v1), C::F16(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), (C::F32(v1), C::F32(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), (C::F64(v1), C::F64(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), + (C::F8E4M3(v1), C::F8E4M3(v2)) => Ok(C::U8(self.f(v1, l1, v2, l2)?)), _ => Err(Error::DTypeMismatchBinaryOp { lhs: v1.dtype(), rhs: v2.dtype(), diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs index ccca8c039c..8ca69e2c15 100644 --- a/candle-core/src/cuda_backend/device.rs +++ b/candle-core/src/cuda_backend/device.rs @@ -3,6 +3,7 @@ use crate::{CpuStorage, CpuStorageRef, DType, Layout, Result, Shape}; pub use candle_kernels as kernels; pub use cudarc; use cudarc::driver::{CudaFunction, LaunchAsync, LaunchConfig}; +use float8::F8E4M3; use half::{bf16, f16}; use std::sync::{Arc, Mutex, RwLock}; @@ -136,6 +137,14 @@ impl CudaDevice { unsafe { func.launch(cfg, params) }.w()?; CudaStorageSlice::F64(data) } + DType::F8E4M3 => { + // SAFETY: Set later by running the fill kernel. 
+ let data = unsafe { self.alloc::(elem_count) }.w()?; + let func = self.get_or_load_func("fill_f8_e4m3", kernels::FILL)?; + let params = (&data, v, elem_count); + unsafe { func.launch(cfg, params) }.w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, @@ -243,6 +252,10 @@ impl BackendDevice for CudaDevice { let data = self.alloc_zeros::(elem_count).w()?; CudaStorageSlice::F64(data) } + DType::F8E4M3 => { + let data = self.alloc_zeros::(elem_count).w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, @@ -262,7 +275,8 @@ impl BackendDevice for CudaDevice { | DType::I32 | DType::I16 | DType::F16 - | DType::BF16 => Err(CudaError::UnsupportedDtype { + | DType::BF16 + | DType::F8E4M3 => Err(CudaError::UnsupportedDtype { dtype, op: "rand_uniform", }) @@ -310,7 +324,8 @@ impl BackendDevice for CudaDevice { | DType::I32 | DType::I64 | DType::F16 - | DType::BF16 => Err(CudaError::UnsupportedDtype { + | DType::BF16 + | DType::F8E4M3 => Err(CudaError::UnsupportedDtype { dtype, op: "rand_normal", }) @@ -378,6 +393,10 @@ impl BackendDevice for CudaDevice { let data = self.alloc::(elem_count).w()?; CudaStorageSlice::F64(data) } + DType::F8E4M3 => { + let data = self.alloc::(elem_count).w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, @@ -423,6 +442,10 @@ impl BackendDevice for CudaDevice { let data = self.htod_sync_copy(storage).w()?; CudaStorageSlice::F64(data) } + CpuStorageRef::F8E4M3(storage) => { + let data = self.htod_sync_copy(storage).w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, @@ -468,6 +491,10 @@ impl BackendDevice for CudaDevice { let data = self.htod_sync_copy(storage).w()?; CudaStorageSlice::F64(data) } + CpuStorage::F8E4M3(storage) => { + let data = self.htod_sync_copy(storage).w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, @@ -513,6 +540,10 @@ impl BackendDevice for CudaDevice { let data = self.htod_copy(storage).w()?; CudaStorageSlice::F64(data) } + CpuStorage::F8E4M3(storage) => { + let data = self.htod_copy(storage).w()?; + CudaStorageSlice::F8E4M3(data) + } }; Ok(CudaStorage { slice, diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs index 1a394d4b58..a8045dab39 100644 --- a/candle-core/src/cuda_backend/mod.rs +++ b/candle-core/src/cuda_backend/mod.rs @@ -7,6 +7,7 @@ use cudarc::cublas::{Gemm, GemmConfig, StridedBatchedConfig}; use cudarc::driver::{ CudaSlice, DevicePtr, DeviceRepr, DeviceSlice, LaunchAsync, LaunchConfig, ValidAsZeroBits, }; +use float8::F8E4M3; use half::{bf16, f16}; #[cfg(feature = "cudnn")] @@ -54,6 +55,7 @@ pub enum CudaStorageSlice { F16(CudaSlice), F32(CudaSlice), F64(CudaSlice), + F8E4M3(CudaSlice), } struct Clone; @@ -1059,6 +1061,7 @@ cuda_dtype!(f16, F16); cuda_dtype!(bf16, BF16); cuda_dtype!(f32, F32); cuda_dtype!(f64, F64); +cuda_dtype!(F8E4M3, F8E4M3); impl CudaStorage { pub fn wrap_cuda_slice(slice: CudaSlice, device: CudaDevice) -> CudaStorage { @@ -1183,6 +1186,7 @@ impl BackendStorage for CudaStorage { CudaStorageSlice::F16(_) => DType::F16, CudaStorageSlice::F32(_) => DType::F32, CudaStorageSlice::F64(_) => DType::F64, + CudaStorageSlice::F8E4M3(_) => DType::F8E4M3, } } @@ -1211,6 +1215,7 @@ impl BackendStorage for CudaStorage { CudaStorageSlice::F16(inp) => *inp.slice(start_o..).device_ptr(), CudaStorageSlice::F32(inp) => *inp.slice(start_o..).device_ptr(), CudaStorageSlice::F64(inp) => *inp.slice(start_o..).device_ptr(), + CudaStorageSlice::F8E4M3(inp) => *inp.slice(start_o..).device_ptr(), }; let inp = &inp; 
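
[Reviewer note, not part of the patch] For context on the element type being plumbed through the CUDA backend above: F8E4M3 is an 8-bit float with one sign bit, a 4-bit exponent and a 3-bit mantissa, provided on the host side by the `float8` crate added in Cargo.toml. The sketch below only uses the `from_f32` / `to_f32` / `to_bits` helpers that this diff already calls (in convert.rs and the CPU cast paths); the exact rounded values are illustrative, not asserted by the patch.

    use float8::F8E4M3;

    fn main() {
        // Round-trip an f32 through the 8-bit e4m3 representation; precision
        // is limited to the 3-bit mantissa, so most inputs are rounded.
        let x = F8E4M3::from_f32(1.3);
        let back = x.to_f32();      // close to 1.3, but quantized
        let raw: u8 = x.to_bits();  // the single byte stored per element

        // The CPU backend casts other dtypes the same way, e.g. u32 -> F8E4M3
        // is `F8E4M3::from_f32(v as f32)` and F8E4M3 -> f64 is `v.to_f64()`.
        println!("value={back} bits={raw:#04x}");
    }

On the device side, the kernels later in this patch do the equivalent conversion through half precision via the F8E4M3_TO_FLOAT macro (`__nv_cvt_fp8_to_halfraw` from cuda_fp8.h) rather than operating on the 8-bit values directly.
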
@@ -1271,6 +1276,12 @@ impl BackendStorage for CudaStorage { unsafe { func.launch(cfg, params) }.w()?; CudaStorageSlice::F64(out) } + DType::F8E4M3 => { + let out = unsafe { dev.alloc::(el) }.w()?; + let params = (el, dims.len(), &ds, *inp, &out); + unsafe { func.launch(cfg, params) }.w()?; + CudaStorageSlice::F8E4M3(out) + } }; Ok(Self { slice, @@ -1372,6 +1383,11 @@ impl BackendStorage for CudaStorage { let cpu_storage = dev.dtoh_sync_copy(slice).w()?; Ok(CpuStorage::F64(cpu_storage)) } + CudaStorageSlice::F8E4M3(slice) => { + let dev = slice.device(); + let cpu_storage = dev.dtoh_sync_copy(slice).w()?; + Ok(CpuStorage::F8E4M3(cpu_storage)) + } } } @@ -1917,6 +1933,11 @@ impl BackendStorage for CudaStorage { *d.slice(dst_o..).device_ptr(), "copy2d_f64", ), + (S::F8E4M3(s), S::F8E4M3(d)) => ( + *s.slice(src_o..).device_ptr(), + *d.slice(dst_o..).device_ptr(), + "copy2d_f8_e4m3", + ), _ => Err(CudaError::InternalError("dtype mismatch in copy2d"))?, }; let func = dev.get_or_load_func(kname, kernels::FILL)?; @@ -1974,6 +1995,18 @@ impl BackendStorage for CudaStorage { unsafe { func.launch(cfg, params) }.w()? } } + (CudaStorageSlice::F8E4M3(src), CudaStorageSlice::F8E4M3(dst)) => { + let (src, mut dst) = slice_src_and_dst(src, src_l, dst, dst_offset); + if src_l.is_contiguous() { + dev.dtod_copy(&src, &mut dst).w()? + } else { + let func = dev.get_or_load_func("ucopy_f8_e4m3", kernels::UNARY)?; + // SAFETY: Set later by running the kernel. + let params = (el_count, dims.len(), &ds, &src, &mut dst); + // SAFETY: ffi. + unsafe { func.launch(cfg, params) }.w()? + } + } (CudaStorageSlice::U8(src), CudaStorageSlice::U8(dst)) => { let (src, mut dst) = slice_src_and_dst(src, src_l, dst, dst_offset); if src_l.is_contiguous() { @@ -2373,6 +2406,22 @@ impl crate::CustomOp2 for KVConcat { unsafe { func.launch(cfg, params) }.w()?; CudaStorageSlice::F64(out) } + (CudaStorageSlice::F8E4M3(left_), CudaStorageSlice::F8E4M3(right_)) => { + let out = unsafe { dev.alloc::(elem_count).w()? }; + let func = dev.get_or_load_func("kvconcat_f8_e4m3", kernels::KVCONCAT)?; + let params = ( + left_, + right_, + &out, + self.concat_dim, + chunk_l, + chunk_r, + lstride, + rstride, + ); + unsafe { func.launch(cfg, params) }.w()?; + CudaStorageSlice::F8E4M3(out) + } (CudaStorageSlice::U8(left_), CudaStorageSlice::U8(right_)) => { let out = unsafe { dev.alloc::(elem_count).w()? 
}; let func = dev.get_or_load_func("kvconcat_u8", kernels::KVCONCAT)?; diff --git a/candle-core/src/cuda_backend/utils.rs b/candle-core/src/cuda_backend/utils.rs index df06756d78..581d687aac 100644 --- a/candle-core/src/cuda_backend/utils.rs +++ b/candle-core/src/cuda_backend/utils.rs @@ -26,6 +26,7 @@ pub trait Map1 { S::F16(s) => S::F16(self.f(s, d, l)?), S::F32(s) => S::F32(self.f(s, d, l)?), S::F64(s) => S::F64(self.f(s, d, l)?), + S::F8E4M3(s) => S::F8E4M3(self.f(s, d, l)?), }; Ok(out) } @@ -50,6 +51,7 @@ pub trait Map2 { (S::F16(s1), S::F16(s2)) => S::F16(self.f(s1, l1, s2, l2, d)?), (S::F32(s1), S::F32(s2)) => S::F32(self.f(s1, l1, s2, l2, d)?), (S::F64(s1), S::F64(s2)) => S::F64(self.f(s1, l1, s2, l2, d)?), + (S::F8E4M3(s1), S::F8E4M3(s2)) => S::F8E4M3(self.f(s1, l1, s2, l2, d)?), _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?, }; Ok(out) @@ -88,6 +90,9 @@ pub trait Map3 { (S::F16(s1), S::F16(s2), S::F16(s3)) => S::F16(self.f(s1, l1, s2, l2, s3, l3, d)?), (S::F32(s1), S::F32(s2), S::F32(s3)) => S::F32(self.f(s1, l1, s2, l2, s3, l3, d)?), (S::F64(s1), S::F64(s2), S::F64(s3)) => S::F64(self.f(s1, l1, s2, l2, s3, l3, d)?), + (S::F8E4M3(s1), S::F8E4M3(s2), S::F8E4M3(s3)) => { + S::F8E4M3(self.f(s1, l1, s2, l2, s3, l3, d)?) + } _ => Err(CudaError::InternalError("dtype mismatch in ternary op"))?, }; Ok(out) @@ -120,6 +125,7 @@ pub trait Map2InPlace { (S::F16(dst), S::F16(src)) => self.f(dst, dst_s, src, src_l, d), (S::F32(dst), S::F32(src)) => self.f(dst, dst_s, src, src_l, d), (S::F64(dst), S::F64(src)) => self.f(dst, dst_s, src, src_l, d), + (S::F8E4M3(dst), S::F8E4M3(src)) => self.f(dst, dst_s, src, src_l, d), _ => Err(CudaError::InternalError("dtype mismatch in binary op"))?, } } @@ -145,6 +151,7 @@ pub trait Map1Any { S::F16(s) => self.f(s, d, l, S::F16)?, S::F32(s) => self.f(s, d, l, S::F32)?, S::F64(s) => self.f(s, d, l, S::F64)?, + S::F8E4M3(s) => self.f(s, d, l, S::F8E4M3)?, }; Ok(out) } @@ -169,6 +176,7 @@ pub trait Map2Any { (S::F16(s1), S::F16(s2)) => self.f(s1, l1, s2, l2, d)?, (S::F32(s1), S::F32(s2)) => self.f(s1, l1, s2, l2, d)?, (S::F64(s1), S::F64(s2)) => self.f(s1, l1, s2, l2, d)?, + (S::F8E4M3(s1), S::F8E4M3(s2)) => self.f(s1, l1, s2, l2, d)?, _ => Err(CudaError::InternalError("dtype mismatch in binary op")).w()?, }; Ok(out) diff --git a/candle-core/src/display.rs b/candle-core/src/display.rs index 50e0129aeb..8b8f49503d 100644 --- a/candle-core/src/display.rs +++ b/candle-core/src/display.rs @@ -2,6 +2,7 @@ /// This implementation should be in line with the PyTorch version. /// https://github.com/pytorch/pytorch/blob/7b419e8513a024e172eae767e24ec1b849976b13/torch/_tensor_str.py use crate::{DType, Result, Tensor, WithDType}; +use float8::F8E4M3; use half::{bf16, f16}; impl Tensor { @@ -62,6 +63,7 @@ impl std::fmt::Debug for Tensor { DType::F16 => self.fmt_dt::(f), DType::F32 => self.fmt_dt::(f), DType::F64 => self.fmt_dt::(f), + DType::F8E4M3 => self.fmt_dt::(f), } } } @@ -511,6 +513,13 @@ impl std::fmt::Display for Tensor { writeln!(f)?; } } + DType::F8E4M3 => { + if let Ok(tf) = FloatFormatter::::new(&to_display, &po) { + let max_w = tf.max_width(&to_display); + tf.fmt_tensor(self, 1, max_w, summarize, &po, f)?; + writeln!(f)?; + } + } }; let device_str = match self.device().location() { diff --git a/candle-core/src/dtype.rs b/candle-core/src/dtype.rs index 42d3b1eef9..f40ec3f7e1 100644 --- a/candle-core/src/dtype.rs +++ b/candle-core/src/dtype.rs @@ -1,11 +1,14 @@ //! Types for elements that can be stored and manipulated using tensors. 
#![allow(clippy::redundant_closure_call)] use crate::backend::BackendStorage; +use crate::cpu::kernels::VecOps; use crate::{CpuStorage, CpuStorageRef, Error, Result}; /// The different types of elements allowed in tensors. #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub enum DType { + // Floating-point 8 bits integer (4-bit exponent, 3-bit mantissa). + F8E4M3, // Unsigned 8 bits integer. U8, // Unsigned 32 bits integer. @@ -50,6 +53,7 @@ impl std::str::FromStr for DType { "f16" => Ok(Self::F16), "f32" => Ok(Self::F32), "f64" => Ok(Self::F64), + "f8_e4m3" => Ok(Self::F8E4M3), _ => Err(DTypeParseError(s.to_string())), } } @@ -68,6 +72,7 @@ impl DType { Self::F16 => "f16", Self::F32 => "f32", Self::F64 => "f64", + Self::F8E4M3 => "f8_e4m3", } } @@ -75,6 +80,7 @@ impl DType { pub fn size_in_bytes(&self) -> usize { match self { Self::U8 => 1, + Self::F8E4M3 => 1, Self::U32 => 4, Self::I16 => 2, Self::I32 => 4, @@ -89,14 +95,14 @@ impl DType { pub fn is_int(&self) -> bool { match self { Self::U8 | Self::U32 | Self::I16 | Self::I32 | Self::I64 => true, - Self::BF16 | Self::F16 | Self::F32 | Self::F64 => false, + Self::BF16 | Self::F16 | Self::F32 | Self::F64 | Self::F8E4M3 => false, } } pub fn is_float(&self) -> bool { match self { Self::U8 | Self::U32 | Self::I16 | Self::I32 | Self::I64 => false, - Self::BF16 | Self::F16 | Self::F32 | Self::F64 => true, + Self::BF16 | Self::F16 | Self::F32 | Self::F64 | Self::F8E4M3 => true, } } } @@ -175,6 +181,7 @@ macro_rules! with_dtype { } }; } +use float8::F8E4M3; use half::{bf16, f16}; with_dtype!(u8, U8, |v: f64| v as u8, |v: u8| v as f64); @@ -186,6 +193,17 @@ with_dtype!(f16, F16, f16::from_f64, f16::to_f64); with_dtype!(bf16, BF16, bf16::from_f64, bf16::to_f64); with_dtype!(f32, F32, |v: f64| v as f32, |v: f32| v as f64); with_dtype!(f64, F64, |v: f64| v, |v: f64| v); +with_dtype!(F8E4M3, F8E4M3, |v: f64| F8E4M3::from_f64(v), |v: F8E4M3| v + .to_f64()); + +impl VecOps for F8E4M3 { + fn max(self, rhs: Self) -> Self { + F8E4M3::max(self, rhs) + } + fn min(self, rhs: Self) -> Self { + F8E4M3::min(self, rhs) + } +} pub trait IntDType: WithDType { fn is_true(&self) -> bool; @@ -243,3 +261,4 @@ impl FloatDType for f16 {} impl FloatDType for bf16 {} impl FloatDType for f32 {} impl FloatDType for f64 {} +impl FloatDType for F8E4M3 {} diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 7fad400eae..400ada07b6 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -103,6 +103,7 @@ impl BackendStorage for MetalStorage { DType::BF16 => Ok(CpuStorage::BF16(self.to_cpu()?)), DType::F32 => Ok(CpuStorage::F32(self.to_cpu()?)), DType::F64 => Ok(CpuStorage::F64(self.to_cpu()?)), + DType::F8E4M3 => Ok(CpuStorage::F64(self.to_cpu()?)), } } @@ -2133,6 +2134,7 @@ impl BackendDevice for MetalDevice { DType::F32 => "fill_f32", DType::I32 => "fill_i32", DType::I16 => "fill_i16", + DType::F8E4M3 => crate::bail!("Metal device does not yet support F8E4M3."), DType::F64 => { let cpu_storage = crate::cpu_backend::CpuDevice.ones_impl(shape, dtype)?; return self.storage_from_cpu_storage(&cpu_storage); @@ -2170,6 +2172,7 @@ impl BackendDevice for MetalDevice { CpuStorageRef::F16(storage) => (storage.len(), self.new_buffer_with_data(storage)), CpuStorageRef::F32(storage) => (storage.len(), self.new_buffer_with_data(storage)), CpuStorageRef::F64(storage) => (storage.len(), self.new_buffer_with_data(storage)), + CpuStorageRef::F8E4M3(_) => crate::bail!("Metal device does not yet support F8E4M3."), 
}; Ok(Self::Storage::new(buffer?, self.clone(), count, T::DTYPE)) } @@ -2185,6 +2188,7 @@ impl BackendDevice for MetalDevice { CpuStorage::F16(storage) => (storage.len(), self.new_buffer_with_data(storage)), CpuStorage::F32(storage) => (storage.len(), self.new_buffer_with_data(storage)), CpuStorage::F64(storage) => (storage.len(), self.new_buffer_with_data(storage)), + CpuStorage::F8E4M3(_) => crate::bail!("Metal device does not yet support F8E4M3."), }; Ok(Self::Storage::new( buffer?, diff --git a/candle-core/src/npy.rs b/candle-core/src/npy.rs index 33a4f4c728..28d5a63e90 100644 --- a/candle-core/src/npy.rs +++ b/candle-core/src/npy.rs @@ -27,11 +27,13 @@ //! ``` use crate::{DType, Device, Error, Result, Shape, Tensor}; use byteorder::{LittleEndian, ReadBytesExt}; +use float8::F8E4M3; use half::{bf16, f16, slice::HalfFloatSliceExt}; use std::collections::HashMap; use std::fs::File; use std::io::{BufReader, Read, Write}; use std::path::Path; +use std::slice; const NPY_MAGIC_STRING: &[u8] = b"\x93NUMPY"; const NPY_SUFFIX: &str = ".npy"; @@ -90,6 +92,7 @@ impl Header { DType::I64 => "i8", DType::U32 => "u4", DType::U8 => "u1", + DType::F8E4M3 => Err(Error::Npy("f8e4m3 is not supported".into()))?, }; if !shape.is_empty() { shape.push(',') @@ -251,6 +254,13 @@ impl Tensor { reader.read_i64_into::(&mut data_t)?; Tensor::from_vec(data_t, shape, &Device::Cpu) } + DType::F8E4M3 => { + let mut data_t = vec![F8E4M3::ZERO; elem_count]; + let ptr = data_t.as_mut_ptr().cast::(); + let len = data_t.len(); + reader.read_i8_into(unsafe { slice::from_raw_parts_mut(ptr, len) })?; + Tensor::from_vec(data_t, shape, &Device::Cpu) + } } } diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs index 3786a82aaf..208977913a 100644 --- a/candle-core/src/op.rs +++ b/candle-core/src/op.rs @@ -1,5 +1,6 @@ #![allow(clippy::redundant_closure_call)] use crate::Tensor; +use float8::F8E4M3; use half::{bf16, f16}; use num_traits::float::Float; @@ -187,6 +188,7 @@ pub trait UnaryOpT { fn f16(v1: f16) -> f16; fn f32(v1: f32) -> f32; fn f64(v1: f64) -> f64; + fn f8e4m3(v1: F8E4M3) -> F8E4M3; fn u8(v1: u8) -> u8; fn u32(v1: u32) -> u32; fn i16(v1: i16) -> i16; @@ -199,6 +201,8 @@ pub trait UnaryOpT { fn bf16_vec(_xs: &[bf16], _ys: &mut [bf16]) {} const F16_VEC: bool = false; fn f16_vec(_xs: &[f16], _ys: &mut [f16]) {} + const F8E4M3_VEC: bool = false; + fn f8e4m3_vec(_xs: &[F8E4M3], _ys: &mut [F8E4M3]) {} const F32_VEC: bool = false; fn f32_vec(_xs: &[f32], _ys: &mut [f32]) {} const F64_VEC: bool = false; @@ -213,6 +217,7 @@ pub trait BinaryOpT { fn f16(v1: f16, v2: f16) -> f16; fn f32(v1: f32, v2: f32) -> f32; fn f64(v1: f64, v2: f64) -> f64; + fn f8e4m3(v1: F8E4M3, v2: F8E4M3) -> F8E4M3; fn u8(v1: u8, v2: u8) -> u8; fn u32(v1: u32, v2: u32) -> u32; fn i16(v1: i16, v2: i16) -> i16; @@ -227,6 +232,8 @@ pub trait BinaryOpT { fn f32_vec(_xs1: &[f32], _xs2: &[f32], _ys: &mut [f32]) {} const F64_VEC: bool = false; fn f64_vec(_xs1: &[f64], _xs2: &[f64], _ys: &mut [f64]) {} + const F8E4M3_VEC: bool = false; + fn f8e4m3_vec(_xs1: &[F8E4M3], __xs2: &[F8E4M3], _ys: &mut [F8E4M3]) {} const U8_VEC: bool = false; fn u8_vec(_xs1: &[u8], _xs2: &[u8], _ys: &mut [u8]) {} const U32_VEC: bool = false; @@ -288,6 +295,10 @@ macro_rules! bin_op { $e(v1, v2) } #[inline(always)] + fn f8e4m3(v1: F8E4M3, v2: F8E4M3) -> F8E4M3 { + $e(v1, v2) + } + #[inline(always)] fn u8(v1: u8, v2: u8) -> u8 { $e(v1, v2) } @@ -376,6 +387,10 @@ macro_rules! 
unary_op { $e } #[inline(always)] + fn f8e4m3($a: F8E4M3) -> F8E4M3 { + $e + } + #[inline(always)] fn f32($a: f32) -> f32 { $e } @@ -428,6 +443,10 @@ macro_rules! unary_op { $e } #[inline(always)] + fn f8e4m3($a: F8E4M3) -> F8E4M3 { + $e + } + #[inline(always)] fn u8(_: u8) -> u8 { todo!("no unary function for u8") } @@ -527,6 +546,17 @@ impl UnaryOpT for Gelu { )) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + F8E4M3::from_f32(0.5) + * v + * (F8E4M3::ONE + + F8E4M3::tanh( + F8E4M3::from_f32(SQRT_TWO_OVER_PI_F32) + * v + * (F8E4M3::ONE + F8E4M3::from_f32(0.044715) * v * v), + )) + } + #[inline(always)] fn f32(v: f32) -> f32 { 0.5 * v * (1.0 + f32::tanh(SQRT_TWO_OVER_PI_F32 * v * (1.0 + 0.044715 * v * v))) } @@ -608,6 +638,10 @@ impl UnaryOpT for Erf { f16::from_f64(Self::f64(v.to_f64())) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + F8E4M3::from_f64(Self::f64(v.to_f64())) + } + #[inline(always)] fn f32(v: f32) -> f32 { Self::f64(v as f64) as f32 } @@ -650,6 +684,10 @@ impl UnaryOpT for Silu { v / (f16::ONE + (-v).exp()) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v / (F8E4M3::ONE + (-v).exp()) + } + #[inline(always)] fn f32(v: f32) -> f32 { v / (1.0 + (-v).exp()) } @@ -729,6 +767,10 @@ impl UnaryOpT for Abs { v.abs() } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v.abs() + } + #[inline(always)] fn f32(v: f32) -> f32 { v.abs() } @@ -771,6 +813,10 @@ impl UnaryOpT for Ceil { v.ceil() } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v.ceil() + } + #[inline(always)] fn f32(v: f32) -> f32 { v.ceil() } @@ -813,6 +859,10 @@ impl UnaryOpT for Floor { v.floor() } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v.floor() + } + #[inline(always)] fn f32(v: f32) -> f32 { v.floor() } @@ -855,6 +905,10 @@ impl UnaryOpT for Round { v.round() } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v.round() + } + #[inline(always)] fn f32(v: f32) -> f32 { v.round() } @@ -897,6 +951,10 @@ impl UnaryOpT for GeluErf { f16::from_f64(Self::f64(v.to_f64())) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + F8E4M3::from_f64(Self::f64(v.to_f64())) + } + #[inline(always)] fn f32(v: f32) -> f32 { Self::f64(v as f64) as f32 } @@ -939,6 +997,10 @@ impl UnaryOpT for Relu { v.max(f16::ZERO) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + v.max(F8E4M3::ZERO) + } + #[inline(always)] fn f32(v: f32) -> f32 { v.max(0f32) } @@ -1045,6 +1107,11 @@ impl UnaryOpT for Sign { f16::from((v > f16::ZERO) as i8) - f16::from((v < f16::ZERO) as i8) } #[inline(always)] + fn f8e4m3(v: F8E4M3) -> F8E4M3 { + F8E4M3::from((v > F8E4M3::ZERO) as i8 as f32) + - F8E4M3::from((v < F8E4M3::ZERO) as i8 as f32) + } + #[inline(always)] fn f32(v: f32) -> f32 { f32::from(v > 0.) - f32::from(v < 0.) 
} diff --git a/candle-core/src/safetensors.rs b/candle-core/src/safetensors.rs index 12436a0903..52df166313 100644 --- a/candle-core/src/safetensors.rs +++ b/candle-core/src/safetensors.rs @@ -1,4 +1,5 @@ use crate::{DType, Device, Error, Result, Tensor, WithDType}; +use float8::F8E4M3; use safetensors::tensor as st; use safetensors::tensor::SafeTensors; use std::borrow::Cow; @@ -17,6 +18,7 @@ impl From for st::Dtype { DType::F16 => st::Dtype::F16, DType::F32 => st::Dtype::F32, DType::F64 => st::Dtype::F64, + DType::F8E4M3 => st::Dtype::F8_E4M3, } } } @@ -32,6 +34,7 @@ impl TryFrom for DType { st::Dtype::F16 => Ok(DType::F16), st::Dtype::F32 => Ok(DType::F32), st::Dtype::F64 => Ok(DType::F64), + st::Dtype::F8_E4M3 => Ok(DType::F8E4M3), dtype => Err(Error::UnsupportedSafeTensorDtype(dtype)), } } @@ -196,6 +199,7 @@ impl Tensor { DType::F16 => convert_slice::(data, shape, device), DType::F32 => convert_slice::(data, shape, device), DType::F64 => convert_slice::(data, shape, device), + DType::F8E4M3 => convert_slice::(data, shape, device), } } } @@ -232,6 +236,7 @@ fn convert_back(tensor: &Tensor) -> Result> { DType::BF16 => Ok(convert_back_::(tensor.to_vec1()?)), DType::F32 => Ok(convert_back_::(tensor.to_vec1()?)), DType::F64 => Ok(convert_back_::(tensor.to_vec1()?)), + DType::F8E4M3 => Ok(convert_back_::(tensor.to_vec1()?)), } } diff --git a/candle-core/src/sort.rs b/candle-core/src/sort.rs index b48f74ba5c..34ef0259e2 100644 --- a/candle-core/src/sort.rs +++ b/candle-core/src/sort.rs @@ -72,6 +72,7 @@ impl crate::CustomOp1 for ArgSort { crate::CpuStorage::F16(vs) => self.asort(vs, layout), crate::CpuStorage::F32(vs) => self.asort(vs, layout), crate::CpuStorage::F64(vs) => self.asort(vs, layout), + crate::CpuStorage::F8E4M3(vs) => self.asort(vs, layout), }; let sort_indexes = crate::CpuStorage::U32(sort_indexes); Ok((sort_indexes, layout.shape().into())) @@ -153,6 +154,7 @@ impl crate::CustomOp1 for ArgSort { DType::I64 => "asort_asc_i64", DType::I32 => "asort_asc_i32", DType::I16 => "asort_asc_i16", + DType::F8E4M3 => crate::bail!("Metal device does not yet support F8E4M3."), } } else { match storage.dtype() { @@ -165,6 +167,7 @@ impl crate::CustomOp1 for ArgSort { DType::I64 => "asort_desc_i64", DType::I32 => "asort_desc_i32", DType::I16 => "asort_desc_i16", + DType::F8E4M3 => crate::bail!("Metal device does not yet support F8E4M3."), } } }; diff --git a/candle-kernels/src/affine.cu b/candle-kernels/src/affine.cu index 301bcd5a64..ef75dffd36 100644 --- a/candle-kernels/src/affine.cu +++ b/candle-kernels/src/affine.cu @@ -1,7 +1,7 @@ #include "cuda_utils.cuh" #include -#define AFFINE_OP(TYPENAME, FN_NAME) \ +#define AFFINE_OP(TYPENAME, FN_NAME, AFFINE) \ extern "C" __global__ void FN_NAME( \ const size_t numel, \ const size_t num_dims, \ @@ -16,30 +16,34 @@ extern "C" __global__ void FN_NAME( \ if (info == nullptr || is_contiguous(num_dims, dims, strides)) { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ TYPENAME x = inp ? inp[i] : out[i]; \ - out[i] = x * mul + add; \ + out[i] = AFFINE; \ } \ } \ else { \ for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { \ unsigned strided_i = get_strided_index(i, num_dims, dims, strides); \ TYPENAME x = inp ? 
inp[strided_i] : out[i]; \ - out[i] = x * mul + add; \ + out[i] = AFFINE; \ } \ } \ } \ #if __CUDA_ARCH__ >= 800 -AFFINE_OP(__nv_bfloat16, affine_bf16) +AFFINE_OP(__nv_bfloat16, affine_bf16, x * mul + add) + +#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)) + +AFFINE_OP(__nv_fp8_e4m3, affine_f8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x) * F8E4M3_TO_FLOAT(mul) + F8E4M3_TO_FLOAT(add))) #endif #if __CUDA_ARCH__ >= 530 -AFFINE_OP(__half, affine_f16) +AFFINE_OP(__half, affine_f16, x * mul + add) #endif -AFFINE_OP(float, affine_f32) -AFFINE_OP(double, affine_f64) -AFFINE_OP(uint8_t, affine_u8) -AFFINE_OP(uint32_t, affine_u32) -AFFINE_OP(int16_t, affine_i16) -AFFINE_OP(int32_t, affine_i32) -AFFINE_OP(int64_t, affine_i64) +AFFINE_OP(float, affine_f32, x * mul + add) +AFFINE_OP(double, affine_f64, x * mul + add) +AFFINE_OP(uint8_t, affine_u8, x * mul + add) +AFFINE_OP(uint32_t, affine_u32, x * mul + add) +AFFINE_OP(int16_t, affine_i16, x * mul + add) +AFFINE_OP(int32_t, affine_i32, x * mul + add) +AFFINE_OP(int64_t, affine_i64, x * mul + add) diff --git a/candle-kernels/src/binary.cu b/candle-kernels/src/binary.cu index 99ab23b875..7bda3e463e 100644 --- a/candle-kernels/src/binary.cu +++ b/candle-kernels/src/binary.cu @@ -14,6 +14,21 @@ BINARY_OP_OUT(__nv_bfloat16, uint8_t, lt_bf16, x < y) BINARY_OP_OUT(__nv_bfloat16, uint8_t, le_bf16, x <= y) BINARY_OP_OUT(__nv_bfloat16, uint8_t, gt_bf16, x > y) BINARY_OP_OUT(__nv_bfloat16, uint8_t, ge_bf16, x >= y) + +#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)) + +BINARY_OP(__nv_fp8_e4m3, badd_f8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x) + F8E4M3_TO_FLOAT(y))) +BINARY_OP(__nv_fp8_e4m3, bdiv_f8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x) / F8E4M3_TO_FLOAT(y))) +BINARY_OP(__nv_fp8_e4m3, bmul_f8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x) * F8E4M3_TO_FLOAT(y))) +BINARY_OP(__nv_fp8_e4m3, bsub_f8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x) - F8E4M3_TO_FLOAT(y))) +BINARY_OP(__nv_fp8_e4m3, bmaximum_f8_e4m3, maxg(x, y)) +BINARY_OP(__nv_fp8_e4m3, bminimum_f8_e4m3, ming(x, y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, eq_f8_e4m3, F8E4M3_TO_FLOAT(x) == F8E4M3_TO_FLOAT(y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, ne_f8_e4m3, F8E4M3_TO_FLOAT(x) != F8E4M3_TO_FLOAT(y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, lt_f8_e4m3, F8E4M3_TO_FLOAT(x) < F8E4M3_TO_FLOAT(y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, le_f8_e4m3, F8E4M3_TO_FLOAT(x) <= F8E4M3_TO_FLOAT(y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, gt_f8_e4m3, F8E4M3_TO_FLOAT(x) > F8E4M3_TO_FLOAT(y)) +BINARY_OP_OUT(__nv_fp8_e4m3, uint8_t, ge_f8_e4m3, F8E4M3_TO_FLOAT(x) >= F8E4M3_TO_FLOAT(y)) #endif #if __CUDA_ARCH__ >= 530 diff --git a/candle-kernels/src/cast.cu b/candle-kernels/src/cast.cu index e288bf1812..207425b308 100644 --- a/candle-kernels/src/cast.cu +++ b/candle-kernels/src/cast.cu @@ -24,6 +24,53 @@ __device__ void cast_( } } +#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)) + +template +__device__ void cast_fp8_( + const size_t numel, + const size_t num_dims, + const size_t *info, + const __nv_fp8_e4m3 *inp, + T *out +) { + const size_t *dims = info; + const size_t *strides = info + num_dims; + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { + out[i] = F8E4M3_TO_FLOAT(inp[i]); + } + } + else { + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { + unsigned strided_i = 
get_strided_index(i, num_dims, dims, strides); + out[i] = F8E4M3_TO_FLOAT(inp[strided_i]); + } + } +} +template +__device__ void cast_fp8_into_( + const size_t numel, + const size_t num_dims, + const size_t *info, + const S *inp, + __nv_fp8_e4m3 *out +) { + const size_t *dims = info; + const size_t *strides = info + num_dims; + if (info == nullptr || is_contiguous(num_dims, dims, strides)) { + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { + out[i] = __nv_fp8_e4m3((float)inp[i]); + } + } + else { + for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) { + unsigned strided_i = get_strided_index(i, num_dims, dims, strides); + out[i] = __nv_fp8_e4m3((float)inp[strided_i]); + } + } +} + template __device__ void cast_through( const size_t numel, @@ -59,6 +106,30 @@ extern "C" __global__ void FN_NAME( \ cast_(numel, num_dims, info, inp, out); \ } \ + +#define CAST_OP_FP8(SRC_TYPENAME, DST_TYPENAME, FN_NAME) \ +extern "C" __global__ void FN_NAME( \ + const size_t numel, \ + const size_t num_dims, \ + const size_t *info, \ + const SRC_TYPENAME *inp, \ + DST_TYPENAME *out \ +) { \ + cast_fp8_(numel, num_dims, info, inp, out); \ +} \ + + +#define CAST_OP_FP8_INTO(SRC_TYPENAME, DST_TYPENAME, FN_NAME) \ +extern "C" __global__ void FN_NAME( \ + const size_t numel, \ + const size_t num_dims, \ + const size_t *info, \ + const SRC_TYPENAME *inp, \ + DST_TYPENAME *out \ +) { \ + cast_fp8_into_(numel, num_dims, info, inp, out); \ +} \ + #define CAST_THROUGH_OP(SRC_TYPENAME, DST_TYPENAME, INT_TYPENAME, FN_NAME) \ extern "C" __global__ void FN_NAME( \ const size_t numel, \ @@ -72,6 +143,7 @@ extern "C" __global__ void FN_NAME( \ #if __CUDA_ARCH__ >= 800 CAST_OP(__nv_bfloat16, __nv_bfloat16, cast_bf16_bf16) +CAST_OP(__nv_fp8_e4m3, __nv_fp8_e4m3, cast_f8_e4m3_f8_e4m3) CAST_OP(__nv_bfloat16, uint32_t, cast_bf16_u32) CAST_OP(__nv_bfloat16, float, cast_bf16_f32) @@ -85,6 +157,19 @@ CAST_THROUGH_OP(__nv_bfloat16, __half, float, cast_bf16_f16) CAST_THROUGH_OP(__half, __nv_bfloat16, float, cast_f16_bf16) CAST_THROUGH_OP(int32_t, __nv_bfloat16, float, cast_i32_bf16) CAST_THROUGH_OP(__nv_bfloat16, int32_t, float, cast_bf16_i32) + +CAST_OP_FP8(__nv_fp8_e4m3, float, cast_f8_e4m3_f32) +CAST_OP_FP8_INTO(float, __nv_fp8_e4m3, cast_f32_f8_e4m3) +CAST_OP_FP8(__nv_fp8_e4m3, uint8_t, cast_f8_e4m3_u8) +CAST_OP_FP8(__nv_fp8_e4m3, __half, cast_f8_e4m3_f16) +CAST_OP_FP8(__nv_fp8_e4m3, double, cast_f8_e4m3_f64) +CAST_OP_FP8_INTO(__half, __nv_fp8_e4m3, cast_f16_f8_e4m3) +CAST_OP_FP8_INTO(double, __nv_fp8_e4m3, cast_f64_f8_e4m3) +CAST_OP_FP8_INTO(uint8_t, __nv_fp8_e4m3, cast_u8_f8_e4m3) +CAST_OP_FP8_INTO(int32_t, __nv_fp8_e4m3, cast_i32_f8_e4m3) +CAST_OP_FP8(__nv_fp8_e4m3, int32_t, cast_f8_e4m3_i32) +CAST_OP_FP8(__nv_fp8_e4m3, __nv_bfloat16, cast_f8_e4m3_bf16) +CAST_OP_FP8_INTO(__nv_bfloat16, __nv_fp8_e4m3, cast_bf16_f8_e4m3) #else #include #if CUDA_VERSION >= 11000 @@ -98,6 +183,7 @@ CAST_THROUGH_OP(double, __nv_bfloat16, float, cast_f64_bf16) CAST_THROUGH_OP(uint8_t, __nv_bfloat16, float, cast_u8_bf16) CAST_THROUGH_OP(int32_t, __nv_bfloat16, float, cast_i32_bf16) CAST_THROUGH_OP(__nv_bfloat16, int32_t, float, cast_bf16_i32) +CAST_THROUGH_OP(__nv_bfloat16, __nv_fp8_e4m3, float, cast_bf16_f8_e4m3) #endif #endif diff --git a/candle-kernels/src/compatibility.cuh b/candle-kernels/src/compatibility.cuh index d0791749bb..1e4cf215c1 100644 --- a/candle-kernels/src/compatibility.cuh +++ b/candle-kernels/src/compatibility.cuh @@ -1,5 +1,6 @@ 
#include "cuda_fp16.h" #include "cuda_bf16.h" +#include "cuda_fp8.h" // Table showing which features are supported on which compute capability // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#features-and-technical-specifications diff --git a/candle-kernels/src/conv.cu b/candle-kernels/src/conv.cu index fa834faa3a..6ca6fd7c2b 100644 --- a/candle-kernels/src/conv.cu +++ b/candle-kernels/src/conv.cu @@ -702,6 +702,18 @@ UPSAMPLE_NEAREST2D_OP(__nv_bfloat16, upsample_nearest2d_bf16) IM2COL_OP(__nv_bfloat16, im2col_bf16) IM2COL1D_OP(__nv_bfloat16, im2col1d_bf16) COL2IM1D_OP(__nv_bfloat16, col2im1d_bf16) + +// NOTE: No conv ops for f8 +// CONV1D_OP(__nv_bfloat16, float, conv1d_f8_e5m) +// CONV2D_OP(__nv_fp8_e4m3, float, conv2d_f8_e5m) +// CONVT1D_OP(__nv_fp8_e4m3, float, conv_transpose1d_f8_e5m) +// CONVT2D_OP(__nv_fp8_e4m3, float, conv_transpose2d_f8_e5m) +// AVG_POOL2D_OP(__nv_fp8_e4m3, float, avg_pool2d_f8_e5m) +// MAX_POOL2D_OP(__nv_fp8_e4m3, max_pool2d_f8_e5m) +// UPSAMPLE_NEAREST2D_OP(__nv_fp8_e4m3, upsample_nearest2d_f8_e5m) +// IM2COL_OP(__nv_fp8_e4m3, im2col_f8_e5m) +// IM2COL1D_OP(__nv_fp8_e4m3, im2col1d_f8_e5m) +// COL2IM1D_OP(__nv_fp8_e4m3, col2im1d_f8_e5m) #endif #if __CUDA_ARCH__ >= 530 diff --git a/candle-kernels/src/cuda_utils.cuh b/candle-kernels/src/cuda_utils.cuh index df1497f672..f3af71b18c 100644 --- a/candle-kernels/src/cuda_utils.cuh +++ b/candle-kernels/src/cuda_utils.cuh @@ -231,4 +231,27 @@ __device__ __forceinline__ __nv_bfloat16 logg(__nv_bfloat16 a) { return hlog(a); __device__ __forceinline__ __nv_bfloat16 expg(__nv_bfloat16 a) { return hexp(a); } __device__ __forceinline__ __nv_bfloat16 absg(__nv_bfloat16 a) { return __habs(a); } __device__ __forceinline__ __nv_bfloat16 copysigng(__nv_bfloat16 a, __nv_bfloat16 b) { return __float2bfloat16(copysignf(__bfloat162float(a), __bfloat162float(b))); } + +#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3)) + +__device__ __forceinline__ __nv_fp8_e4m3 powg(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) { return __nv_fp8_e4m3(powf(F8E4M3_TO_FLOAT(a), F8E4M3_TO_FLOAT(b))); } +__device__ __forceinline__ bool isnang(__nv_fp8_e4m3 a) { return isnan(F8E4M3_TO_FLOAT(a)); } +__device__ __forceinline__ __nv_fp8_e4m3 sqrtg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(sqrtf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 cosg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(cosf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 sing(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(sinf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 recipg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(1. 
/ F8E4M3_TO_FLOAT(a)); } +__device__ __forceinline__ __nv_fp8_e4m3 maxg(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) { return __nv_fp8_e4m3(fmaxf(F8E4M3_TO_FLOAT(a), F8E4M3_TO_FLOAT(b))); } +__device__ __forceinline__ __nv_fp8_e4m3 tanhg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(tanhf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 erfg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(erff(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 ceilg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(ceilf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 floorg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(floorf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 roundg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(roundf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 normcdfg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(normcdff(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 ming(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) { return __nv_fp8_e4m3(fminf(F8E4M3_TO_FLOAT(a), F8E4M3_TO_FLOAT(b))); } +__device__ __forceinline__ __nv_fp8_e4m3 logg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(logf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 expg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(expf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 absg(__nv_fp8_e4m3 a) { return __nv_fp8_e4m3(fabsf(F8E4M3_TO_FLOAT(a))); } +__device__ __forceinline__ __nv_fp8_e4m3 copysigng(__nv_fp8_e4m3 a, __nv_fp8_e4m3 b) { return __nv_fp8_e4m3(copysignf(F8E4M3_TO_FLOAT(a), F8E4M3_TO_FLOAT(b))); } + + #endif diff --git a/candle-kernels/src/fill.cu b/candle-kernels/src/fill.cu index 0654c2631b..eeea8d4cd4 100644 --- a/candle-kernels/src/fill.cu +++ b/candle-kernels/src/fill.cu @@ -47,6 +47,11 @@ COPY2D_OP(__half, copy2d_f16) #if __CUDA_ARCH__ >= 800 #include +#include + extern "C" __global__ void fill_bf16(__nv_bfloat16 *buf, __nv_bfloat16 value, const size_t numel) { fill_with(buf, value, numel); } COPY2D_OP(__nv_bfloat16, copy2d_bf16) + +extern "C" __global__ void fill_f8_e4m3(__nv_fp8_e4m3 *buf, __nv_fp8_e4m3 value, const size_t numel) { fill_with(buf, value, numel); } +COPY2D_OP(__nv_fp8_e4m3, copy2d_f8_e4m3) #endif diff --git a/candle-kernels/src/fused_layer_norm.cu b/candle-kernels/src/fused_layer_norm.cu deleted file mode 100644 index cea64c519b..0000000000 --- a/candle-kernels/src/fused_layer_norm.cu +++ /dev/null @@ -1,329 +0,0 @@ -// Based on https://github.com/NVIDIA/apex/blob/master/apex/contrib/csrc/multihead_attn/layer_norm.cuh#L243 -// Modified Eric Buehler 2024 - -#include "cuda_fp16.h" -#include -#include - -#if __CUDA_ARCH__ >= 800 -#include -#endif - -template -__device__ void cuWelfordOnlineSum(const U curr, U &mu, U &sigma2, U &count) { - count = count + U(1); - U delta = curr - mu; - U lmean = mu + delta / count; - mu = lmean; - U delta2 = curr - lmean; - sigma2 = sigma2 + delta * delta2; -} - -template -__device__ void cuChanOnlineSum(const U muB, const U sigma2B, const U countB, - U &mu, U &sigma2, U &count) { - U delta = muB - mu; - U nA = count; - U nB = countB; - count = count + countB; - U nX = count; - if (nX > U(0)) { - nA = nA / nX; - nB = nB / nX; - mu = nA * mu + nB * muB; - sigma2 = sigma2 + sigma2B + delta * delta * nA * nB * nX; - } else { - mu = U(0); - sigma2 = U(0); - } -} - -// https://github.com/pytorch/pytorch/blob/7fe0cc53e903e515e86b4a350614011c66e3b32d/aten/src/ATen/cuda/DeviceUtils.cuh#L50 -template -__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = warpSize, unsigned int mask = 
0xffffffff) -{ -#if !defined(USE_ROCM) - return __shfl_sync(mask, value, srcLane, width); -#else - return __shfl(value, srcLane, width); -#endif -} - -template -__device__ void cuWelfordMuSigma2(const T *__restrict__ vals, const int n1, - const int n2, const int i1, U &mu, U &sigma2, - U *buf) { - // Assumptions: - // 1) blockDim.x == warpSize - // 2) Tensor is contiguous - // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. - // - // compute variance and mean over n2 - U count = U(0); - mu = U(0); - sigma2 = U(0); - if (i1 < n1) { - // one warp normalizes one n1 index, - // synchronization is implicit - // initialize with standard Welford algorithm - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - const T *lvals = vals + i1 * n2; - int l = 4 * thrx; - for (; l + 3 < n2; l += 4 * numx) { - for (int k = 0; k < 4; ++k) { - U curr = static_cast(lvals[l + k]); - cuWelfordOnlineSum(curr, mu, sigma2, count); - } - } - for (; l < n2; ++l) { - U curr = static_cast(lvals[l]); - cuWelfordOnlineSum(curr, mu, sigma2, count); - } - // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x + (1 << l)) & 31; - U muB = WARP_SHFL(mu, srcLaneB); - U countB = WARP_SHFL(count, srcLaneB); - U sigma2B = WARP_SHFL(sigma2, srcLaneB); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - // threadIdx.x == 0 has correct values for each warp - // inter-warp reductions - if (blockDim.y > 1) { - U *ubuf = (U *)buf; - U *ibuf = (U *)(ubuf + blockDim.y); - for (int offset = blockDim.y / 2; offset > 0; offset /= 2) { - // upper half of warps write to shared - if (threadIdx.x == 0 && threadIdx.y >= offset && - threadIdx.y < 2 * offset) { - const int wrt_y = threadIdx.y - offset; - ubuf[2 * wrt_y] = mu; - ubuf[2 * wrt_y + 1] = sigma2; - ibuf[wrt_y] = count; - } - __syncthreads(); - // lower half merges - if (threadIdx.x == 0 && threadIdx.y < offset) { - U muB = ubuf[2 * threadIdx.y]; - U sigma2B = ubuf[2 * threadIdx.y + 1]; - U countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - __syncthreads(); - } - // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values - if (threadIdx.x == 0 && threadIdx.y == 0) { - ubuf[0] = mu; - ubuf[1] = sigma2; - } - __syncthreads(); - mu = ubuf[0]; - sigma2 = ubuf[1] / U(n2); - // don't care about final value of count, we know count == n2 - } else { - mu = WARP_SHFL(mu, 0); - sigma2 = WARP_SHFL(sigma2 / U(n2), 0); - } - } -} - -template <> -__device__ void cuWelfordMuSigma2(const __half *__restrict__ vals, - const int n1, const int n2, const int i1, - float &mu, float &sigma2, float *buf) { - // Assumptions: - // 1) blockDim.x == warpSize - // 2) Tensor is contiguous - // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. 
- // - // compute variance and mean over n2 - float count = 0.0f; - mu = float(0); - sigma2 = float(0); - - if (i1 < n1) { - // one warp normalizes one n1 index, - // synchronization is implicit - // initialize with standard Welford algorithm - const int numx = blockDim.x * blockDim.y; - const int thrx = threadIdx.x + threadIdx.y * blockDim.x; - const __half *lvals = vals + i1 * n2; - int l = 8 * thrx; - if ((((size_t)lvals) & 3) != 0) { - // 16 bit alignment - // first thread consumes first point - if (thrx == 0) { - float curr = static_cast(lvals[0]); - cuWelfordOnlineSum(curr, mu, sigma2, count); - } - ++l; - } - // at this point, lvals[l] are 32 bit aligned for all threads. - for (; l + 7 < n2; l += 8 * numx) { - for (int k = 0; k < 8; k += 2) { - float2 curr = __half22float2(*((__half2 *)(lvals + l + k))); - cuWelfordOnlineSum(curr.x, mu, sigma2, count); - cuWelfordOnlineSum(curr.y, mu, sigma2, count); - } - } - for (; l < n2; ++l) { - float curr = static_cast(lvals[l]); - cuWelfordOnlineSum(curr, mu, sigma2, count); - } - // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x + (1 << l)) & 31; - float muB = WARP_SHFL(mu, srcLaneB); - float countB = WARP_SHFL(count, srcLaneB); - float sigma2B = WARP_SHFL(sigma2, srcLaneB); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - // threadIdx.x == 0 has correct values for each warp - // inter-warp reductions - if (blockDim.y > 1) { - float *ubuf = (float *)buf; - float *ibuf = (float *)(ubuf + blockDim.y); - for (int offset = blockDim.y / 2; offset > 0; offset /= 2) { - // upper half of warps write to shared - if (threadIdx.x == 0 && threadIdx.y >= offset && - threadIdx.y < 2 * offset) { - const int wrt_y = threadIdx.y - offset; - ubuf[2 * wrt_y] = mu; - ubuf[2 * wrt_y + 1] = sigma2; - ibuf[wrt_y] = count; - } - __syncthreads(); - // lower half merges - if (threadIdx.x == 0 && threadIdx.y < offset) { - float muB = ubuf[2 * threadIdx.y]; - float sigma2B = ubuf[2 * threadIdx.y + 1]; - float countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); - } - __syncthreads(); - } - // threadIdx.x = 0 && threadIdx.y == 0 only thread that has correct values - if (threadIdx.x == 0 && threadIdx.y == 0) { - ubuf[0] = mu; - ubuf[1] = sigma2; - } - __syncthreads(); - mu = ubuf[0]; - sigma2 = ubuf[1] / float(n2); - // don't care about final value of count, we know count == n2 - } else { - mu = WARP_SHFL(mu, 0); - sigma2 = WARP_SHFL(sigma2 / float(n2), 0); - } - } -} - -template __device__ U rsqrt(U v) { return U(1) / sqrt(v); } -template <> __device__ float rsqrt(float v) { return rsqrtf(v); } -template <> __device__ double rsqrt(double v) { return rsqrt(v); } -template <> __device__ __half rsqrt(__half v) { return rsqrt(v); } -#if __CUDA_ARCH__ >= 800 -template <> __device__ __nv_bfloat16 rsqrt(__nv_bfloat16 v) { return rsqrt(v); } -#endif - -// This is the un-specialized struct. Note that we prevent instantiation of -// this struct by putting an undefined symbol in the function body so it won't -// compile. 
-// template <typename T>
-// struct SharedMemory
-// {
-//     // Ensure that we won't compile any un-specialized types
-//     __device__ T *getPointer()
-//     {
-//         extern __device__ void error(void);
-//         error();
-//         return NULL;
-//     }
-// };
-// https://github.com/NVIDIA/apex/issues/246
-template <typename T> struct SharedMemory;
-template <> struct SharedMemory<float> {
-  __device__ float *getPointer() {
-    extern __shared__ float s_float[];
-    return s_float;
-  }
-};
-
-template <> struct SharedMemory<__half> {
-  __device__ __half *getPointer() {
-    extern __shared__ __half s_half[];
-    return s_half;
-  }
-};
-
-#if __CUDA_ARCH__ >= 800
-template <> struct SharedMemory<__nv_bfloat16> {
-  __device__ __nv_bfloat16 *getPointer() {
-    extern __shared__ __nv_bfloat16 s_bf[];
-    return s_bf;
-  }
-};
-#endif
-
-template <typename T, typename U>
-__device__ void
-cuApplyLayerNorm(T *__restrict__ output_vals, U *__restrict__ mean,
-                 U *__restrict__ invvar, const T *__restrict__ vals,
-                 const int n1, const int n2, const U epsilon,
-                 const T *__restrict__ gamma, const T *__restrict__ beta) {
-  // Assumptions:
-  // 1) blockDim.x == warpSize
-  // 2) Tensors are contiguous
-  //
-  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
-    SharedMemory<U> shared;
-    U *buf = shared.getPointer();
-    U mu, sigma2;
-    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
-    const T *lvals = vals + i1 * n2;
-    T *ovals = output_vals + i1 * n2;
-    U c_invvar = rsqrt(sigma2 + epsilon);
-    const int numx = blockDim.x * blockDim.y;
-    const int thrx = threadIdx.x + threadIdx.y * blockDim.x;
-    if (gamma != NULL && beta != NULL) {
-      for (int i = thrx; i < n2; i += numx) {
-        U curr = static_cast<U>(lvals[i]);
-        ovals[i] = gamma[i] * static_cast<T>(c_invvar * (curr - mu)) + beta[i];
-      }
-    } else {
-      for (int i = thrx; i < n2; i += numx) {
-        U curr = static_cast<U>(lvals[i]);
-        ovals[i] = static_cast<T>(c_invvar * (curr - mu));
-      }
-    }
-    if (threadIdx.x == 0 && threadIdx.y == 0) {
-      mean[i1] = mu;
-      invvar[i1] = c_invvar;
-    }
-  }
-}
-
-extern "C" __global__ void layernorm_f16(__half *__restrict__ output_vals, __half *__restrict__ mean,
-                                         __half *__restrict__ invvar, const __half *__restrict__ vals,
-                                         const int n1, const int n2, const __half epsilon,
-                                         const __half *__restrict__ gamma, const __half *__restrict__ beta) {
-  cuApplyLayerNorm(output_vals, mean, invvar, vals, n1, n2, epsilon, gamma, beta);
-}
-
-extern "C" __global__ void layernorm_f32(float *__restrict__ output_vals, float *__restrict__ mean,
-                                         float *__restrict__ invvar, const float *__restrict__ vals,
-                                         const int n1, const int n2, const float epsilon,
-                                         const float *__restrict__ gamma, const float *__restrict__ beta) {
-  cuApplyLayerNorm(output_vals, mean, invvar, vals, n1, n2, epsilon, gamma, beta);
-}
-
-#if __CUDA_ARCH__ >= 800
-#include <cuda_bf16.h>
-extern "C" __global__ void layernorm_bf16(__nv_bfloat16 *__restrict__ output_vals, __nv_bfloat16 *__restrict__ mean,
-                                          __nv_bfloat16 *__restrict__ invvar, const __nv_bfloat16 *__restrict__ vals,
-                                          const int n1, const int n2, const __nv_bfloat16 epsilon,
-                                          const __nv_bfloat16 *__restrict__ gamma, const __nv_bfloat16 *__restrict__ beta) {
-  cuApplyLayerNorm(output_vals, mean, invvar, vals, n1, n2, epsilon, gamma, beta);
-}
-#endif
diff --git a/candle-kernels/src/indexing.cu b/candle-kernels/src/indexing.cu
index df0e3a071d..52846a04bf 100644
--- a/candle-kernels/src/indexing.cu
+++ b/candle-kernels/src/indexing.cu
@@ -99,6 +99,57 @@ __device__ void index_add(
     }
 }
 
+#if __CUDA_ARCH__ >= 800
+#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3))
+
+template <typename I>
+__device__ void scatter_add_f8(
+    const I *ids,
+    const __nv_fp8_e4m3 *inp,
+    __nv_fp8_e4m3 *out,
+    const size_t left_size,
+    const size_t src_dim_size,
+    const size_t dst_dim_size,
+    const size_t right_size
+) {
+    const size_t numel = left_size * right_size;
+    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) {
+        const size_t pre = i / right_size;
+        const size_t post = i % right_size;
+        for (unsigned int j = 0; j < src_dim_size; ++j) {
+            const size_t src_i = (pre * src_dim_size + j) * right_size + post;
+            const size_t idx = ids[src_i];
+            const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
+            out[dst_i] = __nv_fp8_e4m3(F8E4M3_TO_FLOAT(out[dst_i]) + F8E4M3_TO_FLOAT(inp[src_i]));
+        }
+    }
+}
+
+template <typename I>
+__device__ void index_add_f8(
+    const I *ids,
+    const size_t ids_dim_size,
+    const __nv_fp8_e4m3 *inp,
+    __nv_fp8_e4m3 *out,
+    const size_t left_size,
+    const size_t src_dim_size,
+    const size_t dst_dim_size,
+    const size_t right_size
+) {
+    const size_t numel = left_size * right_size;
+    for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel; i += blockDim.x * gridDim.x) {
+        const size_t pre = i / right_size;
+        const size_t post = i % right_size;
+        for (unsigned int j = 0; j < ids_dim_size; ++j) {
+            const size_t idx = ids[j];
+            const size_t src_i = (pre * ids_dim_size + j) * right_size + post;
+            const size_t dst_i = (pre * dst_dim_size + idx) * right_size + post;
+            out[dst_i] = __nv_fp8_e4m3(F8E4M3_TO_FLOAT(out[dst_i]) + F8E4M3_TO_FLOAT(inp[src_i]));
+        }
+    }
+}
+#endif
+
 #define IA_OP(TYPENAME, INDEX_TYPENAME, FN_NAME) \
 extern "C" __global__ void FN_NAME( \
     const INDEX_TYPENAME *ids, \
@@ -111,6 +162,18 @@ extern "C" __global__ void FN_NAME( \
     const size_t right_size \
 ) { index_add(ids, ids_dim_size, inp, out, left_size, src_dim_size, dst_dim_size, right_size); } \
 
+#define IA_OP_F8(TYPENAME, INDEX_TYPENAME, FN_NAME) \
+extern "C" __global__ void FN_NAME( \
+    const INDEX_TYPENAME *ids, \
+    const size_t ids_dim_size, \
+    const TYPENAME *inp, \
+    TYPENAME *out, \
+    const size_t left_size, \
+    const size_t src_dim_size, \
+    const size_t dst_dim_size, \
+    const size_t right_size \
+) { index_add_f8(ids, ids_dim_size, inp, out, left_size, src_dim_size, dst_dim_size, right_size); } \
+
 template <typename T, typename I>
 __device__ void scatter_add(
     const I *ids,
@@ -145,6 +208,17 @@ extern "C" __global__ void FN_NAME( \
     const size_t right_size \
 ) { scatter_add(ids, inp, out, left_size, src_dim_size, dst_dim_size, right_size); } \
 
+#define SA_OP_F8(TYPENAME, INDEX_TYPENAME, FN_NAME) \
+extern "C" __global__ void FN_NAME( \
+    const INDEX_TYPENAME *ids, \
+    const TYPENAME *inp, \
+    TYPENAME *out, \
+    const size_t left_size, \
+    const size_t src_dim_size, \
+    const size_t dst_dim_size, \
+    const size_t right_size \
+) { scatter_add_f8(ids, inp, out, left_size, src_dim_size, dst_dim_size, right_size); } \
+
 #if __CUDA_ARCH__ >= 800
 
 IS_OP(__nv_bfloat16, int16_t, is_i16_bf16)
@@ -167,6 +241,27 @@ SA_OP(__nv_bfloat16, int32_t, sa_i32_bf16)
 SA_OP(__nv_bfloat16, int64_t, sa_i64_bf16)
 SA_OP(__nv_bfloat16, uint32_t, sa_u32_bf16)
 SA_OP(__nv_bfloat16, uint8_t, sa_u8_bf16)
+
+IS_OP(__nv_fp8_e4m3, int16_t, is_i16_f8_e4m3)
+IS_OP(__nv_fp8_e4m3, int32_t, is_i32_f8_e4m3)
+IS_OP(__nv_fp8_e4m3, int64_t, is_i64_f8_e4m3)
+IS_OP(__nv_fp8_e4m3, uint32_t, is_u32_f8_e4m3)
+IS_OP(__nv_fp8_e4m3, uint8_t, is_u8_f8_e4m3)
+GATHER_OP(__nv_fp8_e4m3, int16_t, gather_i16_f8_e4m3)
+GATHER_OP(__nv_fp8_e4m3, int32_t, gather_i32_f8_e4m3)
+GATHER_OP(__nv_fp8_e4m3, int64_t, gather_i64_f8_e4m3)
+GATHER_OP(__nv_fp8_e4m3, uint32_t, gather_u32_f8_e4m3) +GATHER_OP(__nv_fp8_e4m3, uint8_t, gather_u8_f8_e4m3) +IA_OP_F8(__nv_fp8_e4m3, int16_t, ia_i16_f8_e4m3) +IA_OP_F8(__nv_fp8_e4m3, int32_t, ia_i32_f8_e4m3) +IA_OP_F8(__nv_fp8_e4m3, int64_t, ia_i64_f8_e4m3) +IA_OP_F8(__nv_fp8_e4m3, uint32_t, ia_u32_f8_e4m3) +IA_OP_F8(__nv_fp8_e4m3, uint8_t, ia_u8_f8_e4m3) +SA_OP_F8(__nv_fp8_e4m3, int16_t, sa_i16_f8_e4m3) +SA_OP_F8(__nv_fp8_e4m3, int32_t, sa_i32_f8_e4m3) +SA_OP_F8(__nv_fp8_e4m3, int64_t, sa_i64_f8_e4m3) +SA_OP_F8(__nv_fp8_e4m3, uint32_t, sa_u32_f8_e4m3) +SA_OP_F8(__nv_fp8_e4m3, uint8_t, sa_u8_f8_e4m3) #endif #if __CUDA_ARCH__ >= 530 diff --git a/candle-kernels/src/kvconcat.cu b/candle-kernels/src/kvconcat.cu index 7c78d3abe7..2bbd6c53a0 100644 --- a/candle-kernels/src/kvconcat.cu +++ b/candle-kernels/src/kvconcat.cu @@ -50,4 +50,5 @@ KVCONCAT_OP(__half, kvconcat_f16) #if __CUDA_ARCH__ >= 800 KVCONCAT_OP(__nv_bfloat16, kvconcat_bf16) +KVCONCAT_OP(__nv_fp8_e4m3, kvconcat_f8_e4m3) #endif \ No newline at end of file diff --git a/candle-kernels/src/lib.rs b/candle-kernels/src/lib.rs index cec1b1e2d4..0bb490ca1c 100644 --- a/candle-kernels/src/lib.rs +++ b/candle-kernels/src/lib.rs @@ -3,7 +3,6 @@ pub const BINARY: &str = include_str!(concat!(env!("OUT_DIR"), "/binary.ptx")); pub const CAST: &str = include_str!(concat!(env!("OUT_DIR"), "/cast.ptx")); pub const CONV: &str = include_str!(concat!(env!("OUT_DIR"), "/conv.ptx")); pub const FILL: &str = include_str!(concat!(env!("OUT_DIR"), "/fill.ptx")); -pub const FUSED_LAYER_NORM: &str = include_str!(concat!(env!("OUT_DIR"), "/fused_layer_norm.ptx")); pub const FUSED_RMS_NORM: &str = include_str!(concat!(env!("OUT_DIR"), "/fused_rms_norm.ptx")); pub const FUSED_ROPE: &str = include_str!(concat!(env!("OUT_DIR"), "/fused_rope.ptx")); pub const INDEXING: &str = include_str!(concat!(env!("OUT_DIR"), "/indexing.ptx")); diff --git a/candle-kernels/src/reduce.cu b/candle-kernels/src/reduce.cu index fe2e30160a..f42cad471e 100644 --- a/candle-kernels/src/reduce.cu +++ b/candle-kernels/src/reduce.cu @@ -580,6 +580,14 @@ LAYERNORM_OP(__nv_bfloat16, layernorm_bf16) ROPE_OP(__nv_bfloat16, rope_bf16, rope_i_bf16, rope_thd_bf16) SUM_OP(__nv_bfloat16, sum_bf16) FAST_OP(__nv_bfloat16, fast_min_bf16, fast_max_bf16, fast_argmin_bf16, fast_argmax_bf16, fast_sum_bf16) + +// NOTE: No reduce ops for f8 +// SUM_OP(__nv_fp8_e4m3, sum_fp8_e4m3) +// SOFTMAX_OP(__nv_fp8_e4m3, float, softmax_fp8_e4m3) +// RMSNORM_OP(__nv_fp8_e4m3, rmsnorm_fp8_e4m3) +// LAYERNORM_OP(__nv_fp8_e4m3, layernorm_fp8_e4m3) +// ROPE_OP(__nv_fp8_e4m3, rope_fp8_e4m3, rope_i_fp8_e4m3, rope_thd_fp8_e4m3) +// FAST_OP(__nv_fp8_e4m3, fast_min_fp8_e4m3, fast_max_fp8_e4m3, fast_argmin_fp8_e4m3, fast_argmax_fp8_e4m3, fast_sum_fp8_e4m3) #endif #if __CUDA_ARCH__ >= 530 diff --git a/candle-kernels/src/sort.cu b/candle-kernels/src/sort.cu index f2b2e9d458..7db1b20ec5 100644 --- a/candle-kernels/src/sort.cu +++ b/candle-kernels/src/sort.cu @@ -75,6 +75,9 @@ extern "C" __global__ void asort_desc_##RUST_NAME( \ #if __CUDA_ARCH__ >= 800 ASORT_OP(__nv_bfloat16, bf16) + +// NOTE: No sort ops for f8 +// ASORT_OP(__nv_fp8_e4m3, fp8_e4m3) #endif #if __CUDA_ARCH__ >= 530 diff --git a/candle-kernels/src/ternary.cu b/candle-kernels/src/ternary.cu index 18beede021..c426640b39 100644 --- a/candle-kernels/src/ternary.cu +++ b/candle-kernels/src/ternary.cu @@ -38,6 +38,12 @@ WHERE_OP(__nv_bfloat16, int32_t, where_i32_bf16) WHERE_OP(__nv_bfloat16, int64_t, where_i64_bf16) WHERE_OP(__nv_bfloat16, uint32_t, where_u32_bf16) 
 WHERE_OP(__nv_bfloat16, uint8_t, where_u8_bf16)
+
+WHERE_OP(__nv_fp8_e4m3, int16_t, where_i16_fp8_e4m3)
+WHERE_OP(__nv_fp8_e4m3, int32_t, where_i32_fp8_e4m3)
+WHERE_OP(__nv_fp8_e4m3, int64_t, where_i64_fp8_e4m3)
+WHERE_OP(__nv_fp8_e4m3, uint32_t, where_u32_fp8_e4m3)
+WHERE_OP(__nv_fp8_e4m3, uint8_t, where_u8_fp8_e4m3)
 #endif
 
 #if __CUDA_ARCH__ >= 530
diff --git a/candle-kernels/src/unary.cu b/candle-kernels/src/unary.cu
index bfd60de0b1..ca888ca3fa 100644
--- a/candle-kernels/src/unary.cu
+++ b/candle-kernels/src/unary.cu
@@ -122,6 +122,33 @@ UNARY_OP(__nv_bfloat16, usilu_bf16, silu_fwd(x))
 UNARY_OP1(__nv_bfloat16, upowf_bf16, powg(x, param))
 UNARY_OP(__nv_bfloat16, usign_bf16, sign_(x))
 UNARY_OP(__nv_bfloat16, usigmoid_bf16, sigmoid_fwd(x))
+
+#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3))
+
+UNARY_OP(__nv_fp8_e4m3, ucopy_f8_e4m3, x)
+UNARY_OP(__nv_fp8_e4m3, uneg_fp8_e4m3, __nv_fp8_e4m3(-F8E4M3_TO_FLOAT(x)))
+UNARY_OP(__nv_fp8_e4m3, urecip_fp8_e4m3, recipg(x))
+UNARY_OP(__nv_fp8_e4m3, uexp_fp8_e4m3, expg(x))
+UNARY_OP(__nv_fp8_e4m3, ulog_fp8_e4m3, logg(x))
+UNARY_OP(__nv_fp8_e4m3, usin_fp8_e4m3, sing(x))
+UNARY_OP(__nv_fp8_e4m3, ucos_fp8_e4m3, cosg(x))
+UNARY_OP(__nv_fp8_e4m3, utanh_fp8_e4m3, tanhg(x))
+UNARY_OP(__nv_fp8_e4m3, uerf_fp8_e4m3, erfg(x))
+UNARY_OP(__nv_fp8_e4m3, uceil_fp8_e4m3, ceilg(x))
+UNARY_OP(__nv_fp8_e4m3, ufloor_fp8_e4m3, floorg(x))
+UNARY_OP(__nv_fp8_e4m3, uround_fp8_e4m3, roundg(x))
+UNARY_OP(__nv_fp8_e4m3, unormcdf_fp8_e4m3, normcdfg(x))
+UNARY_OP(__nv_fp8_e4m3, uabs_fp8_e4m3, absg(x))
+UNARY_OP(__nv_fp8_e4m3, usqr_fp8_e4m3, __nv_fp8_e4m3(F8E4M3_TO_FLOAT(x)*F8E4M3_TO_FLOAT(x)))
+UNARY_OP(__nv_fp8_e4m3, usqrt_fp8_e4m3, sqrtg(x))
+UNARY_OP(__nv_fp8_e4m3, ugelu_fp8_e4m3, __nv_fp8_e4m3(gelu_fwd(F8E4M3_TO_FLOAT(x))))
+UNARY_OP(__nv_fp8_e4m3, ugelu_erf_fp8_e4m3, __nv_fp8_e4m3(gelu_erf_fwd(F8E4M3_TO_FLOAT(x))))
+UNARY_OP(__nv_fp8_e4m3, urelu_fp8_e4m3, __nv_fp8_e4m3(relu_fwd(F8E4M3_TO_FLOAT(x))))
+UNARY_OP1(__nv_fp8_e4m3, uelu_fp8_e4m3, __nv_fp8_e4m3(elu_fwd(F8E4M3_TO_FLOAT(x), F8E4M3_TO_FLOAT(param))))
+UNARY_OP(__nv_fp8_e4m3, usilu_fp8_e4m3, __nv_fp8_e4m3(silu_fwd(F8E4M3_TO_FLOAT(x))))
+UNARY_OP1(__nv_fp8_e4m3, upowf_fp8_e4m3, powg(x, param))
+UNARY_OP(__nv_fp8_e4m3, usign_fp8_e4m3, __nv_fp8_e4m3(sign_(F8E4M3_TO_FLOAT(x))))
+UNARY_OP(__nv_fp8_e4m3, usigmoid_fp8_e4m3, __nv_fp8_e4m3(sigmoid_fwd(F8E4M3_TO_FLOAT(x))))
 #endif
 
 #if __CUDA_ARCH__ >= 530
diff --git a/candle-pyo3/Cargo.toml b/candle-pyo3/Cargo.toml
index 8800133429..bfed9eb48b 100644
--- a/candle-pyo3/Cargo.toml
+++ b/candle-pyo3/Cargo.toml
@@ -19,6 +19,7 @@ candle = { workspace = true }
 candle-nn = { workspace = true }
 candle-onnx = { workspace = true, optional = true }
 half = { workspace = true }
+float8 = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 pyo3 = { version = "0.21.0", features = ["extension-module", "abi3-py38"] }
 
diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs
index d2179d577f..ab7f07d985 100644
--- a/candle-pyo3/src/lib.rs
+++ b/candle-pyo3/src/lib.rs
@@ -1,4 +1,5 @@
 #![allow(clippy::redundant_closure_call)]
+use float8::F8E4M3;
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::prelude::*;
 use pyo3::pyclass::CompareOp;
@@ -160,6 +161,7 @@ pydtype!(f16, f32::from);
 pydtype!(bf16, f32::from);
 pydtype!(f32, |v| v);
 pydtype!(f64, |v| v);
+pydtype!(F8E4M3, f32::from);
 
 fn actual_index(t: &Tensor, dim: usize, index: i64) -> ::candle::Result<usize> {
     let dim = t.dim(dim)?;
@@ -209,6 +211,7 @@ trait MapDType {
             DType::F16 => self.f::<f16>(t),
             DType::F32 => self.f::<f32>(t),
             DType::F64 => self.f::<f64>(t),
+            DType::F8E4M3 => self.f::<F8E4M3>(t),
         }
     }
 }
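
Note (not part of the patch above): every f8 kernel added here follows the same convention. The kernels never operate on __nv_fp8_e4m3 values directly; each e4m3 byte is widened to f32 via __nv_cvt_fp8_to_halfraw/__half2float, the arithmetic runs in f32, and the result is narrowed back with the __nv_fp8_e4m3 constructor. The sketch below is a hypothetical, stand-alone CUDA kernel illustrating that round-trip; the kernel name add_f8_e4m3 and its buffer names are illustrative assumptions, not something defined in this PR, but the F8E4M3_TO_FLOAT helper is the same macro used in indexing.cu and unary.cu above.

// Hedged sketch of the decode-to-f32 / re-encode pattern used by the fp8
// kernels in this patch. Hypothetical kernel, not part of the diff.
#include <cuda_fp16.h>
#include <cuda_fp8.h>

// Same helper as in indexing.cu / unary.cu: fp8 storage byte -> half -> float.
#define F8E4M3_TO_FLOAT(x) __half2float(__nv_cvt_fp8_to_halfraw(x.__x, __NV_E4M3))

extern "C" __global__ void add_f8_e4m3(const __nv_fp8_e4m3 *lhs,
                                       const __nv_fp8_e4m3 *rhs,
                                       __nv_fp8_e4m3 *out,
                                       const size_t numel) {
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < numel;
         i += blockDim.x * gridDim.x) {
        // All arithmetic happens in f32; the constructor rounds back to e4m3.
        out[i] = __nv_fp8_e4m3(F8E4M3_TO_FLOAT(lhs[i]) + F8E4M3_TO_FLOAT(rhs[i]));
    }
}

Consistent with the "NOTE: No reduce ops for f8" and "No sort ops for f8" comments in reduce.cu and sort.cu, reductions and sorts are deliberately left out for this dtype, so f8 tensors presumably have to be cast to a wider dtype (via the new cast.cu kernels) before such operations.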