From fd08d3d0a40872f207284b008de23ef875d54f74 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 10:22:31 +0200 Subject: [PATCH 01/28] Tweak some metal tests. (#2528) --- candle-metal-kernels/src/lib.rs | 5 -- candle-metal-kernels/src/tests.rs | 80 +++++++++---------------------- 2 files changed, 23 insertions(+), 62 deletions(-) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index a270bb2888..be6160093e 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -2372,16 +2372,11 @@ pub fn call_const_fill( let pipeline = kernels.load_pipeline(device, Source::Fill, name)?; let encoder = ep.encoder(); let encoder: &ComputeCommandEncoderRef = encoder.as_ref(); - encoder.set_compute_pipeline_state(&pipeline); - set_params!(encoder, (output, v, length)); - let (thread_group_count, thread_group_size) = linear_split(&pipeline, length); - encoder.use_resource(output, metal::MTLResourceUsage::Write); encoder.dispatch_thread_groups(thread_group_count, thread_group_size); - Ok(()) } diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index f37ab5bb9c..637bf2e243 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -2309,66 +2309,32 @@ fn conv_transpose1d_u32() { assert_eq!(results, expected); } -fn constant_fill(name: &'static str, len: usize, value: f32) -> Vec { - let dev = device(); - let kernels = Kernels::new(); - let command_queue = dev.new_command_queue(); - let command_buffer = command_queue.new_command_buffer(); - - let buffer = dev.new_buffer( - (len * std::mem::size_of::()) as u64, - MTLResourceOptions::StorageModePrivate, - ); - - call_const_fill(&dev, command_buffer, &kernels, name, len, &buffer, value).unwrap(); - - command_buffer.commit(); - command_buffer.wait_until_completed(); - - read_to_vec::(&buffer, len) -} - #[test] fn const_fill() { - let fills = [ - "fill_u8", - "fill_u32", - "fill_i64", - "fill_f16", - "fill_bf16", - "fill_f32", - ]; - - for name in fills { + fn constant_fill(name: &'static str, len: usize, value: f32) -> Vec { + let dev = device(); + let kernels = Kernels::new(); + let command_queue = dev.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + let buffer = dev.new_buffer( + (len * std::mem::size_of::()) as u64, + MTLResourceOptions::StorageModePrivate, + ); + call_const_fill(&dev, command_buffer, &kernels, name, len, &buffer, value).unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + read_to_vec::(&buffer, len) + } + fn test T>(name: &'static str, f: F) { let len = rand::thread_rng().gen_range(2..16) * rand::thread_rng().gen_range(4..16); let value = rand::thread_rng().gen_range(1. 
..19.); - - match name { - "fill_u8" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![value as u8; len]) - } - "fill_u32" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![value as u32; len]) - } - "fill_i64" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![value as i64; len]) - } - "fill_f16" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![f16::from_f32(value); len]) - } - "fill_bf16" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![bf16::from_f32(value); len]) - } - "fill_f32" => { - let v = constant_fill::(name, len, value); - assert_eq!(v, vec![value; len]) - } - _ => unimplemented!(), - }; + let v = constant_fill::(name, len, value); + assert_eq!(v, vec![f(value); len]) } + test::("fill_u8", |v| v as u8); + test::("fill_u32", |v| v as u32); + test::("fill_i64", |v| v as i64); + test::("fill_f16", f16::from_f32); + test::("fill_bf16", bf16::from_f32); + test::("fill_f32", |v| v); } From f479840ce6d2222bd004b6f275494297f1f0ae91 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 10:52:02 +0200 Subject: [PATCH 02/28] Add a seed to the flux example. (#2529) --- candle-examples/examples/flux/main.rs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/candle-examples/examples/flux/main.rs b/candle-examples/examples/flux/main.rs index 24b1fa2bc6..943db1121c 100644 --- a/candle-examples/examples/flux/main.rs +++ b/candle-examples/examples/flux/main.rs @@ -45,9 +45,13 @@ struct Args { #[arg(long, value_enum, default_value = "schnell")] model: Model, - /// Use the faster kernels which are buggy at the moment. + /// Use the slower kernels. #[arg(long)] - no_dmmv: bool, + use_dmmv: bool, + + /// The seed to use when generating random samples. + #[arg(long)] + seed: Option, } #[derive(Debug, Clone, Copy, clap::ValueEnum, PartialEq, Eq)] @@ -91,6 +95,9 @@ fn run(args: Args) -> Result<()> { api.repo(hf_hub::Repo::model(name.to_string())) }; let device = candle_examples::device(cpu)?; + if let Some(seed) = args.seed { + device.set_seed(seed)?; + } let dtype = device.bf16_default_to_f32(); let img = match decode_only { None => { @@ -250,6 +257,6 @@ fn run(args: Args) -> Result<()> { fn main() -> Result<()> { let args = Args::parse(); #[cfg(feature = "cuda")] - candle::quantized::cuda::set_force_dmmv(!args.no_dmmv); + candle::quantized::cuda::set_force_dmmv(args.use_dmmv); run(args) } From 936300678d588c6525594ef2578737e0c19ecf07 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 21:07:08 +0200 Subject: [PATCH 03/28] Add whisper large-v3 turbo to the example. 
(#2531) --- candle-examples/examples/whisper/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/candle-examples/examples/whisper/main.rs b/candle-examples/examples/whisper/main.rs index ecd5ff84a4..84aa8b74bc 100644 --- a/candle-examples/examples/whisper/main.rs +++ b/candle-examples/examples/whisper/main.rs @@ -370,6 +370,7 @@ enum WhichModel { Large, LargeV2, LargeV3, + LargeV3Turbo, #[value(name = "distil-medium.en")] DistilMediumEn, #[value(name = "distil-large-v2")] @@ -388,6 +389,7 @@ impl WhichModel { | Self::Large | Self::LargeV2 | Self::LargeV3 + | Self::LargeV3Turbo | Self::DistilLargeV2 | Self::DistilLargeV3 => true, Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn | Self::DistilMediumEn => { @@ -409,6 +411,7 @@ impl WhichModel { Self::Large => ("openai/whisper-large", "refs/pr/36"), Self::LargeV2 => ("openai/whisper-large-v2", "refs/pr/57"), Self::LargeV3 => ("openai/whisper-large-v3", "main"), + Self::LargeV3Turbo => ("openai/whisper-large-v3-turbo", "main"), Self::DistilMediumEn => ("distil-whisper/distil-medium.en", "main"), Self::DistilLargeV2 => ("distil-whisper/distil-large-v2", "main"), Self::DistilLargeV3 => ("distil-whisper/distil-large-v3", "main"), From 7b60bda4ed8c9d861396fe74307d6c77281522ef Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 21:30:58 +0200 Subject: [PATCH 04/28] Add support for cuda streams. (#2532) --- candle-core/src/cuda_backend/device.rs | 14 ++++++++++++++ candle-core/src/device.rs | 4 ++++ candle-core/src/dummy_cuda_backend.rs | 6 ++++++ 3 files changed, 24 insertions(+) diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs index 0aa58cacde..89fe44a6e6 100644 --- a/candle-core/src/cuda_backend/device.rs +++ b/candle-core/src/cuda_backend/device.rs @@ -144,6 +144,20 @@ impl CudaDevice { } } +impl CudaDevice { + pub fn new_with_stream(ordinal: usize) -> Result { + let device = cudarc::driver::CudaDevice::new_with_stream(ordinal).w()?; + let blas = cudarc::cublas::CudaBlas::new(device.clone()).w()?; + let curand = cudarc::curand::CudaRng::new(299792458, device.clone()).w()?; + Ok(Self { + id: DeviceId::new(), + device, + blas: Arc::new(blas), + curand: Arc::new(Mutex::new(CudaRng(curand))), + }) + } +} + impl BackendDevice for CudaDevice { type Storage = CudaStorage; diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs index 91e569372d..c4a8e9361e 100644 --- a/candle-core/src/device.rs +++ b/candle-core/src/device.rs @@ -130,6 +130,10 @@ impl Device { Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?)) } + pub fn new_cuda_with_stream(ordinal: usize) -> Result { + Ok(Self::Cuda(crate::CudaDevice::new_with_stream(ordinal)?)) + } + pub fn new_metal(ordinal: usize) -> Result { Ok(Self::Metal(crate::MetalDevice::new(ordinal)?)) } diff --git a/candle-core/src/dummy_cuda_backend.rs b/candle-core/src/dummy_cuda_backend.rs index 68eef1efed..b4f2e8aa00 100644 --- a/candle-core/src/dummy_cuda_backend.rs +++ b/candle-core/src/dummy_cuda_backend.rs @@ -14,6 +14,12 @@ macro_rules! fail { }; } +impl CudaDevice { + pub fn new_with_stream(_: usize) -> Result { + Err(Error::NotCompiledWithCudaSupport) + } +} + impl crate::backend::BackendStorage for CudaStorage { type Device = CudaDevice; From 90d04ff622e14233d0b0902cd2ab193070369c22 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 22:09:14 +0200 Subject: [PATCH 05/28] Support whisper large-v3 turbo in the whisper-microphone example. 
(#2533) --- candle-examples/examples/whisper-microphone/main.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/candle-examples/examples/whisper-microphone/main.rs b/candle-examples/examples/whisper-microphone/main.rs index 44a64b05a8..5165da1c1e 100644 --- a/candle-examples/examples/whisper-microphone/main.rs +++ b/candle-examples/examples/whisper-microphone/main.rs @@ -389,6 +389,7 @@ enum WhichModel { Large, LargeV2, LargeV3, + LargeV3Turbo, #[value(name = "distil-medium.en")] DistilMediumEn, #[value(name = "distil-large-v2")] @@ -405,6 +406,7 @@ impl WhichModel { | Self::Large | Self::LargeV2 | Self::LargeV3 + | Self::LargeV3Turbo | Self::DistilLargeV2 => true, Self::TinyEn | Self::BaseEn | Self::SmallEn | Self::MediumEn | Self::DistilMediumEn => { false @@ -425,6 +427,7 @@ impl WhichModel { Self::Large => ("openai/whisper-large", "refs/pr/36"), Self::LargeV2 => ("openai/whisper-large-v2", "refs/pr/57"), Self::LargeV3 => ("openai/whisper-large-v3", "main"), + Self::LargeV3Turbo => ("openai/whisper-large-v3-turbo", "main"), Self::DistilMediumEn => ("distil-whisper/distil-medium.en", "main"), Self::DistilLargeV2 => ("distil-whisper/distil-large-v2", "main"), } From 6faecaa6166257cb5c0ebccdf383300f46eec840 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Wed, 2 Oct 2024 23:18:55 +0200 Subject: [PATCH 06/28] Fix for cudnn bf16 conv2d. (#2535) --- candle-core/src/cuda_backend/cudnn.rs | 11 ++++++----- candle-core/src/cuda_backend/mod.rs | 13 ++++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/candle-core/src/cuda_backend/cudnn.rs b/candle-core/src/cuda_backend/cudnn.rs index d604863d35..f5b4db9026 100644 --- a/candle-core/src/cuda_backend/cudnn.rs +++ b/candle-core/src/cuda_backend/cudnn.rs @@ -26,6 +26,7 @@ impl From for crate::Error { pub(crate) fn launch_conv2d< T: DeviceRepr + WithDType + ValidAsZeroBits + cudarc::cudnn::CudnnDataType, + Y: cudarc::cudnn::CudnnDataType, >( src: &CudaView, src_l: &crate::Layout, @@ -48,7 +49,7 @@ pub(crate) fn launch_conv2d< } c })?; - let conv = cudnn.create_conv2d::( + let conv = cudnn.create_conv2d::( /* pad */ [params.padding as i32, params.padding as i32], /* stride */ [params.stride as i32, params.stride as i32], /* dilation */ [params.dilation as i32, params.dilation as i32], @@ -62,18 +63,18 @@ pub(crate) fn launch_conv2d< ]; // Note that `src` already starts at the proper offset. let x = if src_l.is_contiguous() { - cudnn.create_4d_tensor( + cudnn.create_4d_tensor::( cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, x_shape, )? } else { let s = src_l.stride(); - cudnn.create_4d_tensor_ex( + cudnn.create_4d_tensor_ex::( x_shape, [s[0] as i32, s[1] as i32, s[2] as i32, s[3] as i32], )? 
}; - let w = cudnn.create_4d_filter( + let w = cudnn.create_4d_filter::( cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, [ params.c_out as i32, @@ -83,7 +84,7 @@ pub(crate) fn launch_conv2d< ], )?; let (w_out, h_out) = (params.out_w() as i32, params.out_h() as i32); - let y = cudnn.create_4d_tensor( + let y = cudnn.create_4d_tensor::( cudarc::cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, [params.b_size as i32, params.c_out as i32, h_out, w_out], )?; diff --git a/candle-core/src/cuda_backend/mod.rs b/candle-core/src/cuda_backend/mod.rs index 07bb1785dd..f14e00d533 100644 --- a/candle-core/src/cuda_backend/mod.rs +++ b/candle-core/src/cuda_backend/mod.rs @@ -1522,7 +1522,7 @@ impl BackendStorage for CudaStorage { let inp = &inp.slice(inp_l.start_offset()..); let k = &k.slice(kernel_l.start_offset()..); let mut out = unsafe { device.alloc::(dst_el) }.w()?; - crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) + crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) .map_err(crate::Error::wrap)?; S::U8(out) } @@ -1530,7 +1530,10 @@ impl BackendStorage for CudaStorage { let inp = &inp.slice(inp_l.start_offset()..); let k = &k.slice(kernel_l.start_offset()..); let mut out = unsafe { device.alloc::(dst_el) }.w()?; - crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) + // Only PSEUDO_BFLOAT16_CONFIG is supported in cudnn, there is no "true bfloat16" + // version. + // https://docs.nvidia.com/deeplearning/cudnn/latest/api/cudnn-cnn-library.html#id88 + crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) .map_err(crate::Error::wrap)?; S::BF16(out) } @@ -1538,7 +1541,7 @@ impl BackendStorage for CudaStorage { let inp = &inp.slice(inp_l.start_offset()..); let k = &k.slice(kernel_l.start_offset()..); let mut out = unsafe { device.alloc::(dst_el) }.w()?; - crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) + crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) .map_err(crate::Error::wrap)?; S::F16(out) } @@ -1546,7 +1549,7 @@ impl BackendStorage for CudaStorage { let inp = &inp.slice(inp_l.start_offset()..); let k = &k.slice(kernel_l.start_offset()..); let mut out = unsafe { device.alloc::(dst_el) }.w()?; - crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) + crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) .map_err(crate::Error::wrap)?; S::F32(out) } @@ -1554,7 +1557,7 @@ impl BackendStorage for CudaStorage { let inp = &inp.slice(inp_l.start_offset()..); let k = &k.slice(kernel_l.start_offset()..); let mut out = unsafe { device.alloc::(dst_el) }.w()?; - crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) + crate::cudnn::launch_conv2d::(inp, inp_l, k, &mut out, params, &device) .map_err(crate::Error::wrap)?; S::F64(out) } From 56aacb05daa13a9a10a8995a02c5b827561ba797 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Fri, 4 Oct 2024 14:22:23 +0200 Subject: [PATCH 07/28] Make the RNN configs accessible from the models. 
(#2541) --- candle-examples/examples/encodec/audio_io.rs | 1 - candle-examples/examples/mimi/audio_io.rs | 1 - candle-nn/src/rnn.rs | 175 +++++++++++-------- 3 files changed, 103 insertions(+), 74 deletions(-) diff --git a/candle-examples/examples/encodec/audio_io.rs b/candle-examples/examples/encodec/audio_io.rs index 2103dd4adf..fa1a26fbf7 100644 --- a/candle-examples/examples/encodec/audio_io.rs +++ b/candle-examples/examples/encodec/audio_io.rs @@ -1,4 +1,3 @@ -#![allow(unused)] use anyhow::{Context, Result}; use std::sync::{Arc, Mutex}; diff --git a/candle-examples/examples/mimi/audio_io.rs b/candle-examples/examples/mimi/audio_io.rs index 2103dd4adf..fa1a26fbf7 100644 --- a/candle-examples/examples/mimi/audio_io.rs +++ b/candle-examples/examples/mimi/audio_io.rs @@ -1,4 +1,3 @@ -#![allow(unused)] use anyhow::{Context, Result}; use std::sync::{Arc, Mutex}; diff --git a/candle-nn/src/rnn.rs b/candle-nn/src/rnn.rs index b4b443c6b8..798db6ac4d 100644 --- a/candle-nn/src/rnn.rs +++ b/candle-nn/src/rnn.rs @@ -116,7 +116,7 @@ impl LSTMConfig { /// A Long Short-Term Memory (LSTM) layer. /// /// -#[allow(clippy::upper_case_acronyms, unused)] +#[allow(clippy::upper_case_acronyms)] #[derive(Clone, Debug)] pub struct LSTM { w_ih: Tensor, @@ -129,6 +129,62 @@ pub struct LSTM { dtype: DType, } +impl LSTM { + /// Creates a LSTM layer. + pub fn new( + in_dim: usize, + hidden_dim: usize, + config: LSTMConfig, + vb: crate::VarBuilder, + ) -> Result { + let layer_idx = config.layer_idx; + let direction_str = match config.direction { + Direction::Forward => "", + Direction::Backward => "_reverse", + }; + let w_ih = vb.get_with_hints( + (4 * hidden_dim, in_dim), + &format!("weight_ih_l{layer_idx}{direction_str}"), // Only a single layer is supported. + config.w_ih_init, + )?; + let w_hh = vb.get_with_hints( + (4 * hidden_dim, hidden_dim), + &format!("weight_hh_l{layer_idx}{direction_str}"), // Only a single layer is supported. + config.w_hh_init, + )?; + let b_ih = match config.b_ih_init { + Some(init) => Some(vb.get_with_hints( + 4 * hidden_dim, + &format!("bias_ih_l{layer_idx}{direction_str}"), + init, + )?), + None => None, + }; + let b_hh = match config.b_hh_init { + Some(init) => Some(vb.get_with_hints( + 4 * hidden_dim, + &format!("bias_hh_l{layer_idx}{direction_str}"), + init, + )?), + None => None, + }; + Ok(Self { + w_ih, + w_hh, + b_ih, + b_hh, + hidden_dim, + config, + device: vb.device().clone(), + dtype: vb.dtype(), + }) + } + + pub fn config(&self) -> &LSTMConfig { + &self.config + } +} + /// Creates a LSTM layer. pub fn lstm( in_dim: usize, @@ -136,47 +192,7 @@ pub fn lstm( config: LSTMConfig, vb: crate::VarBuilder, ) -> Result { - let layer_idx = config.layer_idx; - let direction_str = match config.direction { - Direction::Forward => "", - Direction::Backward => "_reverse", - }; - let w_ih = vb.get_with_hints( - (4 * hidden_dim, in_dim), - &format!("weight_ih_l{layer_idx}{direction_str}"), // Only a single layer is supported. - config.w_ih_init, - )?; - let w_hh = vb.get_with_hints( - (4 * hidden_dim, hidden_dim), - &format!("weight_hh_l{layer_idx}{direction_str}"), // Only a single layer is supported. 
- config.w_hh_init, - )?; - let b_ih = match config.b_ih_init { - Some(init) => Some(vb.get_with_hints( - 4 * hidden_dim, - &format!("bias_ih_l{layer_idx}{direction_str}"), - init, - )?), - None => None, - }; - let b_hh = match config.b_hh_init { - Some(init) => Some(vb.get_with_hints( - 4 * hidden_dim, - &format!("bias_hh_l{layer_idx}{direction_str}"), - init, - )?), - None => None, - }; - Ok(LSTM { - w_ih, - w_hh, - b_ih, - b_hh, - hidden_dim, - config, - device: vb.device().clone(), - dtype: vb.dtype(), - }) + LSTM::new(in_dim, hidden_dim, config, vb) } impl RNN for LSTM { @@ -270,7 +286,7 @@ impl GRUConfig { /// A Gated Recurrent Unit (GRU) layer. /// /// -#[allow(clippy::upper_case_acronyms, unused)] +#[allow(clippy::upper_case_acronyms)] #[derive(Clone, Debug)] pub struct GRU { w_ih: Tensor, @@ -283,41 +299,56 @@ pub struct GRU { dtype: DType, } -/// Creates a GRU layer. +impl GRU { + /// Creates a GRU layer. + pub fn new( + in_dim: usize, + hidden_dim: usize, + config: GRUConfig, + vb: crate::VarBuilder, + ) -> Result { + let w_ih = vb.get_with_hints( + (3 * hidden_dim, in_dim), + "weight_ih_l0", // Only a single layer is supported. + config.w_ih_init, + )?; + let w_hh = vb.get_with_hints( + (3 * hidden_dim, hidden_dim), + "weight_hh_l0", // Only a single layer is supported. + config.w_hh_init, + )?; + let b_ih = match config.b_ih_init { + Some(init) => Some(vb.get_with_hints(3 * hidden_dim, "bias_ih_l0", init)?), + None => None, + }; + let b_hh = match config.b_hh_init { + Some(init) => Some(vb.get_with_hints(3 * hidden_dim, "bias_hh_l0", init)?), + None => None, + }; + Ok(Self { + w_ih, + w_hh, + b_ih, + b_hh, + hidden_dim, + config, + device: vb.device().clone(), + dtype: vb.dtype(), + }) + } + + pub fn config(&self) -> &GRUConfig { + &self.config + } +} + pub fn gru( in_dim: usize, hidden_dim: usize, config: GRUConfig, vb: crate::VarBuilder, ) -> Result { - let w_ih = vb.get_with_hints( - (3 * hidden_dim, in_dim), - "weight_ih_l0", // Only a single layer is supported. - config.w_ih_init, - )?; - let w_hh = vb.get_with_hints( - (3 * hidden_dim, hidden_dim), - "weight_hh_l0", // Only a single layer is supported. - config.w_hh_init, - )?; - let b_ih = match config.b_ih_init { - Some(init) => Some(vb.get_with_hints(3 * hidden_dim, "bias_ih_l0", init)?), - None => None, - }; - let b_hh = match config.b_hh_init { - Some(init) => Some(vb.get_with_hints(3 * hidden_dim, "bias_hh_l0", init)?), - None => None, - }; - Ok(GRU { - w_ih, - w_hh, - b_ih, - b_hh, - hidden_dim, - config, - device: vb.device().clone(), - dtype: vb.dtype(), - }) + GRU::new(in_dim, hidden_dim, config, vb) } impl RNN for GRU { From 410c89f72a0ab22a299d02d24f505a50522faaa2 Mon Sep 17 00:00:00 2001 From: dengelt Date: Fri, 4 Oct 2024 14:29:55 +0200 Subject: [PATCH 08/28] Add required feature for whisper example in Readme (#2539) --- candle-examples/examples/whisper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/candle-examples/examples/whisper/README.md b/candle-examples/examples/whisper/README.md index a7dd408164..eb77a65b9a 100644 --- a/candle-examples/examples/whisper/README.md +++ b/candle-examples/examples/whisper/README.md @@ -12,7 +12,7 @@ file](https://huggingface.co/datasets/Narsil/candle-examples/resolve/main/sample from the hub. 
```bash - cargo run --example whisper --release + cargo run --example whisper --release --features="symphonia" > No audio file submitted: Downloading https://huggingface.co/datasets/Narsil/candle_demo/blob/main/samples_jfk.wav > loaded wav data: Header { audio_format: 1, channel_count: 1, sampling_rate: 16000, bytes_per_second: 32000, bytes_per_sample: 2, bits_per_sample: 16 } From d2e432914ec495baff1db29799fe316b9190b0e9 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 5 Oct 2024 10:05:14 +0200 Subject: [PATCH 09/28] Tensor tools print all (#2543) * Support whisper large-v3 turbo in the whisper-microphone example. * Print all tensors when no argument is provided. --- tensor-tools/src/main.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tensor-tools/src/main.rs b/tensor-tools/src/main.rs index ad351171f5..0bda36d524 100644 --- a/tensor-tools/src/main.rs +++ b/tensor-tools/src/main.rs @@ -197,6 +197,11 @@ fn run_print( match format { Format::Npz => { let tensors = candle::npy::NpzTensors::new(file)?; + let names = if names.is_empty() { + tensors.names().into_iter().map(|v| v.to_string()).collect() + } else { + names + }; for name in names.iter() { println!("==== {name} ===="); match tensors.get(name)? { @@ -209,6 +214,11 @@ fn run_print( use candle::safetensors::Load; let tensors = unsafe { candle::safetensors::MmapedSafetensors::new(file)? }; let tensors: std::collections::HashMap<_, _> = tensors.tensors().into_iter().collect(); + let names = if names.is_empty() { + tensors.keys().map(|v| v.to_string()).collect() + } else { + names + }; for name in names.iter() { println!("==== {name} ===="); match tensors.get(name) { @@ -222,6 +232,15 @@ fn run_print( } Format::Pth => { let pth_file = candle::pickle::PthTensors::new(file, None)?; + let names = if names.is_empty() { + pth_file + .tensor_infos() + .keys() + .map(|v| v.to_string()) + .collect() + } else { + names + }; for name in names.iter() { println!("==== {name} ===="); match pth_file.get(name)? { @@ -238,6 +257,11 @@ fn run_print( Format::Ggml => { let mut file = std::fs::File::open(file)?; let content = candle::quantized::ggml_file::Content::read(&mut file, device)?; + let names = if names.is_empty() { + content.tensors.keys().map(|v| v.to_string()).collect() + } else { + names + }; for name in names.iter() { println!("==== {name} ===="); match content.tensors.get(name) { @@ -252,6 +276,11 @@ fn run_print( Format::Gguf => { let mut file = std::fs::File::open(file)?; let content = gguf_file::Content::read(&mut file)?; + let names = if names.is_empty() { + content.tensor_infos.keys().map(|v| v.to_string()).collect() + } else { + names + }; for name in names.iter() { println!("==== {name} ===="); match content.tensor(&mut file, name, device) { From f856b5c3a75028d384c26e36501d429091662cd3 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 6 Oct 2024 10:09:38 +0200 Subject: [PATCH 10/28] pyo3 update. (#2545) * pyo3 update. * Stub fix. 
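Note: the substance of this patch is the pyo3 0.22 update, which switches the `FromPyObject` implementations in candle-pyo3 from `extract` over `&PyAny` to `extract_bound` over the `Bound` smart pointer. The sketch below is a minimal, hypothetical illustration of that pattern only; the `PyDeviceKind` type and the accepted device strings are invented for the example, while the real `PyDevice` conversion appears in the candle-pyo3/src/lib.rs hunk that follows.

```rust
use pyo3::prelude::*;

// Hypothetical newtype built from a Python string such as "cpu" or "cuda".
struct PyDeviceKind(String);

impl<'source> FromPyObject<'source> for PyDeviceKind {
    // pyo3 0.22 extracts through the `Bound` smart pointer rather than `&PyAny`.
    fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult<Self> {
        let device: String = ob.extract()?;
        match device.as_str() {
            "cpu" | "cuda" | "metal" => Ok(PyDeviceKind(device)),
            other => Err(pyo3::exceptions::PyValueError::new_err(format!(
                "unsupported device: {other}"
            ))),
        }
    }
}
```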
--- candle-examples/Cargo.toml | 4 ++-- candle-pyo3/Cargo.toml | 4 ++-- candle-pyo3/py_src/candle/utils/__init__.pyi | 10 +++------- candle-pyo3/src/lib.rs | 19 +++++++++---------- candle-pyo3/src/shape.rs | 12 ++++++------ 5 files changed, 22 insertions(+), 27 deletions(-) diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index 4edde7a966..0c1219d760 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -27,7 +27,7 @@ intel-mkl-src = { workspace = true, optional = true } num-traits = { workspace = true } palette = { version = "0.7.6", optional = true } enterpolation = { version = "0.2.1", optional = true} -pyo3 = { version = "0.21.0", features = ["auto-initialize"], optional = true } +pyo3 = { version = "0.22.0", features = ["auto-initialize"], optional = true } rayon = { workspace = true } rubato = { version = "0.15.0", optional = true } safetensors = { workspace = true } @@ -121,4 +121,4 @@ required-features = ["onnx"] [[example]] name = "colpali" -required-features = ["pdf2image"] \ No newline at end of file +required-features = ["pdf2image"] diff --git a/candle-pyo3/Cargo.toml b/candle-pyo3/Cargo.toml index 8800133429..2776a3f77c 100644 --- a/candle-pyo3/Cargo.toml +++ b/candle-pyo3/Cargo.toml @@ -20,10 +20,10 @@ candle-nn = { workspace = true } candle-onnx = { workspace = true, optional = true } half = { workspace = true } intel-mkl-src = { workspace = true, optional = true } -pyo3 = { version = "0.21.0", features = ["extension-module", "abi3-py38"] } +pyo3 = { version = "0.22.0", features = ["extension-module", "abi3-py38"] } [build-dependencies] -pyo3-build-config = "0.21" +pyo3-build-config = "0.22" [features] default = [] diff --git a/candle-pyo3/py_src/candle/utils/__init__.pyi b/candle-pyo3/py_src/candle/utils/__init__.pyi index c9a9f9f3c1..94c3228398 100644 --- a/candle-pyo3/py_src/candle/utils/__init__.pyi +++ b/candle-pyo3/py_src/candle/utils/__init__.pyi @@ -33,9 +33,7 @@ def has_mkl() -> bool: pass @staticmethod -def load_ggml( - path: Union[str, PathLike], device: Optional[Device] = None -) -> Tuple[Dict[str, QTensor], Dict[str, Any], List[str]]: +def load_ggml(path, device=None) -> Tuple[Dict[str, QTensor], Dict[str, Any], List[str]]: """ Load a GGML file. Returns a tuple of three objects: a dictionary mapping tensor names to tensors, a dictionary mapping hyperparameter names to hyperparameter values, and a vocabulary. @@ -43,9 +41,7 @@ def load_ggml( pass @staticmethod -def load_gguf( - path: Union[str, PathLike], device: Optional[Device] = None -) -> Tuple[Dict[str, QTensor], Dict[str, Any]]: +def load_gguf(path, device=None) -> Tuple[Dict[str, QTensor], Dict[str, Any]]: """ Loads a GGUF file. Returns a tuple of two dictionaries: the first maps tensor names to tensors, and the second maps metadata keys to metadata values. @@ -60,7 +56,7 @@ def load_safetensors(path: Union[str, PathLike]) -> Dict[str, Tensor]: pass @staticmethod -def save_gguf(path: Union[str, PathLike], tensors: Dict[str, QTensor], metadata: Dict[str, Any]): +def save_gguf(path, tensors, metadata): """ Save quanitzed tensors and metadata to a GGUF file. 
""" diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs index 0da2c70028..722b5e3ace 100644 --- a/candle-pyo3/src/lib.rs +++ b/candle-pyo3/src/lib.rs @@ -6,7 +6,6 @@ use pyo3::types::{IntoPyDict, PyDict, PyTuple}; use pyo3::ToPyObject; use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; -use std::os::raw::c_long; use std::sync::Arc; use half::{bf16, f16}; @@ -115,7 +114,7 @@ impl PyDevice { } impl<'source> FromPyObject<'source> for PyDevice { - fn extract(ob: &'source PyAny) -> PyResult { + fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult { let device: String = ob.extract()?; let device = match device.as_str() { "cpu" => PyDevice::Cpu, @@ -217,11 +216,11 @@ enum Indexer { IndexSelect(Tensor), } -#[derive(Clone, Debug)] +#[derive(Debug)] struct TorchTensor(PyObject); impl<'source> pyo3::FromPyObject<'source> for TorchTensor { - fn extract(ob: &'source PyAny) -> PyResult { + fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult { let numpy_value: PyObject = ob.getattr("numpy")?.call0()?.extract()?; Ok(TorchTensor(numpy_value)) } @@ -540,7 +539,7 @@ impl PyTensor { )) } else if let Ok(slice) = py_indexer.downcast::() { // Handle a single slice e.g. tensor[0:1] or tensor[0:-1] - let index = slice.indices(dims[current_dim] as c_long)?; + let index = slice.indices(dims[current_dim] as isize)?; Ok(( Indexer::Slice(index.start as usize, index.stop as usize), current_dim + 1, @@ -1284,7 +1283,7 @@ fn save_safetensors( } #[pyfunction] -#[pyo3(text_signature = "(path:Union[str,PathLike], device: Optional[Device] = None)")] +#[pyo3(signature = (path, device = None))] /// Load a GGML file. Returns a tuple of three objects: a dictionary mapping tensor names to tensors, /// a dictionary mapping hyperparameter names to hyperparameter values, and a vocabulary. /// &RETURNS&: Tuple[Dict[str,QTensor], Dict[str,Any], List[str]] @@ -1325,7 +1324,7 @@ fn load_ggml( } #[pyfunction] -#[pyo3(text_signature = "(path:Union[str,PathLike], device: Optional[Device] = None)")] +#[pyo3(signature = (path, device = None))] /// Loads a GGUF file. Returns a tuple of two dictionaries: the first maps tensor names to tensors, /// and the second maps metadata keys to metadata values. /// &RETURNS&: Tuple[Dict[str,QTensor], Dict[str,Any]] @@ -1384,7 +1383,7 @@ fn load_gguf( #[pyfunction] #[pyo3( - text_signature = "(path:Union[str,PathLike], tensors:Dict[str,QTensor], metadata:Dict[str,Any])" + signature = (path, tensors, metadata) )] /// Save quanitzed tensors and metadata to a GGUF file. fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) -> PyResult<()> { @@ -1430,7 +1429,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) Ok(v) } let tensors = tensors - .extract::<&PyDict>(py) + .downcast_bound::(py) .map_err(|_| PyErr::new::("expected a dict"))? .iter() .map(|(key, value)| { @@ -1443,7 +1442,7 @@ fn save_gguf(path: &str, tensors: PyObject, metadata: PyObject, py: Python<'_>) .collect::>>()?; let metadata = metadata - .extract::<&PyDict>(py) + .downcast_bound::(py) .map_err(|_| PyErr::new::("expected a dict"))? 
.iter() .map(|(key, value)| { diff --git a/candle-pyo3/src/shape.rs b/candle-pyo3/src/shape.rs index 2668b7331b..b9bc67899d 100644 --- a/candle-pyo3/src/shape.rs +++ b/candle-pyo3/src/shape.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; pub struct PyShape(Vec); impl<'source> pyo3::FromPyObject<'source> for PyShape { - fn extract(ob: &'source PyAny) -> PyResult { + fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult { if ob.is_none() { return Err(PyErr::new::( "Shape cannot be None", @@ -16,10 +16,10 @@ impl<'source> pyo3::FromPyObject<'source> for PyShape { let tuple = ob.downcast::()?; if tuple.len() == 1 { let first_element = tuple.get_item(0)?; - let dims: Vec = pyo3::FromPyObject::extract(first_element)?; + let dims: Vec = pyo3::FromPyObject::extract_bound(&first_element)?; Ok(PyShape(dims)) } else { - let dims: Vec = pyo3::FromPyObject::extract(tuple)?; + let dims: Vec = pyo3::FromPyObject::extract_bound(tuple)?; Ok(PyShape(dims)) } } @@ -36,7 +36,7 @@ impl From for ::candle::Shape { pub struct PyShapeWithHole(Vec); impl<'source> pyo3::FromPyObject<'source> for PyShapeWithHole { - fn extract(ob: &'source PyAny) -> PyResult { + fn extract_bound(ob: &Bound<'source, PyAny>) -> PyResult { if ob.is_none() { return Err(PyErr::new::( "Shape cannot be None", @@ -46,9 +46,9 @@ impl<'source> pyo3::FromPyObject<'source> for PyShapeWithHole { let tuple = ob.downcast::()?; let dims: Vec = if tuple.len() == 1 { let first_element = tuple.get_item(0)?; - pyo3::FromPyObject::extract(first_element)? + pyo3::FromPyObject::extract_bound(&first_element)? } else { - pyo3::FromPyObject::extract(tuple)? + pyo3::FromPyObject::extract_bound(tuple)? }; // Ensure we have only positive numbers and at most one "hole" (-1) From e4a96f9e7c2b88dec33b6076cc9756ac76d44df1 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 6 Oct 2024 23:24:55 +0200 Subject: [PATCH 11/28] Switch to using the MLX matmul by default. (#2547) --- candle-core/src/metal_backend/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 6f560c02ee..34931c9dfd 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -1865,9 +1865,9 @@ impl BackendDevice for MetalDevice { let device = metal::Device::all().swap_remove(ordinal); let command_queue = device.new_command_queue(); let kernels = Arc::new(Kernels::new()); - let use_mlx_mm = match std::env::var("CANDLE_USE_MLX_MM").as_deref() { - Ok("false") | Ok("False") | Ok("FALSE") | Ok("0") | Err(_) => false, - Ok(_) => true, + let use_mlx_mm = match std::env::var("CANDLE_USE_MFA_MM").as_deref() { + Ok("false") | Ok("False") | Ok("FALSE") | Ok("0") | Err(_) => true, + Ok(_) => false, }; let seed = Arc::new(Mutex::new(device.new_buffer_with_data( [299792458].as_ptr() as *const c_void, From edf7668291a30d6c73dd0fb884a74d1d78e5786d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Ant=C3=B3nio?= Date: Mon, 7 Oct 2024 16:30:56 +0100 Subject: [PATCH 12/28] improve (#2548) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index a351ab667f..4c84a09185 100644 --- a/README.md +++ b/README.md @@ -187,6 +187,7 @@ And then head over to - [`candle-sampling`](https://github.com/EricLBuehler/candle-sampling): Sampling techniques for Candle. - [`gpt-from-scratch-rs`](https://github.com/jeroenvlek/gpt-from-scratch-rs): A port of Andrej Karpathy's _Let's build GPT_ tutorial on YouTube showcasing the Candle API on a toy problem. 
- [`candle-einops`](https://github.com/tomsanbear/candle-einops): A pure rust implementation of the python [einops](https://github.com/arogozhnikov/einops) library. +- [`atoma-infer`](https://github.com/atoma-network/atoma-infer): A Rust library for fast inference at scale, leveraging FlashAttention2 for efficient attention computation, PagedAttention for efficient KV-cache memory management, and multi-GPU support. It is OpenAI api compatible. If you have an addition to this list, please submit a pull request. From 937e8eda7419818f8f67408cce50329d8f9c73ae Mon Sep 17 00:00:00 2001 From: Akshay Ballal <61191840+akshayballal95@users.noreply.github.com> Date: Mon, 7 Oct 2024 23:28:21 +0200 Subject: [PATCH 13/28] Add BertForMaskedLM to support SPLADE Models (#2550) * add bert for masked lm * working example * add example readme * Clippy fix. * And apply rustfmt. --------- Co-authored-by: Laurent --- candle-examples/examples/splade/README.md | 28 +++ candle-examples/examples/splade/main.rs | 210 ++++++++++++++++++++++ candle-transformers/src/models/bert.rs | 97 ++++++++++ 3 files changed, 335 insertions(+) create mode 100644 candle-examples/examples/splade/README.md create mode 100644 candle-examples/examples/splade/main.rs diff --git a/candle-examples/examples/splade/README.md b/candle-examples/examples/splade/README.md new file mode 100644 index 0000000000..582cea2750 --- /dev/null +++ b/candle-examples/examples/splade/README.md @@ -0,0 +1,28 @@ +# candle-splade + + SPLADE is a neural retrieval model which learns query/document sparse expansion via the BERT MLM head and sparse regularization. Sparse representations benefit from several advantages compared to dense approaches: efficient use of inverted index, explicit lexical match, interpretability... They also seem to be better at generalizing on out-of-domain data. In this example we can do the following two tasks: + +- Compute sparse embedding for a given query. +- Compute similarities between a set of sentences using sparse embeddings. + +## Sparse Sentence embeddings + +SPLADE is used to compute the sparse embedding for a given query. The model weights +are downloaded from the hub on the first run. This makes use of the BertForMaskedLM model. + +```bash +cargo run --example splade --release -- --prompt "Here is a test sentence" + +> "the out there still house inside position outside stay standing hotel sitting dog animal sit bird cat statue cats" +> [0.10270107, 0.269471, 0.047469813, 0.0016636598, 0.05394874, 0.23105666, 0.037475716, 0.45949644, 0.009062732, 0.06790692, 0.0327835, 0.33122346, 0.16863061, 0.12688516, 0.340983, 0.044972017, 0.47724655, 0.01765311, 0.37331146] +``` + +```bash +cargo run --example splade --release --features + +> score: 0.47 'The new movie is awesome' 'The new movie is so great' +> score: 0.43 'The cat sits outside' 'The cat plays in the garden' +> score: 0.14 'I love pasta' 'Do you like pizza?' 
+> score: 0.11 'A man is playing guitar' 'The cat plays in the garden' +> score: 0.05 'A man is playing guitar' 'A woman watches TV' +``` diff --git a/candle-examples/examples/splade/main.rs b/candle-examples/examples/splade/main.rs new file mode 100644 index 0000000000..aa4c60ac41 --- /dev/null +++ b/candle-examples/examples/splade/main.rs @@ -0,0 +1,210 @@ +use std::path::PathBuf; + +use anyhow::{Error as E, Result}; +use candle::Tensor; +use candle_nn::VarBuilder; +use candle_transformers::models::bert::{self, BertForMaskedLM, Config}; +use clap::Parser; +use hf_hub::{api::sync::Api, Repo, RepoType}; +use tokenizers::{PaddingParams, Tokenizer}; + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Run on CPU rather than on GPU. + #[arg(long)] + cpu: bool, + + /// Enable tracing (generates a trace-timestamp.json file). + #[arg(long)] + tracing: bool, + + /// The model to use, check out available models: https://huggingface.co/models?library=sentence-transformers&sort=trending + #[arg(long)] + model_id: Option, + + #[arg(long, default_value = "main")] + revision: String, + + // Path to the tokenizer file. + #[arg(long)] + tokenizer_file: Option, + + // Path to the weight files. + #[arg(long)] + weight_files: Option, + + // Path to the config file. + #[arg(long)] + config_file: Option, + + /// When set, compute embeddings for this prompt. + #[arg(long)] + prompt: Option, +} + +fn main() -> Result<()> { + let args = Args::parse(); + let api = Api::new()?; + let model_id = match &args.model_id { + Some(model_id) => model_id.to_string(), + None => "prithivida/Splade_PP_en_v1".to_string(), + }; + let repo = api.repo(Repo::with_revision( + model_id, + RepoType::Model, + args.revision, + )); + + let tokenizer_filename = match args.tokenizer_file { + Some(file) => std::path::PathBuf::from(file), + None => repo.get("tokenizer.json")?, + }; + + let config_filename = match args.config_file { + Some(file) => std::path::PathBuf::from(file), + None => repo.get("config.json")?, + }; + + let weights_filename = match args.weight_files { + Some(files) => PathBuf::from(files), + None => match repo.get("model.safetensors") { + Ok(safetensors) => safetensors, + Err(_) => match repo.get("pytorch_model.bin") { + Ok(pytorch_model) => pytorch_model, + Err(e) => { + return Err(anyhow::Error::msg(format!("Model weights not found. The weights should either be a `model.safetensors` or `pytorch_model.bin` file. Error: {}", e))); + } + }, + }, + }; + + let config = std::fs::read_to_string(config_filename)?; + let config: Config = serde_json::from_str(&config)?; + let mut tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?; + + let device = candle_examples::device(args.cpu)?; + let dtype = bert::DTYPE; + + let vb = if weights_filename.ends_with("model.safetensors") { + unsafe { VarBuilder::from_mmaped_safetensors(&[weights_filename], dtype, &device).unwrap() } + } else { + println!("Loading weights from pytorch_model.bin"); + VarBuilder::from_pth(&weights_filename, dtype, &device).unwrap() + }; + let model = BertForMaskedLM::load(vb, &config)?; + + if let Some(prompt) = args.prompt { + let tokenizer = tokenizer + .with_padding(None) + .with_truncation(None) + .map_err(E::msg)?; + let tokens = tokenizer + .encode(prompt, true) + .map_err(E::msg)? 
+ .get_ids() + .to_vec(); + + let token_ids = Tensor::new(&tokens[..], &device)?.unsqueeze(0)?; + let token_type_ids = token_ids.zeros_like()?; + + let ys = model.forward(&token_ids, &token_type_ids, None)?; + let vec = Tensor::log( + &Tensor::try_from(1.0)? + .to_dtype(dtype)? + .to_device(&device)? + .broadcast_add(&ys.relu()?)?, + )? + .max(1)?; + let vec = normalize_l2(&vec)?; + + let vec = vec.squeeze(0)?.to_vec1::()?; + + let indices = (0..vec.len()) + .filter(|&i| vec[i] != 0.0) + .map(|x| x as u32) + .collect::>(); + + let tokens = tokenizer.decode(&indices, true).unwrap(); + println!("{tokens:?}"); + let values = indices.iter().map(|&i| vec[i as usize]).collect::>(); + println!("{values:?}"); + } else { + let sentences = [ + "The cat sits outside", + "A man is playing guitar", + "I love pasta", + "The new movie is awesome", + "The cat plays in the garden", + "A woman watches TV", + "The new movie is so great", + "Do you like pizza?", + ]; + + let n_sentences = sentences.len(); + if let Some(pp) = tokenizer.get_padding_mut() { + pp.strategy = tokenizers::PaddingStrategy::BatchLongest + } else { + let pp = PaddingParams { + strategy: tokenizers::PaddingStrategy::BatchLongest, + ..Default::default() + }; + tokenizer.with_padding(Some(pp)); + } + let tokens = tokenizer + .encode_batch(sentences.to_vec(), true) + .map_err(E::msg)?; + let token_ids = tokens + .iter() + .map(|tokens| { + let tokens = tokens.get_ids().to_vec(); + Ok(Tensor::new(tokens.as_slice(), &device)?) + }) + .collect::>>()?; + let attention_mask = tokens + .iter() + .map(|tokens| { + let tokens = tokens.get_attention_mask().to_vec(); + Ok(Tensor::new(tokens.as_slice(), &device)?) + }) + .collect::>>()?; + + let token_ids = Tensor::stack(&token_ids, 0)?; + let attention_mask = Tensor::stack(&attention_mask, 0)?; + let token_type_ids = token_ids.zeros_like()?; + + let ys = model.forward(&token_ids, &token_type_ids, Some(&attention_mask))?; + let vector = Tensor::log( + &Tensor::try_from(1.0)? + .to_dtype(dtype)? + .to_device(&device)? + .broadcast_add(&ys.relu()?)?, + )?; + let vector = vector + .broadcast_mul(&attention_mask.unsqueeze(2)?.to_dtype(dtype)?)? + .max(1)?; + let vec = normalize_l2(&vector)?; + let mut similarities = vec![]; + for i in 0..n_sentences { + let e_i = vec.get(i)?; + for j in (i + 1)..n_sentences { + let e_j = vec.get(j)?; + let sum_ij = (&e_i * &e_j)?.sum_all()?.to_scalar::()?; + let sum_i2 = (&e_i * &e_i)?.sum_all()?.to_scalar::()?; + let sum_j2 = (&e_j * &e_j)?.sum_all()?.to_scalar::()?; + let cosine_similarity = sum_ij / (sum_i2 * sum_j2).sqrt(); + similarities.push((cosine_similarity, i, j)) + } + } + similarities.sort_by(|u, v| v.0.total_cmp(&u.0)); + for &(score, i, j) in similarities[..5].iter() { + println!("score: {score:.2} '{}' '{}'", sentences[i], sentences[j]) + } + } + + Ok(()) +} + +pub fn normalize_l2(v: &Tensor) -> Result { + Ok(v.broadcast_div(&v.sqr()?.sum_keepdim(1)?.sqrt()?)?) +} diff --git a/candle-transformers/src/models/bert.rs b/candle-transformers/src/models/bert.rs index 354048de97..bdc0385deb 100644 --- a/candle-transformers/src/models/bert.rs +++ b/candle-transformers/src/models/bert.rs @@ -504,3 +504,100 @@ fn get_extended_attention_mask(attention_mask: &Tensor, dtype: DType) -> Result< (attention_mask.ones_like()? - &attention_mask)? .broadcast_mul(&Tensor::try_from(f32::MIN)?.to_device(attention_mask.device())?) 
} + +//https://github.com/huggingface/transformers/blob/1bd604d11c405dfb8b78bda4062d88fc75c17de0/src/transformers/models/bert/modeling_bert.py#L752-L766 +struct BertPredictionHeadTransform { + dense: Linear, + activation: HiddenActLayer, + layer_norm: LayerNorm, +} + +impl BertPredictionHeadTransform { + fn load(vb: VarBuilder, config: &Config) -> Result { + let dense = linear(config.hidden_size, config.hidden_size, vb.pp("dense"))?; + let activation = HiddenActLayer::new(config.hidden_act); + let layer_norm = layer_norm( + config.hidden_size, + config.layer_norm_eps, + vb.pp("LayerNorm"), + )?; + Ok(Self { + dense, + activation, + layer_norm, + }) + } +} + +impl Module for BertPredictionHeadTransform { + fn forward(&self, hidden_states: &Tensor) -> Result { + let hidden_states = self + .activation + .forward(&self.dense.forward(hidden_states)?)?; + self.layer_norm.forward(&hidden_states) + } +} + +// https://github.com/huggingface/transformers/blob/1bd604d11c405dfb8b78bda4062d88fc75c17de0/src/transformers/models/bert/modeling_bert.py#L769C1-L790C1 +pub struct BertLMPredictionHead { + transform: BertPredictionHeadTransform, + decoder: Linear, +} + +impl BertLMPredictionHead { + pub fn load(vb: VarBuilder, config: &Config) -> Result { + let transform = BertPredictionHeadTransform::load(vb.pp("transform"), config)?; + let decoder = linear(config.hidden_size, config.vocab_size, vb.pp("decoder"))?; + Ok(Self { transform, decoder }) + } +} + +impl Module for BertLMPredictionHead { + fn forward(&self, hidden_states: &Tensor) -> Result { + self.decoder + .forward(&self.transform.forward(hidden_states)?) + } +} + +// https://github.com/huggingface/transformers/blob/1bd604d11c405dfb8b78bda4062d88fc75c17de0/src/transformers/models/bert/modeling_bert.py#L792 +pub struct BertOnlyMLMHead { + predictions: BertLMPredictionHead, +} + +impl BertOnlyMLMHead { + pub fn load(vb: VarBuilder, config: &Config) -> Result { + let predictions = BertLMPredictionHead::load(vb.pp("predictions"), config)?; + Ok(Self { predictions }) + } +} + +impl Module for BertOnlyMLMHead { + fn forward(&self, sequence_output: &Tensor) -> Result { + self.predictions.forward(sequence_output) + } +} + +pub struct BertForMaskedLM { + bert: BertModel, + cls: BertOnlyMLMHead, +} + +impl BertForMaskedLM { + pub fn load(vb: VarBuilder, config: &Config) -> Result { + let bert = BertModel::load(vb.pp("bert"), config)?; + let cls = BertOnlyMLMHead::load(vb.pp("cls"), config)?; + Ok(Self { bert, cls }) + } + + pub fn forward( + &self, + input_ids: &Tensor, + token_type_ids: &Tensor, + attention_mask: Option<&Tensor>, + ) -> Result { + let sequence_output = self + .bert + .forward(input_ids, token_type_ids, attention_mask)?; + self.cls.forward(&sequence_output) + } +} From 0d96ec31e8be03f844ed0aed636d6217dee9c7bc Mon Sep 17 00:00:00 2001 From: SethWen Date: Thu, 10 Oct 2024 21:18:55 +0800 Subject: [PATCH 14/28] feat: intergrate chinese clip and add example (#2555) * start to impl chinese clip * impl vision model * copy code from bert * refactor use * refactor use again * fix text model * refactor * try to fix text model * tuning * tuning chinese clip * delete useless code * revert code * Clippy fixes. * Also apply cargo fmt. 
--------- Co-authored-by: laurent --- candle-examples/examples/chinese_clip/main.rs | 224 ++++++++ .../src/models/chinese_clip/mod.rs | 208 +++++++ .../src/models/chinese_clip/text_model.rs | 540 ++++++++++++++++++ .../src/models/chinese_clip/vision_model.rs | 385 +++++++++++++ candle-transformers/src/models/mod.rs | 1 + 5 files changed, 1358 insertions(+) create mode 100644 candle-examples/examples/chinese_clip/main.rs create mode 100644 candle-transformers/src/models/chinese_clip/mod.rs create mode 100644 candle-transformers/src/models/chinese_clip/text_model.rs create mode 100644 candle-transformers/src/models/chinese_clip/vision_model.rs diff --git a/candle-examples/examples/chinese_clip/main.rs b/candle-examples/examples/chinese_clip/main.rs new file mode 100644 index 0000000000..5cee1fc81e --- /dev/null +++ b/candle-examples/examples/chinese_clip/main.rs @@ -0,0 +1,224 @@ +#[cfg(feature = "mkl")] +extern crate intel_mkl_src; + +#[cfg(feature = "accelerate")] +extern crate accelerate_src; + +use candle::{DType, Device, Tensor}; +use candle_nn as nn; +use candle_transformers::models::chinese_clip::{ChineseClipConfig, ChineseClipModel}; +use clap::Parser; +use tokenizers::Tokenizer; + +#[derive(Parser)] +struct Args { + #[arg(long)] + model: Option, + + #[arg(long)] + tokenizer: Option, + + #[arg(long, use_value_delimiter = true)] + images: Option>, + + #[arg(long)] + cpu: bool, + + #[arg(long, use_value_delimiter = true)] + sequences: Option>, +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + + tracing_subscriber::fmt::init(); + + let device = candle_examples::device(args.cpu)?; + let var = load_weights(args.model, &device)?; + let clip_model = ChineseClipModel::new(var, &ChineseClipConfig::clip_vit_base_patch16())?; + tracing::info!("Transformer loaded. "); + + let (pixel_values, vec_imgs) = load_images(args.images, &device)?; + tracing::info!("Images loaded. "); + + let tokenizer = load_tokenizer()?; + let (input_ids, type_ids, attention_mask, text_sequences) = + tokenize_sequences(args.sequences, &tokenizer, &device)?; + + tracing::info!("Computing ... "); + let (_logits_per_text, logits_per_image) = clip_model.forward( + &pixel_values, + &input_ids, + Some(&type_ids), + Some(&attention_mask), + )?; + let softmax_image = nn::ops::softmax(&logits_per_image, 1)?; + + let softmax_image_vec = softmax_image.flatten_all()?.to_vec1::()?; + + let probability_vec = softmax_image_vec + .iter() + .map(|v| v * 100.0) + .collect::>(); + + let probability_per_image = probability_vec.len() / vec_imgs.len(); + + for (i, img) in vec_imgs.iter().enumerate() { + let start = i * probability_per_image; + let end = start + probability_per_image; + let prob = &probability_vec[start..end]; + tracing::info!("\n\nResults for image: {}\n", img); + + for (i, p) in prob.iter().enumerate() { + tracing::info!("Probability: {:.4}% Text: {} ", p, text_sequences[i]); + } + } + + Ok(()) +} + +pub fn load_weights(model: Option, device: &Device) -> anyhow::Result { + let model_file = match model { + None => { + let api = hf_hub::api::sync::Api::new()?; + let repo = hf_hub::Repo::with_revision( + "OFA-Sys/chinese-clip-vit-base-patch16".to_string(), + hf_hub::RepoType::Model, + "refs/pr/3".to_string(), + ); + let api = api.repo(repo); + api.get("model.safetensors")? + } + Some(model) => model.into(), + }; + + Ok(unsafe { nn::VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, device)? 
}) +} + +pub fn load_tokenizer() -> anyhow::Result { + let tokenizer_file = { + let api = hf_hub::api::sync::Api::new()?; + let repo = hf_hub::Repo::with_revision( + "OFA-Sys/chinese-clip-vit-base-patch16".to_string(), + hf_hub::RepoType::Model, + "refs/pr/3".to_string(), + ); + let api = api.repo(repo); + api.get("tokenizer.json")? + }; + + Tokenizer::from_file(tokenizer_file).map_err(anyhow::Error::msg) +} + +pub fn tokenize_sequences( + sequences: Option>, + tokenizer: &Tokenizer, + device: &Device, +) -> anyhow::Result<(Tensor, Tensor, Tensor, Vec)> { + let vec_seq = match sequences { + Some(seq) => seq, + None => vec![ + "自行车比赛".to_string(), + "两只猫咪".to_string(), + "拿着蜡烛的机器人".to_string(), + ], + }; + + let mut input_ids = vec![]; + let mut type_ids = vec![]; + let mut attention_mask = vec![]; + let mut max_len = 0; + + for seq in vec_seq.clone() { + let encoding = tokenizer.encode(seq, true).map_err(anyhow::Error::msg)?; + input_ids.push(encoding.get_ids().to_vec()); + type_ids.push(encoding.get_type_ids().to_vec()); + attention_mask.push(encoding.get_attention_mask().to_vec()); + if encoding.get_ids().len() > max_len { + max_len = encoding.get_ids().len(); + } + } + + let pad_id = *tokenizer + .get_vocab(true) + .get("[PAD]") + .ok_or(anyhow::Error::msg("No pad token"))?; + + let input_ids: Vec> = input_ids + .iter_mut() + .map(|item| { + item.extend(vec![pad_id; max_len - item.len()]); + item.to_vec() + }) + .collect(); + + let type_ids: Vec> = type_ids + .iter_mut() + .map(|item| { + item.extend(vec![0; max_len - item.len()]); + item.to_vec() + }) + .collect(); + + let attention_mask: Vec> = attention_mask + .iter_mut() + .map(|item| { + item.extend(vec![0; max_len - item.len()]); + item.to_vec() + }) + .collect(); + + let input_ids = Tensor::new(input_ids, device)?; + let type_ids = Tensor::new(type_ids, device)?; + let attention_mask = Tensor::new(attention_mask, device)?; + + Ok((input_ids, type_ids, attention_mask, vec_seq)) +} + +pub fn load_images( + images: Option>, + device: &Device, +) -> anyhow::Result<(Tensor, Vec)> { + let vec_imgs = match images { + Some(imgs) => imgs, + None => vec![ + "candle-examples/examples/stable-diffusion/assets/stable-diffusion-xl.jpg".to_string(), + "candle-examples/examples/yolo-v8/assets/bike.jpg".to_string(), + ], + }; + + let mut images = vec![]; + + for path in vec_imgs.iter() { + let tensor = load_image(path, 224, device)?; + images.push(tensor); + } + + let images = Tensor::stack(&images, 0)?.to_device(device)?; + Ok((images, vec_imgs)) +} + +fn load_image>( + path: T, + image_size: usize, + device: &Device, +) -> anyhow::Result { + let img = image::ImageReader::open(path)?.decode()?; + let (height, width) = (image_size, image_size); + let img = img.resize_to_fill( + width as u32, + height as u32, + image::imageops::FilterType::Triangle, + ); + + let img = img.to_rgb8().into_raw(); + let img = Tensor::from_vec(img, (height, width, 3), device)?.permute((2, 0, 1))?; + let mean = Tensor::new(&[0.48145466f32, 0.4578275, 0.40821073], device)?.reshape((3, 1, 1))?; + let std = + Tensor::new(&[0.26862954f32, 0.261_302_6, 0.275_777_1], device)?.reshape((3, 1, 1))?; + let img = (img.to_dtype(DType::F32)? / 255.)? + .broadcast_sub(&mean)? + .broadcast_div(&std)?; + + Ok(img) +} diff --git a/candle-transformers/src/models/chinese_clip/mod.rs b/candle-transformers/src/models/chinese_clip/mod.rs new file mode 100644 index 0000000000..88472f0b88 --- /dev/null +++ b/candle-transformers/src/models/chinese_clip/mod.rs @@ -0,0 +1,208 @@ +//! 
Chinese contrastive Language-Image Pre-Training +//! +//! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/OFA-Sys/Chinese-CLIP +//! https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py + +use candle::{Module, Result, Tensor, D}; +use candle_nn as nn; + +use text_model::ChineseClipTextTransformer; +use vision_model::ChineseClipVisionTransformer; + +pub mod text_model; +pub mod vision_model; + +#[derive(Debug, Clone, Copy)] +pub enum Activation { + QuickGelu, + Gelu, + GeluNew, + Relu, +} + +impl From for Activation { + fn from(value: String) -> Self { + match value.as_str() { + "quick_gelu" => Activation::QuickGelu, + "gelu" => Activation::Gelu, + "gelu_new" => Activation::GeluNew, + "relu" => Activation::Relu, + _ => panic!("Invalid activation function: {}", value), + } + } +} + +impl Module for Activation { + fn forward(&self, xs: &Tensor) -> Result { + match self { + Activation::QuickGelu => xs * nn::ops::sigmoid(&(xs * 1.702f64)?)?, + Activation::Gelu => xs.gelu_erf(), + Activation::GeluNew => xs.gelu(), + Activation::Relu => xs.relu(), + } + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipConfig { + pub text_config: text_model::ChineseClipTextConfig, + pub vision_config: vision_model::ChineseClipVisionConfig, + pub projection_dim: usize, + pub logit_scale_init_value: f32, + pub image_size: usize, +} + +impl ChineseClipConfig { + /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json + pub fn clip_vit_base_patch16() -> Self { + let text_config = text_model::ChineseClipTextConfig::clip_vit_base_patch16(); + let vision_config = vision_model::ChineseClipVisionConfig::clip_vit_base_patch16(); + + Self { + text_config, + vision_config, + projection_dim: 512, + logit_scale_init_value: 2.6592, + image_size: 512, + } + } +} + +#[derive(Clone, Debug)] +pub enum EncoderConfig { + Text(text_model::ChineseClipTextConfig), + Vision(vision_model::ChineseClipVisionConfig), +} + +impl EncoderConfig { + pub fn embed_dim(&self) -> usize { + match self { + Self::Text(c) => c.hidden_size, + Self::Vision(c) => c.hidden_size, + } + } + + pub fn num_attention_heads(&self) -> usize { + match self { + Self::Text(c) => c.num_attention_heads, + Self::Vision(c) => c.num_attention_heads, + } + } + + pub fn intermediate_size(&self) -> usize { + match self { + Self::Text(c) => c.intermediate_size, + Self::Vision(c) => c.intermediate_size, + } + } + + pub fn num_hidden_layers(&self) -> usize { + match self { + Self::Text(c) => c.num_hidden_layers, + Self::Vision(c) => c.num_hidden_layers, + } + } + + pub fn activation(&self) -> Activation { + match self { + Self::Text(c) => c.hidden_act, + Self::Vision(c) => c.hidden_act, + } + } + + pub fn layer_norm_eps(&self) -> f64 { + match self { + Self::Text(c) => c.layer_norm_eps, + Self::Vision(c) => c.layer_norm_eps, + } + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipModel { + text_model: ChineseClipTextTransformer, + vision_model: ChineseClipVisionTransformer, + visual_projection: nn::Linear, + text_projection: nn::Linear, + logit_scale: Tensor, +} + +impl ChineseClipModel { + pub fn new(vs: nn::VarBuilder, c: &ChineseClipConfig) -> Result { + let text_model = ChineseClipTextTransformer::new(vs.pp("text_model"), &c.text_config)?; + + let vision_model = + ChineseClipVisionTransformer::new(vs.pp("vision_model"), 
&c.vision_config)?; + + let vision_embed_dim = c.vision_config.hidden_size; + let vision_projection = nn::linear_no_bias( + vision_embed_dim, + c.projection_dim, + vs.pp("visual_projection"), + )?; + + let text_embed_dim = c.text_config.hidden_size; + let text_projection = + nn::linear_no_bias(text_embed_dim, c.projection_dim, vs.pp("text_projection"))?; + + let logit_scale = if vs.contains_tensor("logit_scale") { + vs.get(&[], "logit_scale")? + } else { + Tensor::new(&[c.logit_scale_init_value], vs.device())? + }; + + Ok(Self { + text_model, + vision_model, + visual_projection: vision_projection, + text_projection, + logit_scale, + }) + } + + pub fn get_text_features( + &self, + input_ids: &Tensor, + token_type_ids: Option<&Tensor>, + attention_mask: Option<&Tensor>, + ) -> Result { + let output = self + .text_model + .forward(input_ids, token_type_ids, attention_mask)?; + self.text_projection.forward(&output) + } + + pub fn get_image_features(&self, pixel_values: &Tensor) -> Result { + pixel_values + .apply(&self.vision_model)? + .apply(&self.visual_projection) + } + + pub fn forward( + &self, + pixel_values: &Tensor, + input_ids: &Tensor, + token_type_ids: Option<&Tensor>, + attention_mask: Option<&Tensor>, + ) -> Result<(Tensor, Tensor)> { + let image_features = self.get_image_features(pixel_values)?; + let text_features = self.get_text_features(input_ids, token_type_ids, attention_mask)?; + + let image_features_normalized = div_l2_norm(&image_features)?; + let text_features_normalized = div_l2_norm(&text_features)?; + + let logits_per_text = text_features_normalized.matmul(&image_features_normalized.t()?)?; + let logit_scale = self.logit_scale.exp()?; + let logits_per_text = logits_per_text.broadcast_mul(&logit_scale)?; + let logits_per_image = logits_per_text.t()?; + Ok((logits_per_text, logits_per_image)) + } +} + +pub fn div_l2_norm(v: &Tensor) -> Result { + let l2_norm = v.sqr()?.sum_keepdim(D::Minus1)?.sqrt()?; + v.broadcast_div(&l2_norm) +} diff --git a/candle-transformers/src/models/chinese_clip/text_model.rs b/candle-transformers/src/models/chinese_clip/text_model.rs new file mode 100644 index 0000000000..19499709a7 --- /dev/null +++ b/candle-transformers/src/models/chinese_clip/text_model.rs @@ -0,0 +1,540 @@ +//! Chinese contrastive Language-Image Pre-Training +//! +//! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/OFA-Sys/Chinese-CLIP +//! https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py + +use candle::{DType, Device, IndexOp, Module, Result, Tensor}; +use candle_nn as nn; + +use super::Activation; + +/// Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For +/// positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to +/// [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). +/// For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models +/// with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). 
+#[derive(Clone, Debug)] +pub enum PositionEmbeddingType { + Absolute, + RelativeKey, + RelativeKeyQuery, +} + +#[derive(Clone, Debug)] +pub struct ChineseClipTextConfig { + pub vocab_size: usize, + pub hidden_size: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + pub intermediate_size: usize, + pub hidden_act: Activation, + pub hidden_dropout_prob: f32, + pub attention_probs_dropout_prob: f64, + pub max_position_embeddings: usize, + pub type_vocab_size: usize, + pub initializer_range: f64, + pub initializer_factor: f64, + pub layer_norm_eps: f64, + pub pad_token_id: usize, + pub position_embedding_type: PositionEmbeddingType, + pub use_cache: bool, +} + +impl Default for ChineseClipTextConfig { + fn default() -> Self { + Self { + vocab_size: 30522, + hidden_size: 768, + num_hidden_layers: 12, + num_attention_heads: 12, + intermediate_size: 3072, + hidden_act: Activation::Gelu, + hidden_dropout_prob: 0.1, + attention_probs_dropout_prob: 0.1, + max_position_embeddings: 512, + type_vocab_size: 2, + initializer_range: 0.02, + initializer_factor: 1.0, + layer_norm_eps: 1e-12, + pad_token_id: 0, + position_embedding_type: PositionEmbeddingType::Absolute, + use_cache: true, + } + } +} + +impl ChineseClipTextConfig { + /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json + pub fn clip_vit_base_patch16() -> Self { + Self { + vocab_size: 21128, + hidden_size: 768, + num_hidden_layers: 12, + num_attention_heads: 12, + intermediate_size: 3072, + hidden_act: Activation::Gelu, + hidden_dropout_prob: 0.1, + attention_probs_dropout_prob: 0.1, + max_position_embeddings: 512, + type_vocab_size: 2, + initializer_range: 0.02, + initializer_factor: 1.0, + layer_norm_eps: 1e-12, + pad_token_id: 0, + position_embedding_type: PositionEmbeddingType::Absolute, + use_cache: true, + } + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipTextEmbeddings { + word_embeddings: nn::Embedding, + position_embeddings: nn::Embedding, + token_type_embeddings: nn::Embedding, + layer_norm: nn::LayerNorm, + dropout: nn::Dropout, + position_embedding_type: PositionEmbeddingType, + position_ids: Tensor, + token_type_ids: Tensor, +} + +impl ChineseClipTextEmbeddings { + pub fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let word_embeddings = nn::embedding( + config.vocab_size, + config.hidden_size, + var.pp("word_embeddings"), + )?; + let position_embeddings = nn::embedding( + config.max_position_embeddings, + config.hidden_size, + var.pp("position_embeddings"), + )?; + let token_type_embeddings = nn::embedding( + config.type_vocab_size, + config.hidden_size, + var.pp("token_type_embeddings"), + )?; + let layer_norm = nn::layer_norm::( + config.hidden_size, + config.layer_norm_eps, + var.pp("LayerNorm"), + )?; + let dropout = nn::Dropout::new(config.hidden_dropout_prob); + let position_ids = + Tensor::arange(0u32, config.max_position_embeddings as u32, var.device())? 
+ .unsqueeze(0)?; + let token_type_ids = Tensor::zeros(position_ids.shape(), DType::I64, var.device())?; + + Ok(Self { + word_embeddings, + position_embeddings, + token_type_embeddings, + layer_norm, + dropout, + position_embedding_type: config.position_embedding_type.clone(), + position_ids, + token_type_ids, + }) + } + + fn forward(&self, xs: &Tensor, token_type_ids: Option<&Tensor>) -> Result { + let (_batch_size, seq_length) = xs.dims2()?; + let position_ids = (0..seq_length as u32).collect::>(); + let position_ids = self.position_ids.index_select( + &Tensor::new(&position_ids[..], self.position_ids.device())?, + 1, + )?; + + let word_embeddings = self.word_embeddings.forward(xs)?; + + let token_type_ids = match token_type_ids { + Some(token_type_ids) => token_type_ids, + None => &self.token_type_ids.i((.., 0..seq_length))?, + }; + let token_type_ids = token_type_ids.expand(xs.shape())?; + let token_type_embeddings = self.token_type_embeddings.forward(&token_type_ids)?; + + let embeddings = (&word_embeddings + token_type_embeddings)?; + let embeddings = match self.position_embedding_type { + PositionEmbeddingType::Absolute => { + let position_embeddings = self.position_embeddings.forward(&position_ids)?; + let position_embeddings = position_embeddings.expand(embeddings.shape())?; + (embeddings + position_embeddings)? + } + _ => embeddings, + }; + let embeddings = self.layer_norm.forward(&embeddings)?; + let embeddings = self.dropout.forward(&embeddings, false)?; + Ok(embeddings) + } +} + +/// Copied from [`crate::models::bert::BertSelfOutput`] to [`ChineseClipTextSelfOutput`] +#[derive(Clone, Debug)] +struct ChineseClipTextSelfOutput { + dense: nn::Linear, + layer_norm: nn::LayerNorm, + dropout: nn::Dropout, + span: tracing::Span, +} + +impl ChineseClipTextSelfOutput { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let dense = nn::linear(config.hidden_size, config.hidden_size, var.pp("dense"))?; + let layer_norm = nn::layer_norm( + config.hidden_size, + config.layer_norm_eps, + var.pp("LayerNorm"), + )?; + let dropout = nn::Dropout::new(config.hidden_dropout_prob); + Ok(Self { + dense, + layer_norm, + dropout, + span: tracing::span!(tracing::Level::TRACE, "self-out"), + }) + } + + fn forward(&self, hidden_states: &Tensor, input_tensor: &Tensor) -> Result { + let _enter = self.span.enter(); + let hidden_states = self.dense.forward(hidden_states)?; + let hidden_states = self.dropout.forward(&hidden_states, false)?; + self.layer_norm.forward(&(hidden_states + input_tensor)?) 
+ } +} + +/// Copied from [`crate::models::bert::BertSelfAttention`] to [`ChineseClipTextSelfAttention`] +#[derive(Clone, Debug)] +struct ChineseClipTextSelfAttention { + query: nn::Linear, + key: nn::Linear, + value: nn::Linear, + dropout: nn::Dropout, + num_attention_heads: usize, + attention_head_size: usize, + span: tracing::Span, + span_softmax: tracing::Span, +} + +impl ChineseClipTextSelfAttention { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let attention_head_size = config.hidden_size / config.num_attention_heads; + let all_head_size = config.num_attention_heads * attention_head_size; + let dropout = nn::Dropout::new(config.hidden_dropout_prob); + let hidden_size = config.hidden_size; + let query = nn::linear(hidden_size, all_head_size, var.pp("query"))?; + let value = nn::linear(hidden_size, all_head_size, var.pp("value"))?; + let key = nn::linear(hidden_size, all_head_size, var.pp("key"))?; + Ok(Self { + query, + key, + value, + dropout, + num_attention_heads: config.num_attention_heads, + attention_head_size, + span: tracing::span!(tracing::Level::TRACE, "self-attn"), + span_softmax: tracing::span!(tracing::Level::TRACE, "softmax"), + }) + } + + fn transpose_for_scores(&self, xs: &Tensor) -> Result { + let mut new_x_shape = xs.dims().to_vec(); + new_x_shape.pop(); + new_x_shape.push(self.num_attention_heads); + new_x_shape.push(self.attention_head_size); + let xs = xs.reshape(new_x_shape.as_slice())?.transpose(1, 2)?; + xs.contiguous() + } + + fn forward(&self, hidden_states: &Tensor, attention_mask: &Tensor) -> Result { + let _enter = self.span.enter(); + let query_layer = self.query.forward(hidden_states)?; + let key_layer = self.key.forward(hidden_states)?; + let value_layer = self.value.forward(hidden_states)?; + + let query_layer = self.transpose_for_scores(&query_layer)?; + let key_layer = self.transpose_for_scores(&key_layer)?; + let value_layer = self.transpose_for_scores(&value_layer)?; + + let attention_scores = query_layer.matmul(&key_layer.t()?)?; + let attention_scores = (attention_scores / (self.attention_head_size as f64).sqrt())?; + let attention_scores = attention_scores.broadcast_add(attention_mask)?; + let attention_probs = { + let _enter_sm = self.span_softmax.enter(); + nn::ops::softmax(&attention_scores, candle::D::Minus1)? 
+ }; + let attention_probs = self.dropout.forward(&attention_probs, false)?; + + let context_layer = attention_probs.matmul(&value_layer)?; + let context_layer = context_layer.transpose(1, 2)?.contiguous()?; + let context_layer = context_layer.flatten_from(candle::D::Minus2)?; + Ok(context_layer) + } +} + +/// Copied from [`crate::models::bert::BertAttention`] to [`ChineseClipTextAttention`] +#[derive(Clone, Debug)] +struct ChineseClipTextAttention { + self_attention: ChineseClipTextSelfAttention, + self_output: ChineseClipTextSelfOutput, + span: tracing::Span, +} + +impl ChineseClipTextAttention { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let self_attention = ChineseClipTextSelfAttention::new(var.pp("self"), config)?; + let self_output = ChineseClipTextSelfOutput::new(var.pp("output"), config)?; + Ok(Self { + self_attention, + self_output, + span: tracing::span!(tracing::Level::TRACE, "attn"), + }) + } + + fn forward(&self, hidden_states: &Tensor, attention_mask: &Tensor) -> Result { + let _enter = self.span.enter(); + let self_outputs = self.self_attention.forward(hidden_states, attention_mask)?; + let attention_output = self.self_output.forward(&self_outputs, hidden_states)?; + Ok(attention_output) + } +} + +type HiddenActLayer = Activation; + +/// Copied from [`crate::models::bert::BertIntermediate`] to [`ChineseClipTextIntermediate`] +#[derive(Clone, Debug)] +struct ChineseClipTextIntermediate { + dense: nn::Linear, + intermediate_act: HiddenActLayer, + span: tracing::Span, +} + +impl ChineseClipTextIntermediate { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let dense = nn::linear( + config.hidden_size, + config.intermediate_size, + var.pp("dense"), + )?; + Ok(Self { + dense, + intermediate_act: config.hidden_act, + span: tracing::span!(tracing::Level::TRACE, "inter"), + }) + } +} + +impl Module for ChineseClipTextIntermediate { + fn forward(&self, hidden_states: &Tensor) -> Result { + let _enter = self.span.enter(); + let hidden_states = self.dense.forward(hidden_states)?; + let ys = self.intermediate_act.forward(&hidden_states)?; + Ok(ys) + } +} + +/// Copied from [`crate::models::bert::BertOutput`] to [`ChineseClipTextOutput`] +#[derive(Clone, Debug)] +struct ChineseClipTextOutput { + dense: nn::Linear, + layer_norm: nn::LayerNorm, + dropout: nn::Dropout, + span: tracing::Span, +} + +impl ChineseClipTextOutput { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let dense = nn::linear( + config.intermediate_size, + config.hidden_size, + var.pp("dense"), + )?; + let layer_norm = nn::layer_norm( + config.hidden_size, + config.layer_norm_eps, + var.pp("LayerNorm"), + )?; + let dropout = nn::Dropout::new(config.hidden_dropout_prob); + Ok(Self { + dense, + layer_norm, + dropout, + span: tracing::span!(tracing::Level::TRACE, "out"), + }) + } + + fn forward(&self, hidden_states: &Tensor, input_tensor: &Tensor) -> Result { + let _enter = self.span.enter(); + let hidden_states = self.dense.forward(hidden_states)?; + let hidden_states = self.dropout.forward(&hidden_states, false)?; + self.layer_norm.forward(&(hidden_states + input_tensor)?) 
+ } +} + +/// Copied from [`crate::models::bert::BertLayer`] to [`ChineseClipTextLayer`] +#[derive(Clone, Debug)] +struct ChineseClipTextLayer { + attention: ChineseClipTextAttention, + intermediate: ChineseClipTextIntermediate, + output: ChineseClipTextOutput, + span: tracing::Span, +} + +impl ChineseClipTextLayer { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let attention = ChineseClipTextAttention::new(var.pp("attention"), config)?; + let intermediate = ChineseClipTextIntermediate::new(var.pp("intermediate"), config)?; + let output = ChineseClipTextOutput::new(var.pp("output"), config)?; + Ok(Self { + attention, + intermediate, + output, + span: tracing::span!(tracing::Level::TRACE, "layer"), + }) + } + + fn forward(&self, hidden_states: &Tensor, attention_mask: &Tensor) -> Result { + let _enter = self.span.enter(); + let attention_output = self.attention.forward(hidden_states, attention_mask)?; + // https://github.com/huggingface/transformers/blob/6eedfa6dd15dc1e22a55ae036f681914e5a0d9a1/src/transformers/models/bert/modeling_bert.py#L523 + let intermediate_output = self.intermediate.forward(&attention_output)?; + let layer_output = self + .output + .forward(&intermediate_output, &attention_output)?; + Ok(layer_output) + } +} + +#[derive(Clone, Debug)] +struct Tanh; + +impl Tanh { + pub fn new() -> Self { + Self {} + } +} +impl Module for Tanh { + fn forward(&self, xs: &Tensor) -> Result { + xs.tanh() + } +} + +#[derive(Clone, Debug)] +struct ChineseClipTextPooler { + dense: nn::Linear, + activation: Tanh, +} + +impl ChineseClipTextPooler { + pub fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let dense = nn::linear(config.hidden_size, config.hidden_size, var.pp("dense"))?; + let activation = Tanh::new(); + Ok(Self { dense, activation }) + } +} + +impl Module for ChineseClipTextPooler { + fn forward(&self, hidden_states: &Tensor) -> Result { + let first_token_tensor = hidden_states.i((.., 0))?; + let pooled_output = self.dense.forward(&first_token_tensor)?; + let pooled_output = self.activation.forward(&pooled_output)?; + Ok(pooled_output) + } +} + +#[derive(Clone, Debug)] +struct ChineseClipTextEncoder { + layers: Vec, + span: tracing::Span, +} + +impl ChineseClipTextEncoder { + fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let layers = (0..config.num_hidden_layers) + .map(|index| ChineseClipTextLayer::new(var.pp(format!("layer.{index}")), config)) + .collect::>>()?; + let span = tracing::span!(tracing::Level::TRACE, "encoder"); + Ok(ChineseClipTextEncoder { layers, span }) + } + + fn forward(&self, hidden_states: &Tensor, attention_mask: &Tensor) -> Result { + let _enter = self.span.enter(); + let mut hidden_states = hidden_states.clone(); + // Use a loop rather than a fold as it's easier to modify when adding debug/... + for layer in self.layers.iter() { + hidden_states = layer.forward(&hidden_states, attention_mask)? 
+ } + Ok(hidden_states) + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipTextTransformer { + embeddings: ChineseClipTextEmbeddings, + encoder: ChineseClipTextEncoder, + pooler: Option, + pub device: Device, + span: tracing::Span, +} + +impl ChineseClipTextTransformer { + pub fn new(var: nn::VarBuilder, config: &ChineseClipTextConfig) -> Result { + let embeddings = ChineseClipTextEmbeddings::new(var.pp("embeddings"), config)?; + let encoder = ChineseClipTextEncoder::new(var.pp("encoder"), config)?; + // see: https://github.com/huggingface/transformers/blob/e40bb4845e0eefb52ec1e9cac9c2446ab36aef81/src/transformers/models/chinese_clip/modeling_chinese_clip.py#L1362 + // In the original Python version of the code, the pooler is not used, and there are no parameters for the pooler in the weight file. + let pooler = if var.contains_tensor("pooler") { + Some(ChineseClipTextPooler::new(var.pp("pooler"), config)?) + } else { + None + }; + Ok(Self { + embeddings, + encoder, + pooler, + device: var.device().clone(), + span: tracing::span!(tracing::Level::TRACE, "model"), + }) + } + + pub fn forward( + &self, + input_ids: &Tensor, + token_type_ids: Option<&Tensor>, + attention_mask: Option<&Tensor>, + ) -> Result { + let _enter = self.span.enter(); + let embedding_output = self.embeddings.forward(input_ids, token_type_ids)?; + let attention_mask = match attention_mask { + Some(attention_mask) => attention_mask.clone(), + None => input_ids.ones_like()?, + }; + // https://github.com/huggingface/transformers/blob/6eedfa6dd15dc1e22a55ae036f681914e5a0d9a1/src/transformers/models/bert/modeling_bert.py#L995 + let attention_mask = get_extended_attention_mask(&attention_mask, DType::F32)?; + let encoder_outputs = self.encoder.forward(&embedding_output, &attention_mask)?; + let encoder_output = encoder_outputs.i((.., 0, ..))?; + let pooled_output = match &self.pooler { + Some(pooler) => pooler.forward(&encoder_output)?, + None => encoder_output, + }; + + Ok(pooled_output) + } +} + +fn get_extended_attention_mask(attention_mask: &Tensor, dtype: DType) -> Result { + let attention_mask = match attention_mask.rank() { + 3 => attention_mask.unsqueeze(1)?, + 2 => attention_mask.unsqueeze(1)?.unsqueeze(1)?, + _ => candle::bail!("Wrong shape for input_ids or attention_mask"), + }; + let attention_mask = attention_mask.to_dtype(dtype)?; + // torch.finfo(dtype).min + (attention_mask.ones_like()? - &attention_mask)? + .broadcast_mul(&Tensor::try_from(f32::MIN)?.to_device(attention_mask.device())?) +} diff --git a/candle-transformers/src/models/chinese_clip/vision_model.rs b/candle-transformers/src/models/chinese_clip/vision_model.rs new file mode 100644 index 0000000000..2d345e0f4a --- /dev/null +++ b/candle-transformers/src/models/chinese_clip/vision_model.rs @@ -0,0 +1,385 @@ +//! Chinese contrastive Language-Image Pre-Training +//! +//! Chinese contrastive Language-Image Pre-Training (CLIP) is an architecture trained on +//! pairs of images with related texts. +//! +//! https://github.com/OFA-Sys/Chinese-CLIP +//! 
https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/chinese_clip/modeling_chinese_clip.py + +use candle::{DType, IndexOp, Module, Result, Shape, Tensor, D}; +use candle_nn as nn; + +use super::{Activation, EncoderConfig}; + +#[derive(Clone, Debug)] +pub struct ChineseClipVisionConfig { + pub hidden_size: usize, + pub intermediate_size: usize, + pub projection_dim: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + pub num_channels: usize, + pub image_size: usize, + pub patch_size: usize, + pub hidden_act: Activation, + pub layer_norm_eps: f64, + pub attention_dropout: f32, + pub initializer_range: f32, + pub initializer_factor: f32, +} + +impl Default for ChineseClipVisionConfig { + fn default() -> Self { + ChineseClipVisionConfig { + hidden_size: 768, + intermediate_size: 3072, + projection_dim: 512, + num_hidden_layers: 12, + num_attention_heads: 12, + num_channels: 3, + image_size: 224, + patch_size: 32, + hidden_act: Activation::QuickGelu, + layer_norm_eps: 1e-5, + attention_dropout: 0.0, + initializer_range: 0.02, + initializer_factor: 1.0, + } + } +} + +impl ChineseClipVisionConfig { + /// referer: https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16/blob/main/config.json + pub fn clip_vit_base_patch16() -> Self { + Self { + hidden_size: 768, + intermediate_size: 3072, + projection_dim: 512, + num_hidden_layers: 12, + num_attention_heads: 12, + num_channels: 3, + image_size: 224, + patch_size: 16, + hidden_act: Activation::QuickGelu, + layer_norm_eps: 1e-5, + attention_dropout: 0.0, + initializer_range: 0.02, + initializer_factor: 1.0, + } + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipVisionEmbeddings { + patch_embedding: nn::Conv2d, + position_ids: Tensor, + class_embedding: Tensor, + position_embedding: nn::Embedding, +} + +impl ChineseClipVisionEmbeddings { + pub fn new(var: nn::VarBuilder, config: &ChineseClipVisionConfig) -> Result { + let embed_dim = config.hidden_size; + // originally nn.Parameter + let class_embedding = if var.contains_tensor("class_embedding") { + var.get(embed_dim, "class_embedding")? + } else { + Tensor::randn(0f32, 1f32, embed_dim, var.device())? + }; + + let num_patches = (config.image_size / config.patch_size).pow(2); + let num_positions = num_patches + 1; + let position_ids = Tensor::arange(0, num_positions as i64, var.device())?; + + let conv2dconfig = nn::Conv2dConfig { + stride: config.patch_size, + ..Default::default() + }; + let position_embedding = + nn::embedding(num_positions, embed_dim, var.pp("position_embedding"))?; + let patch_embedding = nn::conv2d_no_bias( + config.num_channels, + embed_dim, + config.patch_size, + conv2dconfig, + var.pp("patch_embedding"), + )?; + Ok(Self { + patch_embedding, + position_ids, + class_embedding, + position_embedding, + }) + } +} + +impl Module for ChineseClipVisionEmbeddings { + fn forward(&self, xs: &Tensor) -> Result { + let batch_size = xs.shape().dims(); + let patch_embeds = self + .patch_embedding + .forward(xs)? + .flatten_from(2)? 
+ .transpose(1, 2)?; + let shape = Shape::from((batch_size[0], 1, self.class_embedding.dim(D::Minus1)?)); + let class_embeds = self.class_embedding.expand(shape)?; + let embeddings = Tensor::cat(&[class_embeds, patch_embeds], 1)?; + let position_embedding = self.position_embedding.forward(&self.position_ids)?; + embeddings.broadcast_add(&position_embedding) + } +} + +#[derive(Clone, Debug)] +struct ChineseClipVisionAttention { + k_proj: nn::Linear, + v_proj: nn::Linear, + q_proj: nn::Linear, + out_proj: nn::Linear, + head_dim: usize, + scale: f64, + num_attention_heads: usize, +} + +impl ChineseClipVisionAttention { + fn new(var: nn::VarBuilder, config: &EncoderConfig) -> Result { + let embed_dim = config.embed_dim(); + let num_attention_heads = config.num_attention_heads(); + let k_proj = nn::linear(embed_dim, embed_dim, var.pp("k_proj"))?; + let v_proj = nn::linear(embed_dim, embed_dim, var.pp("v_proj"))?; + let q_proj = nn::linear(embed_dim, embed_dim, var.pp("q_proj"))?; + let out_proj = nn::linear(embed_dim, embed_dim, var.pp("out_proj"))?; + let head_dim = embed_dim / num_attention_heads; + let scale = (head_dim as f64).powf(-0.5); + + Ok(ChineseClipVisionAttention { + k_proj, + v_proj, + q_proj, + out_proj, + head_dim, + scale, + num_attention_heads, + }) + } + + fn shape(&self, xs: &Tensor, seq_len: usize, bsz: usize) -> Result { + xs.reshape((bsz, seq_len, self.num_attention_heads, self.head_dim))? + .transpose(1, 2)? + .contiguous() + } + + fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let in_dtype = xs.dtype(); + let (bsz, seq_len, embed_dim) = xs.dims3()?; + + let proj_shape = (bsz * self.num_attention_heads, seq_len, self.head_dim); + let query_states = self + .shape(&(self.q_proj.forward(xs)? * self.scale)?, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + let key_states = self + .shape(&self.k_proj.forward(xs)?, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + let value_states = self + .shape(&self.v_proj.forward(xs)?, seq_len, bsz)? + .reshape(proj_shape)? + .to_dtype(DType::F32)?; + + let attn_weights = query_states.matmul(&key_states.transpose(1, 2)?)?; + + let src_len = key_states.dim(1)?; + + let attn_weights = if let Some(causal_attention_mask) = causal_attention_mask { + attn_weights + .reshape((bsz, self.num_attention_heads, seq_len, src_len))? + .broadcast_add(causal_attention_mask)? + .reshape((bsz * self.num_attention_heads, seq_len, src_len))? + } else { + attn_weights + }; + + let attn_weights = nn::ops::softmax(&attn_weights, D::Minus1)?; + + let attn_output = attn_weights.matmul(&value_states)?.to_dtype(in_dtype)?; + let attn_output = attn_output + .reshape((bsz, self.num_attention_heads, seq_len, self.head_dim))? + .transpose(1, 2)? + .reshape((bsz, seq_len, embed_dim))?; + self.out_proj.forward(&attn_output) + } +} + +#[derive(Clone, Debug)] +struct ChineseClipVisionMlp { + fc1: nn::Linear, + fc2: nn::Linear, + activation: Activation, +} + +impl ChineseClipVisionMlp { + fn new(var: nn::VarBuilder, config: &EncoderConfig) -> Result { + let fc1 = nn::linear( + config.embed_dim(), + config.intermediate_size(), + var.pp("fc1"), + )?; + let fc2 = nn::linear( + config.intermediate_size(), + config.embed_dim(), + var.pp("fc2"), + )?; + + Ok(ChineseClipVisionMlp { + fc1, + fc2, + activation: config.activation(), + }) + } +} + +impl ChineseClipVisionMlp { + fn forward(&self, xs: &Tensor) -> Result { + let xs = self.fc1.forward(xs)?; + self.fc2.forward(&self.activation.forward(&xs)?) 
+ } +} + +#[derive(Clone, Debug)] +struct ChineseClipVisionEncoderLayer { + self_attn: ChineseClipVisionAttention, + layer_norm1: nn::LayerNorm, + mlp: ChineseClipVisionMlp, + layer_norm2: nn::LayerNorm, +} + +impl ChineseClipVisionEncoderLayer { + fn new(var: nn::VarBuilder, config: &EncoderConfig) -> Result { + let self_attn = ChineseClipVisionAttention::new(var.pp("self_attn"), config)?; + let layer_norm1 = nn::layer_norm( + config.embed_dim(), + config.layer_norm_eps(), + var.pp("layer_norm1"), + )?; + let mlp = ChineseClipVisionMlp::new(var.pp("mlp"), config)?; + let layer_norm2 = nn::layer_norm( + config.embed_dim(), + config.layer_norm_eps(), + var.pp("layer_norm2"), + )?; + + Ok(ChineseClipVisionEncoderLayer { + self_attn, + layer_norm1, + mlp, + layer_norm2, + }) + } + + fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let residual = xs; + let xs = self.layer_norm1.forward(xs)?; + let xs = self.self_attn.forward(&xs, causal_attention_mask)?; + let xs = (xs + residual)?; + + let residual = &xs; + let xs = self.layer_norm2.forward(&xs)?; + let xs = self.mlp.forward(&xs)?; + xs + residual + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipVisionEncoder { + layers: Vec, +} + +impl ChineseClipVisionEncoder { + pub fn new(var: nn::VarBuilder, config: &EncoderConfig) -> Result { + let vs = var.pp("layers"); + let mut layers: Vec = Vec::new(); + for index in 0..config.num_hidden_layers() { + let layer = ChineseClipVisionEncoderLayer::new(vs.pp(index.to_string()), config)?; + layers.push(layer) + } + Ok(ChineseClipVisionEncoder { layers }) + } + + pub fn forward(&self, xs: &Tensor, causal_attention_mask: Option<&Tensor>) -> Result { + let mut xs = xs.clone(); + for layer in self.layers.iter() { + xs = layer.forward(&xs, causal_attention_mask)?; + } + Ok(xs) + } + + // required by LLaVA + pub fn output_hidden_states( + &self, + xs: &Tensor, + causal_attention_mask: Option<&Tensor>, + ) -> Result> { + let mut xs = xs.clone(); + let mut hidden_states = Vec::new(); + for layer in self.layers.iter() { + xs = layer.forward(&xs, causal_attention_mask)?; + hidden_states.push(xs.clone()); + } + Ok(hidden_states) + } +} + +#[derive(Clone, Debug)] +pub struct ChineseClipVisionTransformer { + embeddings: ChineseClipVisionEmbeddings, + encoder: ChineseClipVisionEncoder, + pre_layer_norm: nn::LayerNorm, + final_layer_norm: nn::LayerNorm, +} + +impl ChineseClipVisionTransformer { + pub fn new(var: nn::VarBuilder, config: &ChineseClipVisionConfig) -> Result { + let embed_dim = config.hidden_size; + let embeddings = ChineseClipVisionEmbeddings::new(var.pp("embeddings"), config)?; + let pre_layer_norm = + nn::layer_norm(embed_dim, config.layer_norm_eps, var.pp("pre_layrnorm"))?; + let encoder = ChineseClipVisionEncoder::new( + var.pp("encoder"), + &EncoderConfig::Vision(config.clone()), + )?; + let final_layer_norm = + nn::layer_norm(embed_dim, config.layer_norm_eps, var.pp("post_layernorm"))?; + Ok(Self { + embeddings, + encoder, + final_layer_norm, + pre_layer_norm, + }) + } + // required by LLaVA + pub fn output_hidden_states(&self, pixel_values: &Tensor) -> Result> { + let hidden_states = pixel_values + .apply(&self.embeddings)? 
+ .apply(&self.pre_layer_norm)?; + + let mut result = self.encoder.output_hidden_states(&hidden_states, None)?; + let encoder_outputs = result.last().unwrap(); + let pooled_output = encoder_outputs.i((.., 0, ..))?; + result.push(self.final_layer_norm.forward(&pooled_output)?.clone()); + Ok(result) + } +} + +impl Module for ChineseClipVisionTransformer { + fn forward(&self, pixel_values: &Tensor) -> Result { + let hidden_states = pixel_values + .apply(&self.embeddings)? + .apply(&self.pre_layer_norm)?; + + let encoder_outputs = self.encoder.forward(&hidden_states, None)?; + + // referer: https://github.com/huggingface/transformers/blob/f6fa0f0bf0796ac66f201f23bdb8585de1609add/src/transformers/models/clip/modeling_clip.py#L787 + let pooled_output = encoder_outputs.i((.., 0, ..))?; + self.final_layer_norm.forward(&pooled_output) + } +} diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 80cd4f810c..6ed7a8b580 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -5,6 +5,7 @@ pub mod bigcode; pub mod blip; pub mod blip_text; pub mod chatglm; +pub mod chinese_clip; pub mod clip; pub mod codegeex4_9b; pub mod colpali; From ca7cf5cb3bb38d1b735e1db0efdac7eea1a9d43e Mon Sep 17 00:00:00 2001 From: Czxck001 <10724409+Czxck001@users.noreply.github.com> Date: Sun, 13 Oct 2024 13:08:40 -0700 Subject: [PATCH 15/28] Add Stable Diffusion 3 Example (#2558) * Add stable diffusion 3 example Add get_qkv_linear to handle different dimensionality in linears Add stable diffusion 3 example Add use_quant_conv and use_post_quant_conv for vae in stable diffusion adapt existing AutoEncoderKLConfig to the change add forward_until_encoder_layer to ClipTextTransformer rename sd3 config to sd3_medium in mmdit; minor clean-up Enable flash-attn for mmdit impl when the feature is enabled. Add sd3 example codebase add document crediting references pass the cargo fmt test pass the clippy test * fix typos * expose cfg_scale and time_shift as options * Replace the sample image with JPG version. Change image output format accordingly. * make meaningful error messages * remove the tail-end assignment in sd3_vae_vb_rename * remove the CUDA requirement * use default_value in clap args * add use_flash_attn to turn on/off flash-attn for MMDiT at runtime * resolve clippy errors and warnings * use default_value_t * Pin the web-sys dependency. * Clippy fix. 
--------- Co-authored-by: Laurent --- candle-examples/Cargo.toml | 3 + .../examples/stable-diffusion-3/README.md | 54 +++++ .../assets/stable-diffusion-3.jpg | Bin 0 -> 83401 bytes .../examples/stable-diffusion-3/clip.rs | 201 ++++++++++++++++++ .../examples/stable-diffusion-3/main.rs | 185 ++++++++++++++++ .../examples/stable-diffusion-3/sampling.rs | 55 +++++ .../examples/stable-diffusion-3/vae.rs | 93 ++++++++ .../src/models/mmdit/blocks.rs | 54 ++++- candle-transformers/src/models/mmdit/model.rs | 8 +- .../src/models/mmdit/projections.rs | 1 - .../src/models/stable_diffusion/attention.rs | 26 ++- .../src/models/stable_diffusion/clip.rs | 31 +++ .../src/models/stable_diffusion/mod.rs | 10 + .../src/models/stable_diffusion/vae.rs | 61 ++++-- candle-wasm-examples/yolo/Cargo.toml | 2 +- candle-wasm-tests/tests/quantized_tests.rs | 1 + 16 files changed, 751 insertions(+), 34 deletions(-) create mode 100644 candle-examples/examples/stable-diffusion-3/README.md create mode 100644 candle-examples/examples/stable-diffusion-3/assets/stable-diffusion-3.jpg create mode 100644 candle-examples/examples/stable-diffusion-3/clip.rs create mode 100644 candle-examples/examples/stable-diffusion-3/main.rs create mode 100644 candle-examples/examples/stable-diffusion-3/sampling.rs create mode 100644 candle-examples/examples/stable-diffusion-3/vae.rs diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index 0c1219d760..d3e23b922c 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -122,3 +122,6 @@ required-features = ["onnx"] [[example]] name = "colpali" required-features = ["pdf2image"] + +[[example]] +name = "stable-diffusion-3" \ No newline at end of file diff --git a/candle-examples/examples/stable-diffusion-3/README.md b/candle-examples/examples/stable-diffusion-3/README.md new file mode 100644 index 0000000000..746a31fa1b --- /dev/null +++ b/candle-examples/examples/stable-diffusion-3/README.md @@ -0,0 +1,54 @@ +# candle-stable-diffusion-3: Candle Implementation of Stable Diffusion 3 Medium + +![](assets/stable-diffusion-3.jpg) + +*A cute rusty robot holding a candle torch in its hand, with glowing neon text \"LETS GO RUSTY\" displayed on its chest, bright background, high quality, 4k* + +Stable Diffusion 3 Medium is a text-to-image model based on Multimodal Diffusion Transformer (MMDiT) architecture. + +- [huggingface repo](https://huggingface.co/stabilityai/stable-diffusion-3-medium) +- [research paper](https://arxiv.org/pdf/2403.03206) +- [announcement blog post](https://stability.ai/news/stable-diffusion-3-medium) + +## Getting access to the weights + +The weights of Stable Diffusion 3 Medium is released by Stability AI under the Stability Community License. You will need to accept the conditions and acquire a license by visiting the [repo on HuggingFace Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium) to gain access to the weights for your HuggingFace account. + +On the first run, the weights will be automatically downloaded from the Huggingface Hub. You might be prompted to configure a [Huggingface User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens) (recommended) on your computer if you haven't done that before. After the download, the weights will be [cached](https://huggingface.co/docs/datasets/en/cache) and remain accessible locally. 
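+
+As a minimal sketch of that setup (assuming the `huggingface_hub` Python package, which provides the `huggingface-cli` tool, is installed), the token can be configured once from a terminal:
+
+```shell
+# Install the CLI and log in; the token is written to the local Hugging Face cache.
+pip install -U "huggingface_hub[cli]"
+huggingface-cli login
+```
+
+The `hf_hub` crate used by the example should then find the cached token automatically on the next run.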
+
+## Running the model
+
+```shell
+cargo run --example stable-diffusion-3 --release --features=cuda -- \
+  --height 1024 --width 1024 \
+  --prompt 'A cute rusty robot holding a candle torch in its hand, with glowing neon text \"LETS GO RUSTY\" displayed on its chest, bright background, high quality, 4k'
+```
+
+To display the other available options, run:
+
+```shell
+cargo run --example stable-diffusion-3 --release --features=cuda -- --help
+```
+
+If your GPU supports it, Flash-Attention is strongly recommended as it can greatly improve inference speed, since MMDiT is a transformer model that relies heavily on attention. To utilize [candle-flash-attn](https://github.com/huggingface/candle/tree/main/candle-flash-attn) in the demo, you will need both `--features flash-attn` and `--use-flash-attn`.
+
+```shell
+cargo run --example stable-diffusion-3 --release --features=cuda,flash-attn -- --use-flash-attn ...
+```
+
+## Performance Benchmark
+
+The benchmark below generates a 1024-by-1024 image with 28 steps of Euler sampling and measures the average speed (iterations per second).
+
+[candle](https://github.com/huggingface/candle) and [candle-flash-attn](https://github.com/huggingface/candle/tree/main/candle-flash-attn) are based on commit [0d96ec3](https://github.com/huggingface/candle/commit/0d96ec31e8be03f844ed0aed636d6217dee9c7bc).
+
+System specs (Desktop PCIe 5 x8/x8 dual-GPU setup):
+
+- Operating System: Ubuntu 23.10
+- CPU: i9 12900K w/o overclocking.
+- RAM: 64G dual-channel DDR5 @ 4800 MT/s
+
+| Speed (iter/s) | w/o flash-attn | w/ flash-attn |
+| -------------- | -------------- | ------------- |
+| RTX 3090 Ti    | 0.83           | 2.15          |
+| RTX 4090       | 1.72           | 4.06          |
diff --git a/candle-examples/examples/stable-diffusion-3/assets/stable-diffusion-3.jpg b/candle-examples/examples/stable-diffusion-3/assets/stable-diffusion-3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..58ca16c3bf3083f2aa483374af7cc54ccdcb4e9f
GIT binary patch
literal 83401
[binary JPEG image data omitted]
z^fi-d8?VBbzaGn7o1rDGRkii>dVx4UiN3R{?D4DSdIuxB5b->O$v7h5BRk$gC!1%0 zyq%lY?)v9|^?5((xWuNSJ_l4Q-F7Jovv?VENREbz>{~%AHIS*?9iehCzWyg&2l*aU zwm?07P_N8^dIe0Yks350r-0HqAX`%ey?JDR6TY zty7R-4ctfrpjHQN%gBlw*lXU4+)RihPN$Pp75tD8U?{0kOPb+y-}4x*^5dDrS8v2- zWYh7yYD#DfJWnm8VIWo|wqafY(D_TQ4%kKIV7YqfgZy@Sa=COao@a7Q%-g+1;b zSIE&bC1mu#Nv`bNsaYhL`OqdS;jKwDNKRq(=6#KOdCc%1^a^!O98va}bV8ckkBQ$k z@VCXy3Nz{KhEG7rAzrSxd>!#$KhN9mo~r}{pzA{(Lh;N^|JvH8Zu5j|+8#r`=} zW(k!MZb!c{3)^ZvrrOtTU)uWP45XFLjlK<=lpm?W>d|jd6(ftrv|pKog`JgUivAw_ z^Lq07Qw5jKwKfmwh2770YNGNhnEsSfV_|II75YVV(PiEb@Rd`O?aSNgU#@gUxg1u# z-ZkG`hkX+8Ny!aFtu_tFdI!F^AQ1nMzGX`yTw*84h{$a&d{LznB(BL`^WZW* zet~+FEPA4jr#qikDfYazx7GM%iSphTlO-gl7}>LZ81pROv96=zkqeHOp=sd_>1QRk zGt(U>cV#oL?I&)ml>O;_@k@q2OEZ*+kVhjm9d%VrZIa{TyuSCWetEj3mn`EX6$o~_ zm7h5Z>OU1wV*lx09)kE)ef*f?hFK_=9wc_)F=Al_Of{&Vm5Ggm3ULS_@iMf36!3o@ zGO+&q|9Z&eqj^A+7}*vEU1VTW3)yyqln^SOpe*n`Ndi2M z)S+hbzo7^{XmMfQbrRg1dYq%}>6Sv$Dw$aF)koyMGIkin>8#{J%opj?F}#{u{N5*j zVQd;`uS7bve8)w)6z`Wf?%#a(JyLCiH*>syQ;yos&Q+zhl+ys{3bLxW;sycmh~eL5 zMWz}R)|l)0x-IwESclpKqLw~C!!;|eRiT_pRGCR{ii{leHgF?fKamy0N@=8R9=K`6 zE9kprO29Zjf z$2yZ{G{wM_pEeT{OK7>SM9|+8y4$(iDjYfYZmSceZRVM1|EHOjo_c#j`UZWUr{b_A z!6EXb*hBW2-gAkuK>0MvcirrDr_+w8%p37#Ry8$_X^%SJIXd8HXRC`x!&2C(&W>i= z>?fk1?&kBqeeU`>f~VYWpH(-dyUxurpnM|S2$T+!hxf;N^Y}5-z@`WUN$M{yOI=v> z7<+dTX)bsv`Ihmm#`Aslw899IPk(HFE%-pR(1F=&L*1nXN`#?2MGtzHlXTPy?Y*u_ z*yBk9M8_d@XX&@Y3<#b0{mf3_9Fo@JVy(AGcE50D}VBc#+Aa{zv90!HvR=sR7IOsv@gBlIV<$%fwBudb}5(vllkS`qYs|J&d zSWF+~Z$T1lf!t=NwRt0-}|?Ql)s7zH+c`bi7R#tY|nE^eqpkYff_= z+{=2R#nOqJ`G!pDL85jg2L7JSk@?T8pu!l#RW_4+S*kD+w+~XOHh*D$H~tDq($RF( zC>ncTv3_pr)R1FzdLTSZUVcoNpr+3$TT z%l)1Ss=|#ySlBN6JIq%SV~^ys$!|6hM~kFTU~TCe)T|W_;_Vz{o{?apgIfX>s;Z+U(YOC`iDwRiA@DKm>17)z~vfCeue`kWf>AV_^Kx>byOtJHisa?s;CwDpiU1CJkio1X zXXjwVfYeTaB`IiB2Yw4saF;+Pt!OZVY5`D714Bc9pqfemS;gUl~u5+Dx!&~ zmDeTn>wLdI(8Er?QjpzI6?Yu!=XDQ$nHc!GP%k?zWy}0#M!ROIMoygm_K)O-FtMjI zvd3FXZI51<_aT2Mk&)GIP=?9Sa9BaY`(^58yWoduS z0mjPO@{V>F!*Yo$QT2m*Wj(FPd|M@kQnTkc&wv}TM2c3MLgMPGV?uI&ZU3x>CIL#T zV?BfAz(*3Rmi6Vf5kK z20po=3J;!tQR^8~X(ZWa%QWFC1s1HAVr8jg?GmB~ul?Q)cc z)YUuIwUuDdN>h}6W>m3#>UXgx zZlcF7>b_4=C!Vlp6-KGMM0`w z7eX=NCU4C5DUn*bS>Rij0zd>b_da%?CHyVm)6L@XeQB#G>s>(z+0&}$@W7_Y=*e$R z-y0$mL&Cf4B*VI7#yPo`xxQxK=cY&vZ1)|I7mTSq6E4%#rf0qzwX=Q0(4Cp_1m^*f zqt#LNz^JLh*t<`u+lba}+wJTachl&eJS@i3DUt)j#E+8b%5#ogv)SSe4v7x5^H2CK zI!s?ZuKU3+Az3z&;J*^o3%qqe@V;MsNU--O&3wMzZn~#E*<4gy*)c2+^Vpeo<;*|WSWr#z3k zPjA^H+dHb50%a+)Lr99BkAc~(u{D(r279!oVa$`8Tz?yOj7>KFh3-5YOMbYK6gOpb zjh3h3RE34ch6WHXg8K>SG!ES%@+#oD0#6=tFiwxd5Cyy`0fAW#jPqc{m~FY$VEF~g zY5p~Q;ZPR~JXv(eCko8WL0t{Zi;F)hKhcML`1gO?=P)BPP77b{|_)a~V~ zT!bkS-`J8_9*1D+@2k19ZtN07f!T0AI3Moz7oy0=aa5l@`1lz!EM6Rc~5HY&c9DTFw5u8Ki zjgJ`iY~xpMm=|lY-C`DVb)>S|TKl;Y&%)#zuVZxAiO|xr7LkVC*S6m3-b=M~UX=`r z=72;h)GChhq(&CyfH7LIOJsGce{LuKOE=$7mvWaI0=G8;_Jzc{{Y70ziejL}@fg z$Kj{TdAsxYg0P1bt7SEhdA=>RWM!EO-Bm02bGQ+g5zv%mPDD>Km#o}Fi}S@_)ZtAR z(?^9JV&(4?>s2EIIx@&EeKHZM=^EhO$A+#v`hSUT|u7<+u zly?Qf`4smh5838-E^Rd_2Yp3tD%S?eap$lkoPgK|+<=Z>18*2ko)Bujp$yxh2D)yZ%Om5exlJ*IKB zZF5t)`B-L)5LX&=661UGcj-Q5_LuE8%?AZ_VWwv+oRj?bn&K+r-`5}e5b&fk6rW3M z)x-S69;@FLkHy`}&|wTw@~li(6-#<+2lu8nOXDn#=VzlzTO<-1R^=R3jun?3bWKZq zC4O?N{FRp`%{rs5?Ek~kSw}Vfuu&WZ3?w9_5$Ti|NFyNKH5!4Dqq_x^?(XiZuVj9aAXP_z(V#*O{ez_6l0S@%pvT;W9OgfQ~o$`_w>dZ-&fCM_*3{71BA8-%oNRv1w6l*-S)a{L60VmnbTvm^ zx-eBOmW# zfiwP^hWpCQ^3L&`4tVcXiMO^J5pkuI&SeTW34JX{WeJH|DNfw#H8Rm>jAq~P3pXte z3#F%5;S|MvGLsykNAYL*>BoOa5^mW6b9jiih*$Wy8`vDw?O*CVAr z2QxedA6hEUDpJkT!aOKst~bqoy5>Ekgl=~os2u=1V&o$si1GbDqz%0Y&t9A_d*{D= zJAmYS)}N9TtrY?i37B`N85rn#ia+`43N;#MUW`bo#@t&%-GZ&ge`>rhFI5$=Rr)>! 
zYBSY*m|AyRv~Y5~JE%~R+bWXha?KN*T$|p4U-{#BhUiOw_%O6=IoeHK=+Q*fD4of^ z?t=rhEnDiDjXlI+d)2h+s24gW(|3j^SmKaoR(!{Jf>E&I?{t*_-sh%6*{~BlNWv%h z#m9e)PlbP&lIOi9C^4wwtt!zvyoi@1iYpH@w0f9S3H;b9tv@>3q zY9qm^PDHEA#z{rOiyoovUI9im7}NAJ=~ZAkctf{7q8H;GU}LAMg;bnN-3Y@Q9rpC` zq8D)@>LDup8sy}R&d&VywN6zB@f!mC*={gwu2+`vB5X@2v1WEc)3e&;6ajgoIbwLT zg4{OdVD0?O_led9$jnN2{!YB-Wd{o6a7NQ)ZFPNymc_g`>+SXK7LFDswwyiTX=xTc z@`OC%@cyJ^1%;A;^Xl5;R`up7Ck9A!bf2MjN%!V@E&L|~ZN%wKr1Z~o0nhbO{0f4uVT z50osMVmhk(*q-C@J`JTK<$k%IT|HMKr9tX@GO7SI)@EK5vRT((tcDvZn_FV^x;0;E zFGStg-ZR>Tt(X}7YKpYf>Z%8_KME!P&U#Y~SD5+08qGSE+_dMRj)E@?`t1}!-Swu2 zIG|+YP~Aq*b`AG$D{FnWNgz^jJbb$aHX%vlF|jLOJZzANN=|J`h0Kn1Qm#ixT9Utc zKk**Z?cFb(fGzi<*GCl2&&GtBOCpK6TTSup>+^&(OEVgh@-4nl_?7)G;*jh9W30&j z^Zfj2&OpM4oHELgih8g_@mH|nR>8QUG#afB5{BXo$=nRji(Ur?tJ$&!`MQ%#k{cE- zeGz7AX455$#6!h|_ketJDH{9cl*xRCr^Miv=xlE3XT~^=pd{6Kx!NSJnRu_0qsQSS z+&V#dcH7GAX>H~4((X?UuU4s%8IP4Be{hU9Hsuy9B%; z3&mFhv=4F?%^3C+eA_+97d^m}8=$sN4G$}vrX9~-twb?uk0fXa>QS@);bx-zP6apz z$$SJ(GyrKAAt0wm97L6XEd%i9v3n#&h`%U9K6d&t&Y0#r`u2cv?f>->xQ-$YqJZY> zF`DK-?f8GD>PK)OFunm?Xi7|cQ36CPtq7nYM>8@JRM|PaEVBLO=%+c8#jKD{Lt()g zIF*m1hxC2J?~lI6bmn4>#7GLdhu0UwPnKUfhAvEjI z?i*}$H*9%A=umtq9;|HLH*b;C;cXh<=TlB#nZpsA1lGQ_9z|Nb{<7!O5{~zf8$dVL zKN)l;{aV&C59*ierX$x%mXu&y$ioE=Kc7=jFTS1K7EfbxJ10bw7FzB76Eo4^uZG>L zonOsyrJS%X$Ys067XU-w7qV`HJFdSi_(S#MiN4ye+LT@$-L|^Lse6%K;Mvv+!hT9) zeGmEY5<(a$5-liLoVw4ba?4zdN$73Ksq(w`rm{03f{%boxRpXgX;6otW0L~l@Hdcu z&uq%Ui1j<2&ZDI=<`#ri=cFIM%;(44H~%;j*bw<F7!99-IuCReZuC3M)^~zZ{}nNvElJh zy^N+NnSxj1+j@w)b6piGSX&rsR3i*9UV@9M>Q7l5ZYZ>%< zwq8&dtsVmt7aZ4X3pyZm)%oz68|fKSMOvOsP>izC7M*9NdYR@vLFzX$QKEMVBZ?_g z)BVoklNOTkS5X1I5A?vI{g=l^_j_-$ITBA^a*TBwFA)Hrrl<3FRyU?!{5HsQ)fRcU z)SRV6`#DcNEe3Wg>2A)}NEo^M( zqrE`9Yp7-6xEG-^Eg-+<5X~dxWEi~@K8woh$=GO?6I^tWE|_b2Ekx$gmnJAm)={$_}RMm!f(SA#viP-Y1W z-2N`WUcqbQVK>(hG+Qpl>_+%UWgJo+%Ybe-DyL5S&Uo)yO{yvG6Spj`D>@S|Gy>B6 zJfr{zZvOo45@ZN5g@?L^^4{xzY~)_DquM#E7_gLZpI*)nT?l(w5CI`Xa_W_vDekn{ zL_tv(KbRV02_tBU>tftAd9K}bL_7SXB;~Vxbji0#u9}1fcJol#z$r5%69{<`NG2E0 zVtLo~hHlZ8#i%Mc*;)goC*9@>GSh4c(blczx0|=3Vs+2(*Jz;#giu&ykXw1rwL|_P z9aQj4#TUR9Rfx6BHR9G4qGLBLPK8OId!g;CSdi8fU}qiIGE;Bd^1bD=XB*Zo8OIDY z9$X)OdS_&=|JpHQu-S)QTcTj7K)gKQ7Nq*gc+w^jcmXz%yakV^jPgnEj(56s_k=_Q z5R3HndQ3Rml5oZ*;+K5yTVHsq%GEupj!$L>)$G>CpiEpxtatrp%>hk9xSzhWJIF&# znY*Z%JBv4z(r+qCn?dv@LAxZhmZo8rYD(QGL$qmC_T{ z%dgGC`#d`Ggk5H1zD%(}#+AOZ-ILggeCz|g*XhP}JS7tuA1o*nB#ekrfNP)_Ldpf4 zEijO1-gN`qHXtlFyA%kq0Aj=dnK58eL{cvse!PJ{_PGCPyyO6@4=}F({{hhSA{t)A z;`cv`55QYd1`MtLVE_Q&1)y6DVCT#@LD~7uMDC_Gx}!@7vxLkI<{jyFO=jPUFD0SA zvKD6aOHZX2y-77P8{4z0gPBCjvTM$rMSu8k&~eAqfcK|)Yo@ygt#9RI&VF$DwySE&7(Y7x|SV@h;7Q9 zi+)gmth*?8#T`0JwK>O110xhw5*DZ%tu6;#n)?(s7HVT##}?A;qrOAi!yGaxEVjc- znQYYmG`89737fdsm#GO=4}?EuiawW(J$t?;rj5Bd?o*<=&qQmBvp$u|612%s$RTbB z&?3i{j`{^*OE|dYl)fnEImmPKV?8#es{fFLCjKGqf(q#m?S;F4sp_%MttW1Dlzw90 zXACy(Uw~m#X~=%(cC{aBQ})BJ|6uB%5yT;QD}CLT-xe86Oms26qs_s~i?c_hg8B#CHHZrFmu=?#aS4rtei{ zfw{d%PKa5MMH4&C8=cvsJyTVSi0a@9E$}Y6ydjA990UOoKh+MpgWqbE@~w?)1hdroJmnO+b#aw0{~bUzmNWB@>+6 zcl#A4peP(%PuG&tw3|1kAwESli2FjqF-hQxw|I+jzF47WBGml+2g;pPmRPX%5|+^H zZT{&G8Q4UCdPR6AKGbj}bocikq4yRsi`9({G5NhD5fD=HmRfgH4(NDfQ37;@t!>KL z@ylrz8@I;u0%rz#R`u%6U4=Zy%$;0wVZygMYGJAYsp5nyLFJC47)1Q=xx<2Rn7qTV z?(m5%yy1Io?4(?9X9e2?Za?0dDCok9(TEXzrBWkcdbw5@lbjwmEVt!iE0e`YFK9Fp zFKWTjlNuXMcLTqf8Z=sKu1aSRJc!iE4Ckf|&+5r>wOPK%;jiEsB}IxScasU z2{QsS#6aC)nUf)Gc{xLTq~g-VqRGb;R5=@V;e% zxHzdwdI>)ne$qFHL9Fg5Rp2=x%#k~iqPda6A2hXQWNP>)HLc;pi|(3CG=o5?wdR-M zWGUrqGAg|L>i!>$71z97<>U4CzI@pdc7-qq7h*Uu;z}}~;C)tU3@NLZR}{U~yANBG zw{hj?>G9#4oEwbaxLKwjO?WC9C3RX>j%bhT&uN=#t;BJ();WC;e8U*`H$A6r4Ew1F~j}T|7M08m&R_&YJOkuXaw> 
z(m+>BQCLOXC?XOGP&OiLcK`(g;0yxda>RKVh*K$9uPydUIR{r?d z8q+`N^kV~L8~@!k5L8s)$_#K_5Z)uel$RSFP=YY;KHfBdG)II4_fu`w%;Adc^_=&t zf?|OKIDd;NvjVdvg8vP(HKZ||8H!YqYNxDCs=m)#t3vfvur!C1>y=a&a5$CQj+e#m zdu^}o_#C_E%Y_DA^w;$bD}`m>0=jI`H?abPs?kd;%?D>1AzF=8hsV)d`7sprrnW|0 zrlHF3NI9a#U(PSlVVXe3&cm+jGeCd+TsMrv?k%SaXjBM-OuAw|e2^tjXahZ+6jCX8K=0`!j z)5ebTO)bRmN(9tz+Tc4s#eRVD)H4b>wa&C(O+C6_9a<(v zD;JtCMklT1nOXs%L+Snddmq|7FEspW=R-Nux6n7ER?eWm0eup>aWnaKW%Lg8O{v71 zaUc7!`}WjQI5fyjt(=R7`Zqtxb*Fw4H~c>N3QNOe*PI5<-eX1u1O7!xD!f^6k0AP` zrYop|G$h=e@B)-(Re%?IZ5!WCyo%J=^}rSbQX747IgU2XxZWGZWR=D=Ohx@YoGzzAyCq_w zSh#RfR6%uB3E48Dv#fQPtM87bE^7Q-By%8}x_btdSFIZlDO*glti>k$bFG1wXX;U% zB!J5~nVrue!MvXrxA3)*@0P^5dNLM_)WAc20areg=`9ea@9t{P8cpj{aZK;~HPPsA zRnH5RZ#1^tZ3vBw9P}01rMiHnM_7pVD!Y~f%UL+#t9FSF_2-S{a+E`s;M~w=Sgszl zQOd?Jl#sK-te@vb119!tp;Z9g)K(^KYu`qKYi?fm7m?kU{Vks-`VoALbffEzXwG%5 z;#k<%rRw7>cx5(PkuL@O{;H;K&GZrw$5-QSIgYK>leu=2Gx#ntI}k1gc67iU6>MZ7 zGhM62Yda%bE{;WAjygYX#NPJma$AlRFDd5aPEC(nSyUy5+Njts&GHvJmq6jP6Z+Fv z=xo13y*6(->T3F8Y7zw^15>AHa+$cM!k)(~4yG32L52oXE&BWSW(sA+v~3phV3hBj zdZT<8Im2ghkD7eo z(g_^#0ktDgDe(LslgNNU8Yg;SI;HCW2HgKkx&R5|W4d1eu(EvQkpbNs1K>qLm`xC^ zoG)NmK@dnEO}hrb#Err(shw{|YqB)Dv-F!Z-^*KGc51(fTyr^=IgGI1fL=v;De0yB zZeT+Zj+u?-t(q(@o;7;A<5Ktm&NC8aA7c0hg;)u%u1AEo(WV3YHCk2$;5BS`UX5FtMNXLiEg?`{&5g>Q z`+Rk(I`s=lyQQEudseJib%2xMKF>gJR`R$y!-u)$6YH!&jQ#N6{1o`pp@HeWX$jLS zX*|u2By575aQE;V)Do7Cf*f5ZxY&nCcvH`7k167B0ICQh0t1;0Dnt5>MJDJ3tOZQuRnMw{3x`8Kd}KDp%tyKLDfFi8GuSqgS=uuGy za;Z|CMtZ1hF(t=XZ5jgjRrSMp)*7*PINg=Jki>gy>qb{?du)0+F- z`d(=hYRI-Iy&X0vt5QEC%c8G*_xBw=w z$-V)fww1~km%U%9zpzTH2SX=5(2jo>bZ7}(TB3dJ7^lC5&aZIzSI$O=6+_Z~;k=>l$A=rpcY7!juqk8z)DA&e)p-p*5$Hvp|dmrc==ZjEZoaW}NqKOkQ zOB@vcyeMD^D{B5xrvF#Dml!L}cJTKfP0~8h0HcCLMF;` z%8@5rNen;OsX1lJcsEw!#tLBggdiGX<#AehlK?cP#!&csa&P%O7zX{@vHs3@Kx_JHG8CjQjLYT6ZE zK?~hdw9;>C$aFh|2O0z>Tn^<6R(>K>G&kx;CYDsJYD>Q(vp3Z)P^&iMJXRw;rSn)3LVr7Oj{hkSILe|%7rI&XB)%vhH`n|tqwbwxV{BKM zy6o+OGQ%C9wQk2OzoJ;0X1U-=bL&|z9 zjH~+(Nh@XfK0^xTdOX;5#hxIgPkSg*qS@?BtK~#Io+-aUfu!r^vf+9&XeLbU`z5et zZ-fgRl%cMO!6XaGzy7>>@M@R!{d*%EeCv|J+HUy?X(~Btf_I8(#wK?v$K{0y#o}D~ zo>U3$$_MH)`hb~1o4m7;aXwk-$?+ZEkC3-t?URFsx~5c3dNif>;RHC}QI7kZECb5pb>z!#V_XcuDYev&PINgw=ud7TFXo-*5RL*+#GuL@z7FRv&d6wr!r|?KTRNY zwo@hQHN0pECc%NqTd`w8ZtN~Jqo6+u`|Z+>bbQD5j)cqTIkTenSthNkVACqY;tGur z^6{n5M)wz|_FAPMhc{xMJIKkMs184V0ePDmxqCB_d((ZZ8y4=Z(iS3K?lZ`LNLVV| zlJ2*0tBInen`_LOzb@)%aVg4|zSMCFb(>|Vg9uIc%f6l}r<&?E*ccqf7+-bRt)W?K zq<1}9jPf|L)5&zQ7uq|@#J%<|nA?r0IlI6n4$xf0P!b5Rvch`#R&j^VHv zNbU>oc8ru6$-V@CB@`*TZF-DIp zf8~&CW|4%)!IVQ_iY*VF(`lQd$sJh(CHZZe0YL$<5lj0keHvresC>(x>xJspIe5o! 
zG}?=pd>pMN>R=3iBV~-580(ftU9pJZ^tHT45IRbE2Wif+Fws|VQpldF(cc(T5`W{- z6SbPjW@Dj`SAVqX9nGtoCAAZATfp|_0C)lm~07$T0C^t$?%P-^@0qw zNZc?GHVa>xt5Q#0d*d>FWQo=Sb)MiJPMW84$a_gksF9aAvX{x~Or})W#@W#XI9?vH zOEQ2C1|joD`Y!}C2TL2_Z3nh%Ph&I?JsT5P0>Gmn{9li4)_;8%f@6b#Ng(dR2$L|- zivfpYKyWFFFry&E!3gyX@FmGW9!o5J;ipF4Vg1cv7iGnPEcyy&afRSZJ10Ddi)p3~ ze$V{g!XnOlDnb}BjgnNpUWi{`ul6$mZdCq&U|mzi;u5*U(8h6&L9F3s>1AI&^i zF{MU7cq&58@LhAx8#~vc(CGgmS_#TH&pkUj-*2gpgEy%dMOG)f4F%KPV|YkM)QkE-R@W6H{C_Z)seNA;YvKYo zH4C08Nd+@`nQyi^LC4c59-sEvy9-;`{~_VIY6F#pB9;Z=8 z%fzx9jReOdQe&M7`?qvZM)>wve<(iMo zn2h9@SZc$G5Pb?9$r7F69qrx^^7|zCJ2Kn}Q7#Dz<_zTSm3In!dPA2di%6T2ds`m#Zo<-`g6?maV z&B!KWPk!iD(Hma!%y+qYa(^x19%)eRjP4@`t&mEI!m#+0<8j*+Zv~;xZMxwXkm@G@ zR{9h0d77miLfqwO%I2WhWJNwXHlY zj*&8n87VRq9CkJM5ghfN%csdqC-sJEbyLjV7JtZ;SBZl>duo7DW@sWqT4aUBlZ*p{ zPa}4&C6LfW;F~hB46wdp=O~Gd$%47Lyei}n{bb`VV6Uu-nQt33M58F-c&sIqRZ_>{ zeCSDhW~pN5A{6v8Sp5wLqmIJ(N}tF<`V5A-GsAppJCs@XEI4XN(NXg>gV7?lynQk3 zXIudD^F2MaH{b2nbvCWZ1gusZo(hK{@97T_oz`<((U_;{rCL;h>8Fu|3ggR%9aakt z@gP$pNvfJA2_~O138>AQ@cm2k4@a6-3+O8}OqF66I<5N2w?_Fb0pu!Qo+36GnubzG z*QQGN^Dh_vOCDruYbs1DoYVpF=l4uZc}83=Ax#B_Fc4Hnj#dGSs-8B&y}uz^49vkD z24X2P%GCX~^{(23)2-baeOOkOJutz;8SbQ-F&(KZE~Md5A0GO~;y{C1oZE>&>`!L8 zy>7k24V}#7eeT3l$)PvTT5Gl}Gld+P=KN=i-JU8v0ajOI0E#$z45*cu>yK)5ME@Cz z5Wgc3AizwE7YHtk^aTzM-H2l^;1K+$F@@(DU|_FzM$jUl3&h}ezCxS9XR9*HZ88gQr;6| z!IWE%?pGrnLkUIZA!7*FS~tQgzAa3}R=C@i;!e-VfUY&;3*i(@=^O9|3Du9&FOw=K zq^RYQ6WPu6cw1uXkgRNq5NSGwUtE?1ZYqj9j-Q9omzBN#Bu4Ng9sveYkH};YT6b7S3H9XhNl!&kq4GbZb6jPPRR@fR=KpG?Vf9Z{2lJbmzD|j@ zpkMfta%+{<=apSg{vqL&__SN55uH$3m9E7!S6tgn>Yok|^04oaSn~0NsLah0`35o+ z(N#YSTO<1*mON@hRzO9Pd|sr3&lftRY1NYPe5mo=Njr=@W9{#BvU?mxD5v_j!Y!5i z@H6cJkS<^ChqXMLaZ_6@OI@Gta1K36D%|xQ28-+I)pi02vpZvhJLb>Vu- zv@5<9S-yHmvZ;`iAf*_oD$;klG_#?4F7uUIH zr%Wzr)!t6=D~GD*yIApO{e7H4k3-WD%%hd*iNnM|P!i|&jClakt;O#axV$55Bx?ozA?5BtW8{V>8r{NX4SIaETY)=2 zDMRS!7W`3NC6)5>ki6W^K0E zPoi6n3g#;F8^-Z|mH{)RcF%Brqtrcx=H$_Wp*>3)r|?EPm&smO#T?lDjLBewX-*?q zpJroq$Ihp-ZhJ>=d-GBylw(VDjPu#j^8))RZSn#Js2E#ymjk*> zg}ZnL8>_BFoR+)smt#9rIo6Z{S!vDnB5+;1)5C!Vbeh~Mi8(S!F2zRqqd?@eKWsTe zsW^`$107-oZBN)Ygq>3DRA1cZA2L+L`2r$7q-=A5B#i{vG6ATsk`ntPrU>DRq(t;@ z@u6r)4Cv)r8Be4h6)68rzmF9LunPPaGzM58A5pESYyini^)Y$uKh<;eqptu#^~#Q; z)6KGCcrEO0x{uG~kv|1<4!_jpjiqQ!S1)m( znGW5T@e5vcE>Sa^OA0P*NsP0k$9z$8$ytqs{p!ecDOaF6*zd0d)+|T@L&~u@{R6eR zNcc%##fTo(htvtqeB~&Q?U==WRJVFS7-cqw^RPvrV`?c`nVfV!}W_EIA0?SPDHMT$^@d z_zkimz_go_i&{0jAx^=^jrl?s?NjHI0W8r+Va?3mmTC|xaL3(xi$!f{%h-*^8pNzK zZ+_TuLx@^m++r0q^N{6r(*4fm&Ye_Ff|E9@h zX6nM=D6xAY96Vb552;ghe3{@hh}{GGw|>u?r1=%PA)~3bTJ;aRE?KV>yaeegQ!SGy zPN;s+U7PHidTEB&Xp=9^B$K5?S{7*5K40^)5|B@~qv9%jFJr3t#$)%qlGLXU=EA?~ z!?)h)gH4`kcK6I(BFegQ@9&fqW%9f5MC7l62a_;Hhbv<@ytXp$YnI4f$?GiKgfKPV zo@Mlq$;^C~hc0325t?V8P87pZ1;8FH zdfk3r7lTg@^drf2#{}fWh0TI3oCW?NN%Q;&yI<4p=z85YxgR0?AOc9YF8X~E+#Rpy z|0q4+oD12VocVM={CQ|0KL6DbmDP1;JGI{EHJvaju|U#J(H1+a6eu+psDa|BBeJZy z4wo2Ap-{7!ZVJ9xhmB#&vdMc6G=Roy-Kg7xcs6I{4-!<#+c+4g7fbi(?(?FuuUBvIR zUuG`vqv4Tm2r_WQG3Ru%>~b)DXwIE8O*n&Ad7Ayz`10yH_~h*KYHdlS*L5@qP|adc zCi4x^P~*@ajBe;&2QQNsKJ}Q}s@H7LZ-oT?g}d8Ic%$o$ixAop?WHUECTearHZ2v! zoS}QD<2q`;Ok98cEBUJQQ`~|~mA=3UwRxy^3XZ4EuNg~${T2BmLe_?e%o zO4>4hP&ZwEBYU_*_fjcVXLGv37WF0Q(3z@R<5hwHveJ0Iv3_a?x-`xu*M$)7Hoh$8_2l< z+8_Wh9tyArfq@agpaGmgBVdrkMJV5Z6zUYJVnh`J)^P|hCSsV3u?PuxYQ|osxUK&~ z;QgS!%R7yjqczGw6csWD%k(4MFJJyJ^Sk7BERsf|WwD3ez8yVKD-(0@B*>1X-g|9z z<(N9I9_~RTi^Xd*IR0yxT%ajOG2f_8GiaRXJMsQtp4tyNYi zE5aNvl0AEPOEojM-?QjZ>pe`>CNA~b=WAzlB`L;Fd%D$-Kg(T^6tuU-sSR`C_t{A! 
zH~+2=*O!&fqg@vk8HVs=C?yQ-NvwKXB14LfW8TWN1rK}h=tX4=Cl>D=)SZaqQIgWO zH%;$cac!(Wa2|5>-o~p&9fhu%RfIsJ**^|HCH+op&MbrJsUF?S*1u8kRBm!?WLcZw zdG&)j4k)Bw#%IC=Km!98?ft;sEWzF$_j2qoP+1lx(BD0C0Bcjoc+uId;0s@j%@DSY znWQoh0iW&L>|BXhgk0l(^Xcv_+b}$fJKht<{TAD~!nhBw(a|$yW_Sj`UVR6H>~ErK z5_iW=MuTRx##hvImp*=YPbOKQfYV4_KIWcDve1>upO%5y zPb=DKrp88J`l-_MI50~Dy58GI~fwx!wA$282 z5lpmef{|ycyXlC-_dA+KzVg~)dvV{YpDkNd6jSq>r)3LGPf8NT))RLbAI zI4+j_^`(eFXS>YCgJ5iOg5pigy||@(dUEk?(F`B&bIDrab3}El7vlDq`@Gsa_IGsM z(g&{WCcMbCE9^%uPjfr(k?G2?-8{e7Yt%}dXdWCP0?7p$yDaS#VlhX1QgWu0j8Y>} zZ$>Qx4n`t#@MS)TvyC!xw45^6?bj8@dm}_4E18RvAevIU0SZ~p*N1lkR*To@d!@@l z%m@G++B5L?#y4zuf}U#fdYTInpUs%kGw#!5Z3aZn3DH7>_fGfL^h_)YE5TytNwr(ItF{;>C3Wj&=N zO?-CR>dHuJUZ)9gNrb}Q&X#X{to|f@2w2th=QVp;n=DK$bj_ThF{2;ss_aRQ7F!oW zB-UO!w5G)jQy;wuGe1O?q^yP6zb{D8UYQ_zTH84Crh-YvPCgl2yg`yd_FZ3&F*D;} z#$QeO%o)l%sG^HB2Eg%vq+$yMj3pG%nSK50izdbdmi(y1 zE)GO>Aizh6sSen50X{K=dY6Um$$w5vL{WQG`2uS$;4q13wh)n;xyh&;sM%tA4Lw7i z=o5*-PSe{lJxkm>rjY5{)YHO0RC|Ub;`pf&jsb+qE4kajF}OUVTr7z(e2aJU!z?m2 z%o0t#RQUD*{qcv|;S~}xQP=pk#O6PznVGAd&Xgz|HTnX6hbc?EAlIMyhvZx8N^y#U~fK_crJ*6 zNhVGV+qHGmV&Be!{_bMIdc~PoB&}x_$?}vZ3p*TodXMT(_$&Bk3HGo>bbO}!yeg!6 z7{txP!jIJ4i-&ZngAv6#VyaiNpMnQsdEj_KO}UwL_I9cz$5NeQnm|xn@QcWq?f8*$IIl$4}25U zS$REzPQ30?#m8XN@abWdAAxYWgb)O*W)n-c+|5 z;I8L#aq|hjleaJ5e{V0~e|8#vjz2$?-E3sn6TdHHab)F2*Bb|%H;nz;YBQBmLwdN> zllPj}wuUjHOsY?|Lp+gu>R$AdVK@xFHVSNoBtZSm0vG_r884F{eH7KquY6V(Zi>)MrW&X zY31#_o1t?K1G@^T4Xgx~;MTI0E2eS_#U1~O0woTaTH~j2iBWGnD@Y9b)>|%x{$&b}ULO@HjPSneH6J=y(NW)j0a%(U zK#qJ8TvvaDMo}PElV+coR)iRUpm01Iq-W6sjRY}+NgY_>W>X$uX9gHmC01|(FU{E6 zg$}MQy9ov5JyT!mRzg&ffAe}uGDpN|)&Yx%m24)hDqB(@9p>7qk5g_s)3sepzQ zB;D#7;my16&M5oZsEI0Tm&m8xI%<`0#Y%lD(v5abrrl()FXAssV|pUG$Co!(w~)B z2w$r64Im`FnN)txUZdOl@_sYtPxnxy9ETkv5WC(SuPa^-=^@9set=T36kt#63BLsq-bGX0SmLCHn%IMNckjhDvjjN61l*znX4Mn zFOqL>i6=PCcYs^ol2cM8t1LV9?W8jaEl6d~XhftB)EsKpYz*amap@k>ILY;8JjGl8 zLtF5}gk$>D*_*@uydQ242Qk;vusSnFMzQ=U<7lrD#SQr>BQ=~|uth&0@*7rT<}!P` zi7Vc-8pgh_(!)k=AXE>{lThh`pA)g%#d`11?OX6gBoH$=F@fg)s5+{@LO-+bFEbq5bEd zLAN|DEJ*EIm*jdElJ694W>1%TTKeAYrcAIIJI*oB+CeG9|NOkd#YqdTMSwZwj7Wjc zPZU#)-3jK=1yb=OXIR-h4Zg7&92(r(xNbC7wm*#uGrymjgunC56n|nQR^HzKfyq|#g z;Zwg51T9AeDf}evU`!8U?mRjivAc$wIR*jYd zJt;8ra=>52n45`-VIiRm=r+!}=rFrqQ=9JJu{VBe1Li3yOuZD+CR`yoN&_J`{|oe_sluRyI?Ch}SgecKBhGL!LQuM@by!fePj6=;Lfn|%QBm0;$E!;s$CzZ8 zI3SwywH^Hww9^&@B@nd*)xUbXWIknNgLq^kys0w9{LRsGy!g<+|NNeQ3WQ6Wm^UVu z%0>Rwcw!3lI!*%dNIUSGG^k>LI&?)kO76XbC9m8J+rH)ef`6|GK1Ld~t|?J3bhtKK zRD_EP`xZ?OLXufeg)L2(ufiN|RFD|p6-<#iEhytj;H3Zl^T~fNf47vzW1=y%E5(R? 
z79j2oaXfuJ!N5j`; zbqs&tiXbh%?q$%S2;5XXB>~uHmDWgD7mYj0teL7uzBDp04#(aFhXpZMEl0(RTYanU z%eHt$mL7+LjVZmGTf05gE^0hh5hBb$p*#DDe!!D-nrX7=08KzV0oh)yufz^Fl4YCV3bk)SyKk+X)F_kq_tD)7n%I^}$R zV8D+)qIqKXDBVK<49D1D41gm8fSID;$4mgPbKv6#XorC|`Twxc03fC@0QmdI`To^^ z0gH&OBVa&A`r2d{OZ?p-nY3BzaoTt_LBp z?iImMhv{)Ly-;@-_V?NNdt#8S6U+-Dqq1&#TwY<^${QD-y$tuJ;n9KtilZU7XPeKy zcdh8)o*#Vdu$M=sCPP(uU7l`~@pUaO3MwThf6bFW5K4_%VE31*vHlINcrdG(uT`XV z|9uY;ei`j}gbbr`?%k>O%yfHo=?-zsak$?OA z39Yng#l2u~eRC_8b$tRHklPu&A$>l|-tgNmK3gNR57z}3QIz+RL&9|&J{z5YB%#CN z9@6(&`3%|T+dm|7f*)6K3QeryzajTItq;1^z)$?F3Ass1&a&PSHmLiD^uwvGm11hv z{>#Fzvn`8%NMH7D4Q^78C~Sn!b`TGDFFS8yfv1z~Gni3ye_Qk$e!;aQv~*w2BK&8W ze901}HR53`I_mnOzW3e%&(jY0V#*)!K;%Ms<^2s_Thl{+Yp^X4(VL}XdwU_fSbZ-e z*mGaw^Y^j=_`gMy8X-67o_^nxfp2(adwb5ncSwLK*yCLy_792qE$|gaNsT7=3c$DI zs@QLK0naDky;V z;)lJ~{-~f$EqqKo;5tv-$>gGcMu4}7le;r+uf29$6W7XX@3!ESXH($i`bU3@hn@v1 zu3T#>&Knuulw}DJU_1jdLg`-tZ@srofqqN$uOjl*fsdL;qyI~((vXi|0Xit~cGLwh ziKvKt8^EUe2ow5R!zb$dUyKbB;%%mF1Ux_}Wpfc$)Bj81fX^L7$}}P>^}jqLg5-*) zfKmgI(+1FAyOpQko>{+wY-{(_*+iC+DQL~D*}VKvX2?M?{T89-niap`Wk?)5#JL6keOY8>g6kfB*v=q)}8q(lhntzcu;m7-_)~hxZlrT?J#Rd7{5i3LO3Q@#4qrXib5x19iKq zoIbp?C$6N<+sPQ{i!BqaWHE19rFKH|W#3Q}BS}~}^%dAhoMF;8mC-}KsY#VmVb)df zzo6#bJdxQD^E!-E9vY}Rq4gh_aun)kOPG5cUYDAmIg4k*10x9Y~q$PCV__-I)QjPPFM*H1{r3 zEQ7IQSyF`GeR3Yu_f~y?mt^lT#`r{A*Tz#IKLR-P0Fp+^Y}eP}q2NiCNsC?e#1UTp zC%V&Y)tS9TK|@7~(ueBS{1r@J-DJWb3q{Feu@q!!CDB+r$`>BNvj_2?Rou!iy*EU= z)S?AC{qc_!C$lSBG(qiD1Knn%3m=(k*th0Rq3X_Lo1J9?>q>9X#F~K2H9(JzusA;w zsGiuZoCH8iw$m4Yo|L>8dNO~d)+Gw;7P6r=x;qOO~ zqvB80{5#zm*r^J6a{Sazp&z4>qsrAjT?&d0jOpw(n0M{W5zfM7_$h=#%Uf7N_XX?` zPc#w&N4^^H*~_}96ejk5rg)rew<1v@aFe$^>^R}0-@d5QieUx%V|MUC21V< z!|%$Z(PH_|Qd2Pb6Dy=%3vsx?&B%t$juV}79%6UMeiV4PQLK|=0&y>P)@WB)5sxo6 zL`*Vkn9*zXzwmFBr!wg!&h5(B_AL~e>N;|~!J zvdebG@k8+VZ~*Ns5HAS)ly2aMeD$3moCGo>F&^=;fzSR&)mMN;wMA_YEiEvFNOwz1 zr$b3MI7lfCQi7l;Na@fi-Kl^epdjj{M!FFJF=$0F2)$VMUuU@Y`@a8qm~+mYb7o+e zcfadhYp=Z)(AEWoj5%1m18aSdSpbl~;IV=%V8lbFgv-CHd3|=xD5rajv`XqTye$qqpdlDK^BZ_78It1L)E9&m1_6ryeUairR99ocF7k znf+d_AQ-C7FGykDhGx_|wu(#2&W+&^yqnp1N~gE`$q!sRS~r6UU-T}dfJ58BRg!IS zslHxp4WkwEo|ffASDDx6I;X1CF8|53QfHs7a?H@{lCaaP>_HEfJ|_kEmBhxJ&$QJD zuu+b-$(MmdxEPMNw+AMme9ZE5u5YIM(#L=0-RCVeVIW>hp3OBNzd=E+UXa@Lwfgu}(cg?%8I>uEipD>)`Z3qPPPrl%A>Msl3Ka50^T z@b5WEVo<#(8EqW~2vfMZ#oY2jA(+65aP~$sfkl(iA{QZ&0q5vw z7U>q43KF{61VOMEjBqpb)9SzSAR(@7-MNlrc9tt?u`*p<2T>!T6~IYnlwmkgD^CS79k~^hhXZywaTk^xU5FuHJwIl} z>$`;TJZ=@=NPkH#QDA%8*vz|bXVv21!OOR!!4SNg857;|8z|B+MWU1T_PVXL!3)Y!j_9o z=OH@dgJr+#lj#xO9v-fnjr@uO5y38sKv>0-4c+3I(gS-nnHL^TzTe1@B39+P^3jT~ zY)s_!!vu?o$ZWrFr%s7aw|N9DnP0&h+Opf+fU0WrPYniWh<1!gg}>JOdAW{D)AHhu zRiUnb(_b~h^D_6y+moqHniCb+5uNZNYLF)5VPzvuA0nY0);d8^tt)@KobOZG4_k|+ zxpV9Vk$>JlR94tpA9<@0u72l}Uy+P1O9>-gZEbCX3(U7? z5$A+ZUJcrW8G}S6pOai-xlothl0-pvZI6+pKBpF9T z0<5lJhN~`rk=U+RupyS8RoQ_f_faDziZWU+EMU-YF9XZ`QB<7k6Fr5PS#fKL#a`?y zh3ApQv{$d%uMze968FN(m4pQGJ@I_hX?rWj*=IbcrCsa}=3LTYm0ti)ln{i$`D8$}KcSdOx=-{_{d)Uc+fAj@g-&#v#zn8azYOb->T5dg zdv`tG99wZ6vZy+D58AmSy3mQ{1_Q6+)T2rU?0og7l7}|prUU%P567-NSo$&VXOej} zwCWsZ%A4nT@izQ?dD|lIrjLhcO~$2D){9`2V7Vfd0T0ug7+PTxd9rxj67O=^pw!n} zamrWN%G_2<+TxVyN6ARI&YdVDC65+=Eo4Q~c$VBvkJR$!Fx^}aMMtCCsDS-VEUzsI z(QOj-dU$ekHGAz+&l7;L6c6s}m>8yeW#a!!Y6?K$QBOH%P!o`aqTm(4kG z2WEvG9c4NYSfRv`!{>5KhrN&pn;UQ&jf_D!6Pf@yr0$?Ac%~2~V$g8>-NQ(o#ehqD3OcFyn->1O$Q`T~<~>)xSkWDgvgl+d`W^Ub+KNhN{TtgLab; zwPnU5>s~Qv!fKTO5i}XT#srUS#c_yL1;mmrfxH45fgZ~OK(jK67e7`t6?cCgbG`K! 
z@2lw6d-c5rRmw{d3TvfizI;M74P>%TBZ@CS2Sf&NB^2but>mjcY96M3aIHU3bPo%u z*?hL?nU$#rlJisIcNX!+MkC*&H!4duiVCo^)AD92o2O}rPx&1*ZA%IYhee!75S&S= zXqVV3u%6y(5fBSw7~qf$wXRv(pI_-)sbrTN38`s|KId4)|F@oa6T^d+RonU_J>BXdQT~apM!$FS{hAt*ByACl-nj3wPJw zzw4Tb{40`Ke=f?>6Oq#93~N(b0*PacTaM}H4ptA}is3bEhP) zoT^u3N#|Ql+}OQi7R?>v`!%b9ruf5Lam;9^IfbQ?g2wRp`cEebx1}wn*Vx0PyzZZO zYuOsDeBDbdlK#$E{5xR7VgGj)qZ7T=OXjWRwRKLj{85*3)&50%D3^;R5iBf4ne#jk zdGF=(u4?tJ@~PwA#AEo^k4q(fuXFdE4--pi*Pp(a_vTWto~1#&zVUD0anEPP-`Mu} z7@mF|Ga2q!6;j_7eW*~!`+7jzD!73(GfH>Zq}g2n>0(#19Y1he@EIaH241dhZPBW?{aD06OT{G;N)jy;#yxZhU zr&H!9Pm2EQvKNtE##}z%yBVVS0b8`p?X;10LtBQ7LXs0`5}jiP95}ZYj+qD&i7OwB zO>G6qoU}RCE?Bxx&QCR*4|CwScI(PS7|ZBkcH! zTMSkdw~W&Uv4@q>`Xq!1zE5mb#0W-#HNlBQo|j1pA>obwDk8*YLwR&SBm=>e=T#Aa ziOQw}vz&d@iU?YQ%TxhSjruw)IiF9O$AQ5{JdUc~4DWPXVRLhF9Fo`=h#(HZU6QdK z+QJokS-PX`7$NLYBZSC8ph|Kp;J{qSQ*gOLs}O z0Bz=)N1Yu_ynGz*L^}zZX38~$_9Cz88(LM_h=el!laFVbL+Hdzq%8JO!(VaAx_P>L zmn*ZwS-FiQSZPZY!!7EtVH1%O;y9L=C1TH~GL-8%eyJwn#O@D^7Y-`$SgO4YOneaO zxxyg%o2x&MFHNEp3&LDO6a92lEQb4vX{{H{s-?^~Sd&95v#-6Urh%Pu_wl}3`QSup zGLm+V2IX|mRIhdS4PAShKv9Q8Sq>(gEw}XDV`D7TEoT|^mGG)$3#8wMf^Kf3PYD%o z3v;=R7qzIb$>g{)6_*e4Fzq;67+q%cR1~wU#taJwp$*YKbsZ?XzmAEB^Dp0fk>e0z zl7@E}JAyGF8Tl~=(jf)yfJ&#Do&XGtAvqye+f%Ag?3(1t!KLXhyc$l1Ye* zK^7EbTu1=EG%LDaN(3x#+?7n5#gCn4?uK~v;S^+131!>42wAIjg}5&twjmqv4X!7694TWq=Zddte#?2UWu;uB~^ zTBz}gmd?l!ZK^#;c0Hk4<@mDN$kfEDww?R47n`^^OfCNw`W zLmydliGqc~Lb%1I4A)9Oa|qUA;f} z-|D?Q-BloUOU3ieQyN9ivkqJ=7xNyQ<_|>BMeX8j`)Cw)#ar}+{(*ti_ zBK+rv{GWTq&WjzX%fI7mA?uC)*OjRQpQu=4a^(Pc2Hi zXWw9qbv|c=oizS6!D_Z(6C{lH38utpv6Yb=NhTU&b0QG$qM+Epf22V~5X#t`m2jZ@ zudIT`9WhHfgpm#cpD>(Jr@`lBFenEN#Rntz=w+5{c}QdwT6_@nlnpY7kxQ0vL}F}& zsj+2_Srn;Zh1jUq@nsl>u#6;+N;FdiqNQNCkdX|bdT2^?4#ZI)VyaB=cw$xR+NX`c-B)b^ zHWfPhwXm*_mK(#KkvVzmZff&i?{55fL4JqH@5bJ^^Ta=>MohE}>#w~#zy6@@rPz}G zM2s9lqItbNiJ$X-H{4uq(cDb_3s(O+Mt3Z;!DLjprlt41l!)n#**c9KlQSljRg-F` z6@Dct^u|4+m^pdxK?3Kaun4y8=kj+X!qv%@rrC0pX2nE$V-}w(Rpn&twu`A`cCqBM zQbD*Au##Afiuj%)bgJRlUM~-c!iL-HP4|zow;WT(!!GW256fRjtxWe#)o)}C?o|FG zsA+*4Dxi%0{o>b_9+4R&_!N`zEr|s62hnNXzI&pLwo%bK0kapdME~jBDdDe8j=0Us zr>uOwy$z)f`bNn|^;Y6h@SQM7=rerotE6#9z9@43<)ou-E$N-x4D+(9waPvB>$;8h z(L2y?wSm|=og2=uguwQqS%gmorJXD+~alcX%TsZ2q=$dMG{uoCPL0%fo}h#ygy zWUR9iLydDG1n$J+0re1ffs4r8l?>EXMFs$bC-BlV=EN)MAb2`zh;XR!m+4TxjGtf42ojvhKiO! 
zW!jUQ8D6$LUj!tgWMZ^rVmY6ixrj{_s+JPhi-v8MK32MKMb@$x@oIrHC6Z#C+JHEZ zjEBaY6&8v3JmY*cOLUo7+3XQ{bG?qNIyBl+aXjmOdsbexkX+*eqyP5c0`ulkvd7qwqe&v?7Q|tYyCwwjb1&3(qH9h5DAD`u( zkaSez@bJC*yif(Ng=4lB;Y2ad=d|g^i*~BEc;6zd7$QNwWundVGHD(iV?`JBWR=wH z?%>WJ9xPbj*XA$H=(g0j@@l3|VQ|7$Brfx6rLFO-$zS^uyYdMl7NrL!htnrKL=M>= zQT*P(>w2{4Z4^!SZ}nbP)qdTxq}AkeTjP$Zx8Zk#rQEHlg*PMPv!5USdKpf_pral% z-I9FqTbonLD>MIVW1K|93lC3@(kw8#pBz_Kx?eM5Q(sEfZ+ei?cs^s#6$f87IfU&{ z5W72a6#&N4cnA+u{d>TJP%weB!Jz<00lW#6H^Rz*b6{|QFyRK7s{jAQkp^3W@ez~4 zhxD}pxg{#w7+i9z{Y;WplgN?B0I;0J!}JCxpf>+B+FZ&NBO4J983}DvE8x&hil-hq z5cnW15L!r}&km>?A>FoPSY#EIIYa?70k$1AC}=2zLz#;}fRIq0h*V01NDEo@i6=%N z45kL`4knzw)LY*9t&W&bQ;OHw+zQVeY^C0IlL`kpr~cd%pNv+G#q8{g0d{KP2?2^+mRx%yL|44VDF$QKIz z(FT3A;sL+J7km41DF!wUtnV&_s&LpjtdfhOtdML$DKuGkbRUhZI;~Jb&1$RY*~$+y zMMIt^G|JpsLo0r8$~>P-ofdnP5L)vO>Q1mmiq*lDlM>?(wod#LFJwT;Z&e;AN6OZ!HgxgraxHHy7b#^1)pR)UD4H#XX~#kxySjZFjG|2 zG5d)61rIl@cX8t?Ve%vof@OrS9KO3GiqT=>GOUzOFjVOJNX436e52*gT?C!w(fzqSvdPvnU?F(M*>S&^XN zQF!oQhZbOmKoxKrhfpzsK<|HP83>l3I%E($M|Vtv+J`q_Fz_5qdRAvNqTv z1(+hD0i*yRZm46~I026y>Oiu~5aFNKB!UpBZank~TEZe>LZrnZP%s%xg8W;qfsm$7 zcuojxBde5S$OzSO$Y5#U5i;2+u5xfE?cZ;8`hA!8*81l9!T$Zz%w~5rYQ_8a^fQXe zH`Pe?SDtKK^)in6y=>qvtNFz1cqLaANQ5xU>oqqMFYI)8|F~RcJUJCQH`~tk&Xh>m z{k?cayoNW&mB(*}3q2ck8GESClwQ^%emZy$+^mn*Az6Oc#gcW=5N-ibBD3EJ3m_r_ z{wCdNi_CxOw#gBz-&ZSG=Mr^1 zJ^Y}E{Yy>5L`Sut$ZO{AZxs7=@*L!&YENds z{FxogIZ=Pj9AioO(mk9G92gPwXw4u`%K6WSbu7*>#m zcp22lCYhLhEM&-i;G@%Q0q(Kd=ovv}hr$BSB%CyE?X&aw zgBdfXsgm=;5|5vlKAih>vZJqm`Of0wRRw`7HeTa}t!EDOc#X;M@TVg!q!B^2IZM^JFvpL@ge}f&mB;5;6+ZncuvR3Ua+PSP$#CVww#7(=>-oQN z7r2Gaa>W&Fmhe_iRY%7z_2-5^Nmw_YVoZ26O^?C)b%s61H}aO2rvst`!5Rrw zZAMj_zuoD3tEwE+9(}I*C3d1A>a0cHYTphC#pj_hlO|_beTxbwZEv5<5nhdu9ohwh z?WoTCW|W#9?P?t3are5;e6Tea*nPW})kLrRN&IDUmYs%PBc=92=8K{V#E9?^g1JY6 zj{p-;cL){1>R>{|4VWpLMg}^INh8(~roaGIM;4MK2%>HX6Ul=;B|ZY2*7KE^7<6>N zQQip@?o3$8FsQ~3lpWRE6k5T#foVjWYeA21b6%g@1WyLazZgUM`7F%p!;C!!dXO=K z_=6c)m=C8m*4Ba|eT2gJq)N!xMrh|~p$rPqcA)Sed0`T?T$l*$zH+gzL;B#{!&+3} zC^Qst@=Nojh_&6!m>BL=P*(N}R~Qj4-?B=1oiQ;YXBhM}EpjdRdzzunREwKK|74ER z;D<8@9L%^89&7+}M*9$HMyZdK%o{)7$qQ3~511mZC&hd;T5aKI;--2MGy9{U>%A_= z0}bMmSX{MKf@RJ9FeCnM8am2G){xWlbJOjH+a7jy{1h4UFEL_`97+umKEmZ66zxQ> zdUJf;cb#aD49s~kk#XN}x-dx??+^`eLd+K+4ogV6MWXR!AXh=XNFfDwYVSHr6}#ZAqUeG+|>im>JZTk(OwNHQ6Vk@ckc{SWOg05WG4?T;UT|Pn{k~RV< zYJ~DZ(5V_(Muj9siC_?|7e&UPxmZG(1~u( zJQVy=UeD}zbRx7v)9Uxq#&bq2xER~cF$fqacdOqs=*k;b6cDipj%1&n)1zJ5r$2VK z%dn68@ry0j(vE92z9+3!=-Q_vc#%ME$C2?n4)PGp2sN@nPz2N4<`l}SeAjzeR01|S zTbwDkFH(O`q!N5ha_YO|leK+~!pv!Lg9UZp62FhNXsteljP_dv8c^M}Q`cj;B%ye~ zwJR@B&l^R7-PJTAVyW!3c_(BxU0!J;>-WZqwxyif>Qlwj4_+ypn)-R0{KuA9G^Z1- z>y~%!sr)4R7I^C^h0ps+%je7=l=*AhN|bl{W%p7`%3fBQ*3>5AdY$Aa2Xs=czuni7 zHQ^S%RrO)KPb@X?U&;lGhs0EXYAA8!q=&wxkl3c5#t8T3w#7>1IdggKCyA`N>U5{Y?yp4#pw&d zkq1Lv5ALXUGLJaDq4_S2J{03mptBqOeqYq+rAGOHYt`3+VdFUwFXg(*Oe^OUzxFZit;A*XS)3_<=Jbm}j{A)l)~4K&o`pY%S0#kNDKqZHN)^;j0Be~qQi}J*dqmaks0o7fz$VG|Gx?7#$ z5?67HQk#P31VSBI>Y1~zW78f5tt!0LtH8CYsd=234cB+SsA8b4o8H^yE?z<#dF`?6 zj>8~K6QgQ~3<%(rh6O|2M|urpTFB#_A-P-GU!MAwRL&lJ34PHN^5E>Iu58x)OSR$| z{d;V?Lvp!y)`Qi3hjWvEu&MT^q;XiYx>$U)bQCryIghFjczzrEl(Wb}#Jh;zXYAov zra^P&E0kK@cW%c!EF+c%kI>TwT0CSwtUDgej#QZOvkj~|D9L#ZPR-nxoHu9{op4d1 z+N`ITd>@yku$^VvmEADK_8?HedHDx!GrO88;M}&R`Inus^wYNeW_}TG64h+1`fGOh zI4+BJn@%@W8>oM5GzsOieucSrHBE4!eCK>Fd6#9&sz-;7WtQKp&V#zp_rC8}RV()j z_9HI3{uC>eGe>})?v&jGh^|;&rv-@q2iQarf8bA|vL2Okkwo1F8**u_9LpS790x zToZHRm;%)L!H%2ix!b%;h9v3Yne8^5^q0tDvsbFz&T1C=2~g=pG#O=0Z{F9*sdZP> z?)1eOAGpOQD44SyG+3rCL!avz^dt?g88Px{E?) 
z=!}uck(o9i@gY25vgfWuy3VdYBb&hNzjPZHy}8` z(4HE6E)?SJs+e<{YFY@L&FB7ocfG#zv+Mp%=@Y-Ya-Ge6bGo=5*x@`ld;dXIao0Do zCt1Cy_&tqMwbeJ0&?Voq95LuRaq-aYeDO#1G27VO`H|f8OzqE@Oy*Cj6F7$GO8#oT z$*))X17$3SJ2@UPwh}wu5jo8JyIpzt`Z0~?HP$ntHePS0pJ?5BB~{V;=<-MG<-pIO z;gVoelQb7aLeF*Uuy9&xg?Ii!lrx;Iz=P>ORJJPqwZ~0Y+t{0o@1!>hQD{n?K6r~ zK+zAAQhOfe47j|)5FV}o#b$rV-2qI;`V+Yj;4Z~cfeZ>Jf1c(-Z@4|(-UM*EQxG!T8v5y!(j!zs8K86;8`RThEGFfQPB z0Do^AR@u1RJd_|PfSIA4Jp{Tz82pEFtK(aQ)IdZqAl}g|WE{`|$3w{gNL+@HI4u98 zLR@fo1|AS%2tdt(L`A~oAUCRbpz;!Gl;OmLmYiV@*_c3@@NARGT%~;N-$gG|Y_T&M zujtJ(Gxd^d{fqctU{tzzm)o<%Vmel*JJupC9I4PP1JxcGcPWMJS!ZXO3q`x0Y)TfH zOFrKwiJ7i`kd4uOlx*2C5Iq>qTjs0ty86w=;z9O1U&?~7zml__4@4GiG-pl^SUa_2 zejd8d?ikM#U*wkN{Ehdd?zy9C$Y#-f9{Y{sWxc-DaR1M`)GZtPHO>(tr$-Oe+F6A| zC^v-sUw`ECH<=A>@p+?zJsmXWO;Mj>&}5tDJhQ&H|0?2Yo6E~~lQ$xNW3drFZ4UR8 zZ^AMGh%1mI)c;?^hb`s13F?tj;X-9f-rzgh=N(&ZY31dnmWBc$tl=4FBWvW+l@#!z zH4{E5qIxVBeUn+2RN05iXSCmI-Mb_F&^K5A8s+S}2KVF=6J1F!pEGw!m}co_72Ljy zPl#Tppi&p3YQM$I%8}~D*uwYD@*=~a-Y5E5n={5KtxHxd9Aiv+%N?%|=YsX7a>fgF zW=EYngAK-K_M12`0xu@?;?F0V(A5hSMmn54wzOKKC!VyhCte(2*k~Zw#masw-!4IO z`l$rkUm$;T^=-$tE=R9Wdx!8R1G#OvWV_itasGE4Cm*ZQ*xr60WTXh%cd*&NEQ|?c zui_*SIB^fCUd8DS0529UVkLwYDG=DZDo(O6`2&zAfu~6bC`U;Af249Wa5`r~4gt9x z1dMw-5`&6{yAh0cG!U@vy(A%z({`=0qNe4GZk`r_U5vdenk*s1AnYWL__=fybT9^) z^6wh)N#0&i#t_MdxDfdBh>(F8Bf<9~O)W$Hhjbuuk>gY@AY}+0Lm>iyoPjjh%fyhO zfGh#Q$UyiAg(fO(TKb)(*KzG#&Y~v!a9gL#_$Zl10Qx*ls-;w{$Wya_q5K{mnYSgav5;qt#mSYvJ(x@y(!h@3~&1wp`*p5ymDek zMc+nHdzkM_?M*Up@Ckkyz*ZZ=WQf?#rn%7gfb|kaC~JN?^GT*!tnJf?`vRNmG+NB7 zc6Cb)QFDyA6#0vvRrlU>ey(5er+l)rS`c6-sGpsw#z=8rAv6Y35?gOR?L+O9ilzc`<{kQ*=9^Q zzll!`jN6@? z2Z*I@qwDpr0>rwyyfOg%vQm>HNYw)1CLRHHDrnxMjjGqbOaes8Hm{=;%SpgCka8S> zSnYswli~rgtBAsm|4*A7$?fU3PvW%VM$5zUjjD-=C{fnSsaRQOL!|h>&z4=QK%pm# zV+P{LajgX)sXbx$=>5g^4N-Hj)Q`t|$rTSD6oZf_Twqa(y)FeWaE`hO>1aM;b3+32)*&lge1 zvdNX5f7Y#vM6@7R-uOwz2SeV6*i_y&CL1>gp>BPnFy`TKi(`k(lFY$q9?ah59V@rn zjl~HvlLe;yN%9VwL0uXLcO>(l?h8E0mQ21-Vzl?2+;Q!E$Vpojv5#tjWm*JrL5zg|k7A*) zQr(BP)vV)^LE)kG#ImOz^Z5odwoY3))$0oH#x&eN*I!c1A|mW+pN2j=@(WpKYDKT!3@HfX zTMHbV<_PWHD8I+b8mvb;G+`WM_>}rw?mLdqHCy9Fp}0F-M!%LaeHEw+I#b)9>;_~E z^&T|3pOiac4bz`wFJ@bBL=3StRYmMB9cYqFd`qJ}Q?*_mmi&hzZS#Bzr4tvO3QO^L zx@17$yi*(95v?An1p|awuOVuonSfv^U{-MHKnnv{;TnZVp*I;k9dM-rol$tCl~w;m zD~OJt7D4mg&OTBs2yOwuD~LJ>W4BubcOF<5d_)ATh=%Am)MEzN5tV1m5s{ zA)*_?^2po`!BxbOsvtd-K#6F9@BmnMgAc03L7&`YIF}K@CxD7FiRh@mIsn;DJ9@vz zLx>AalrA^3SgXQbD)EU~h zmuYd1)Ve)~ZRW~O|KybF^2Fv?wy9Ft89N#QtmVZjtDGxT+0mW;T=UTr^0+g5h7Tnx zUoz9OJL}ZcUhZU+nUwS`P`DzDL#e2HCQgYvVL}pV4m^hS6H41`Bc|PMG;~vwf$i-(D{!1w)jmM+waU0RbwK;@1w*{ zJUFGVXr{h(;_ijhCsOw2r%xUew2Y9qXJR>v&!Q~Sts2SLt^Ph@W z*d7!)y3Zjk4Ht zlaucT1jd)BKB)cl0S2|l0_9q$uF zZEOs|GNhr#e@H}l*8=hTSmlplXO4H4*>V=3%)O46A;%IT@>IPWdC2$b8g9iLfArnb z&QkS$Dw8-mNlpDYy7^L>Eo&5!Yxtx%6<1y-=B-w6(gqdT@Tq*QV4Of1w&F0GI@?*c z7!POQ4w*hq+ij6@zkka*&{a#0-* z72t}McZWAfZ%vMBbJ7NyDtfk>UK@-Ydi24ephjWRAefF;$YwBHmfJ@=cs86T)qCl} zvn$&X)j_vresz2{xDd$E>E=Z|SR``jO_S0@ZyAD#Y1N1@PIJ@~R(n18!dHh{rjEz6 z?w-+_?2-&c_p8|bkB)k*GDX{Ix?i@aqeLm{g`PT6O=;*$-%+SNj}kN2jg!LrO!Z@mJxtr?dSS z#w_lApSZPtE>?u@kn`g}UAxA3)?@U6QoTyRX*ih5Q*ZgLRqgZg`=$jTfe# zI0T$bmL6KEd44;~geQ;g3bWuR8<)YQU=d+gOQ*5nz5TLq$-7c2uyUDcfa%v?|;=r^YpXXBk(6=Kum-Rgf_T%{}DNXvh9JIkrM==fgqil2GJP7C+~%f zH3!rN*_+SeU0=P2b>cp5&0TdJ_D=g$y`O6UpfPYr7XKE5ALhEK8Xvz0F?S536>JDc zunA7xgAk+cSZ&}s>loMtdPVpx&-@|m^2{G(P6QHrIqZcmI0D%`yUa?b1%T)znW$JE z zNj>i`@9&j)=SMYOe1_Y2ug|J`bzJ4WAIWf{b;__>)PdPFWi3_kZD^|Dh|IWnOjqms z?5No!%4n4rm6QRR@_4tLHEP*SCD%7C^`uiTbnYd!qzSzrvcme??`Jn9+=lG0z`!f= zuvR4IE&Aq4!oXL{=1;0&SEpQ)y2{4MSg4aW-a#T_AMd4V%iDCiT8+mpL|ZZ6Ugnlw 
ziR*tc)kqas`S40po?Pm~L|4yKS-$L%RQqMyX?N~(c|SDyc0aPoUDJcxhwurU+pkWI zat%7STa2DMItL#Pxul*~O4y}G-A#4*rVX!Bj$NQcc8S|@Kh2(U^lbMZ?VcNItX-Ti z#~7C+WG{OQMf~1c(RcoKZ}e`I9_M?Uc0>ci!en|fRjcIiZjj{|1BEUu2NSvOt%$5uQq{8gZmy#2L4+KuJ8BnoMaOfYjqMNdfi z^sG+zKPU$7$hNBO(XGCGPMd~KEyIncN~Z6>`pm3HGO4;(d%4&>%8@+q;x3cyH294P znVl*QeW@l9wqjH*N#~$lGr26tDx7^KQcA-@7h-T6?0^~o-XR7`7VIV!NNJdlu~L)4 zk9e|?6cNZ4k&bOP0{IUh2B6n1=V;FWmylRS9N_^P1`VnO!^#}T2R9Ic93tRRiY_VZ zWgtOtGz=6R{7&F`WRU!43!07&K+Z@6zL<=7$SO}+HYV#Q<&6VEw?7T<4S!n$Jnea$ z6d(T@1GXCehEEmCF0YMkSue9vQ$I^0T#JoDj*EMHo`G&UJr&q|3Bry99#@0dyg7`Q z!!yXBNkph!tq(S4((v)=Wm8>uk#p&0$nc41YvL&1K!QXY1yXSI6S?K|8)OjG&Lt(K ztId3bXg^mFyf?BL6fOd@b=+yZcj4;?3rok7lQl(Cw}PG+e#F*qMF!j23fql&hZL7P zG>a}7G6x8#(t0XhGVyUA?r&*w;QY!r@YKUTL<==Wf+qQBKt@!@Rr&DI_c|i!%4a4h zkr68fzD60wyYPSLxr+1f~ou#j~{qj_}l8n*d{JEQGnlIv$mXluusavj3 zMtScsZR3Ax6NW4W{J{%J-(o*v@G#Wr!S&&ZD*FL|A9ZfKIu*Faw8OQJRVLX%hfwfD zRiwJC(YJv~V48qg4tmWuIel4Hxw8pUh1_qN*k6@wur7=4Ecupf`X_LFzuT$E8)QCW z5Qn?ja2g#ADQN0-Y&(q~w8#hcp?NyPqhL)H;=Y@7I|fybj|${n#2hcyq;z@5o*Ir& zFn5f5lc?(lDTY1xNc4o%U6VVE{(T#41CcX%0~zWG85?JlJt_E~L0MG`0ays6x^>xn ziHVBoC5dK^iP#;TMob8k#6hicQk8*FWzqS;D+NGIyuFUf@!JEpqdx3W@GJ{*^saGk zz+?O1b9x{^ zbmg(v$`LF+hOoRG_^@yf;UNV609^c^8!9~D!~XZkhnr5;LEL8+i!FbK1oZd88e9VJ z`1p{~ss&HSwf^4Lx_uStuir-#lu$A}9v#?4^>|YW8*g-AH_oT^%oACc3+|t7cBZ&( zc+g*~QeP3=d5%Y~* zApPn?Gnn&3*Jb5N_z%#`XwFcZZ!Gu}=v0t!>D$U<@6qcvr$+@jbwo{^W^0q6jQTj) zxzZu?=F(GER-TAw``GhiNib+fIbx9QDG#o2gsbOE2)n+CBWeeO7ec>qD?q=IV|hDo zCbaS?$NTF;w=S{H-BA&rPyXqXx_|YIrpxof1I`au&nP5%IBl;uJfd2^CtebNY{5R> zj5$`7FXomDxiT}y+^KAB@;`~(=S5Xwzqu(S7bjd#xTEyDkj<}}dv~mEliuTbp|bw3 z)*HjA7fyLRkIL2lMSEzup*`5Qy-PQCXa)ZX7sPQm zSvecO#~Vu*-EHwE5R)2>s)Nm~Hg^+DbE}dh?b?JFGAX_Hk`~e!f6i>OwsHy9j+?<2 zM&9U^+)|~ddCLkKZc%y5k{YhTbd2zizj>JbNy?(v2qN9rhQnEr;TN3n0}u~fO}-V* z90xuE%B!4SwtMjHU$c)DX-P&GcGz>b;{MkfqSUi2rzIO}CV%nmrmD+KBF zuXO>70pT3nK#85h+PnxgnT)+Cu1704p2*dU`DS!;8&!|#vI~qS_$jlcl0sa)|1;uF zlu4X`(Ew-Q0d!wr2iyPwdBOn78D6KE{=cJ(Kpcr8$f!bwpOE(wwt@*vNB{`1Gsw&0 z;DfxLg}E^Sc(Y^)Z|t=yISun!llPaj9rgSM0)MbC=WgC(E?ZibzF{<0P=1%q^OBM6 z2{DOyS+nVw`9N7N^taDibVS`f1M6DbQrsjtzv{{27zWq&2RExpnpYlhFz_r_kb5s;yPhl#fmv+iTwpoMoPRZMc2&Qky#fO*ckS@TM(%DIY=c$;)>hRF7(UB;;zeHT7lC3jCM~)3BD$f-IYP3v>cnV)viI0F!qTVlCzd3=Vku;;RLO!iy6W8{c1v6_+eT;N2xm*!&%fWT* z1&Itu7=A90P)+mAXZriLe1mHF#`0`hQLvm*Pq~=Uwl-ipb#}hm&2=ND>$=QmgYpee z9{-G8m1!vF6=8o*J{k4dw6D`=F)Z1Y(vmKiF65v-NY_%hV8x2z46%6%b}}55eSqM~zL^bHp09!$!4VX< zMfZ}@0|Drr6fg|329?g*jGOUPT#CH5vc@HJcSATmu;5-^iA67*>0Rd#`d%9xp-%~# z04E5snbiN$>$*{3cR-b_vw}<^-;#n@12zok-4<}R4NQ(a>_KP=@-%0IO#*)yQC(1P zAdjHDzy`s-$`T?#IUx7dHo7L*d!XDS8F4dYg~dY^JY?EKggL%U0>%uLHUamm$Xl_2 zKVuPy{J;}5U5^2A%)Cg4y^(ISxhff71DXtmiU*vcrqlv!c&)+;0F8%Vk0?C`KseaJ z;AB29ASoabjY3)$eU<&ZO&szQEGcc99Qdc=5~IpUk}ueX$#;~v_MG^*m|-VyHo>L` zofv$kNPKS+C!v}}*#k8~IzCIgcs3FRomqG;^Ektfk1<9_|II{*R+D;F&%3?3B)XLo z*>rt#IX73RV{VGhOA))7>igJICl-XS?d@Q0u6|EmqX9?YEKRcE{F9OcLna~B?a^Q} zOW!N19IZ*N-$EQiPV6RqLsFA4Kb7I1e|wGFC|syFf?^u%Vu>LPTj5MFnP;R6fSPKU zOqYkH`@uRkxay&it(Dd`GN}>oO+;PWZs--gPOB@Q#L?<9=g(eaB($l#QG>5B%DNEj zdopd=FW$m%%Zm|w4%U4RFeS+^rWy-)6sv1FDCFtB(Q5auq&W!gH@!$=vx0S9lco| zHZ~TR8dQ*Fz7+9ovFxGH0ZOsXp*0SH+HooEUPt6MM&yoX{=nH(0=1(EUBi1-AefC<0!RT`LI5g6A4NPDpXywNu>B`- zayW$r;JFp!U7=`<3!doc$2G`Uf~`8pUIunZG@s7=m4SqiXln0Dy1k5+JxtNgBQvsv zUN8rMOknLMEFy6zB9{dN*9S2P!@9KDIC`m=d9_w8d$m@uCc9&xyh1ZhmUOlG#;oSL)~A8Er`)@r z{z2u4Q^?Uv?L#fXu0PiH%6ZQ)pMU+-TsIw*!v9qN3M(8usxNSiIzMnOJq+`axVatt z;YaL^5t=9y7S09P50No{(9ZQ`r}%89?$f``*Po7_ZD&b(M7Hix(G$FNj+BkKRHP_F zIBUnYVlYH|%|;}BBYs!??yNyDJ^x~0+r!=OpY>EkepQ#??jE=|Z_ZxUvs* zi_9LR8DY!E<9?oM=cwG&HK}cvvegKcUq~~uy%@qZoNG!BK1u@% 
zg=jC<^uJtZTR^z%98Q&8?Mo=GFFkbs2bE^|nEQ#rW7C?q-LYG7eea1n_l zmHH9RoJgtuVaXP42;-?TF8eTkSR}Jlx>KK0bdL$VN*ltMK^R{Em<9(M`@l@_6ymZW zcmv_cNMdAnvK||DBzuT^3g8FQX+Zh|S1{Q=6v4t!JV+ko39uLyD2|Ms3Jlmz4yFfOb z1bAfnahVXo}N3tZ;=Adqy8`KL1s%mgB>)3nuKcBy;`w!l{xxD58IhdFv9 zbf+P;BX;B)tfTKeygnP8s1y+zOriA5iR0!b(@15ya^FP_txei&I2RH6J720S!mUx8 zaZ4tQs_Bewom7umcA~i=uE`SH=XH0TcTf>30W`m|al__yVOWH=rA6oMXs9Gu;US)g zwvS9id$A{uoYj7PJLarTmTP>yH%lSAR(+7(U1M+J^8P8Aj=FV817ph|QDbB4U6vZQ z`fnFQ5=4XPi?CNM*XRO!D$}p6yqjKg9gpveoEjV$k6(=P`sSxR*=3^X!Lbrwz1_D+ zM{TxH_AR!%dVBJ!c5kZIrhpYrnPOZ`QDlu(=_@b#|J8IgFimA?^p$RlK%quU%Sy*7 zv&y!Bog$<{@Tb7C{3-}DLfm$SHtMv7x)fK*_?H4wq}?z@VL(SG1qK( zm0^(7pG~mUVM_!xOEnpd8^)O3b6+(ruea~rzJ`0g``vr{&N&Y!t38d~!gYOT^;!-^!Afk>b==L;bylk7c9J7GiJzxljE_^?FGl zIL?pt%t4lCLQMLf&sOrb@K*+3FC~mUKc#v-AwEK4P(ngb5wog{Pu8&{Of+ zm806Y569%*u%L~pr=<8q8RLBg%CGQ{PqO26U=4#~3;EDI9IhxQq@dxo&~6$EwBX$u z6l@7307S9*iY=*=ir zALFe){p#lvv)aMsCUqaad`fu{k(OT9ITO$AlkR-0_~ni?n{0(mZ+e&i;tCJu{?s?Q zP5WtAzUW;qSK3AnKCcURHh8aq5H2alhgjAU;MwGRX=r}f&$vj_w zwz%V%`=rB_Kb=vC;&>}gCc9A$?R`rGaRDW1eKOTsl#*Ki_8v9&Qi0z)II7piO=Zw4 zXVg^Mzb3pd!uD2{i98I9yG-5+$Oy^KAfF{YX~I-#E`|H#c_xq0IOa~N&MlGhl(T+f7@nwr-Pm&dW`L|=+|EZm)1>n%-cD|^$p{Fy7)7mgb#ZLuNowEV!8c8T9z`u=m_#WxM4+Jns_nYq4} zuHudpUKI0#tY4ljaQClOp+e8X+nqSk=6~*2_5_(Ehn?`skCI)gj=B~{bcZWH$lNRJ z39=sMOTO?TipddI{)}`gGk+#tSNF|%?Au!FFYd^6>q+RLbd5y&Rs)&T8B&3Ez^uIZ zrg0uwa=g?0G!iZ%u0D;c`A39q)H(Vc3S>Nb{1UMpTttamG-eokHuyq(Bt^j6lFahV zDU`SvK&OkB|FzW%*V)3u?aU@Go?$KaEXxh!M>nidH;jVg2`CE=i86ezOT~oXOJ9%I zcoV@Q#+6;H5oo$O2!O2j^e9?`MF`#FaH0cAmMW8h9Eed7uG zW%pMHA>R@P*boR0iH!WOaQO&?h;37@a$w-a}B<4q^yE2OhSBXXpp2pGNn0HlYup z-~j9xGHU&KWCwvE1qkFd&)~z$ZhdmNOyBE~uzkBECGvYi%D`}^xdH)~+)dtr&S`dr z04mV6k~ht|sw<~<0SjEf=F)ZW3sNj{(yleuljTE&t>*6phE&*HLa;6K6jlP%N@AkZ zyxr_{R9$Hn!3t`;!E!E_;}r?GS)`#QGGGIt ze{pt)5khlFNej0Qn5@B|!%&jbu$@|}NDP=GoX!EYTq>D`-Hyho3AvMcHcDaJ5g5L3 zs;2Dk73+r%-%l7i{I9BS-TL1=QaOICQ8^}^`hF>xRe&fAJu(%>L)YRV?!=($)Y~#a zfSrybFZ>wwVXpwou3EoB2{NI{7Zw<{z5p8Jon^S4^zfK@P+ly_qWU zp(&{d2h@*>b|nQ0>WO=_3_?R@x=E_zq$#2ZCbBz}6nNN0cbJ0os{+9xe%$C~YV{rOkH_mN3&P7rG7gTUr)DrIny8P6v4w6j4?+odUB<)P$3T^JkZ67OjM} zL<`u%tnYgiZrTO7XigrewNyXz5-g3HiGj+57!rPG|MuH(e>LEVF%@npG&D7MnV&zq zNf7%7AY`2-t|`I(Vok8WROak2R+}Sglt(LuHkf>S9?8AQ$e{)_cC^ET2~Y?Qxa#>y z3Hdb~CA2$AK%fEjiI0p)5`#_@E%O&#G5I2$n)X7IB(|1V!;~il%#k6lsHl<^mnG1D z+b2Aq&=ycYVoe@PnUnO#?vrJzTkYs9<-5t9fAW+?r_ZcvuaAU1E@^Sx>^0nguP9KT z%jFaE(nYH~o#jDO+DX&vsC^A4E#qJT>oF4C0UIk?F%ACLzri&o4+ff)JQ}-O$cUpm zg;59eA(U1~{wiMSfOI*e&x8n#@Rm2ghv5TpjEq9W%;_d1cWC6BFhGF1n8ZD4<=i0@ zvQt9MyP?gqQ4zUkh;ICD=id_AJZWg|Rv1yG8cIwsr!|nO>rIl=kT3<2?yB)R53LPm zK_X0Y#T40KK1?RTw55rj3GS>3Rwj**j@dl8jSr{T34I@0)<|aXar=ydbmRXX{vWqHp*#Qp literal 0 HcmV?d00001 diff --git a/candle-examples/examples/stable-diffusion-3/clip.rs b/candle-examples/examples/stable-diffusion-3/clip.rs new file mode 100644 index 0000000000..77263d968c --- /dev/null +++ b/candle-examples/examples/stable-diffusion-3/clip.rs @@ -0,0 +1,201 @@ +use anyhow::{Error as E, Ok, Result}; +use candle::{DType, IndexOp, Module, Tensor, D}; +use candle_transformers::models::{stable_diffusion, t5}; +use tokenizers::tokenizer::Tokenizer; + +struct ClipWithTokenizer { + clip: stable_diffusion::clip::ClipTextTransformer, + config: stable_diffusion::clip::Config, + tokenizer: Tokenizer, + max_position_embeddings: usize, +} + +impl ClipWithTokenizer { + fn new( + vb: candle_nn::VarBuilder, + config: stable_diffusion::clip::Config, + tokenizer_path: &str, + max_position_embeddings: usize, + ) -> Result { + let clip = stable_diffusion::clip::ClipTextTransformer::new(vb, &config)?; + let path_buf = 
hf_hub::api::sync::Api::new()? + .model(tokenizer_path.to_string()) + .get("tokenizer.json")?; + let tokenizer = Tokenizer::from_file(path_buf.to_str().ok_or(E::msg( + "Failed to serialize huggingface PathBuf of CLIP tokenizer", + ))?) + .map_err(E::msg)?; + Ok(Self { + clip, + config, + tokenizer, + max_position_embeddings, + }) + } + + fn encode_text_to_embedding( + &self, + prompt: &str, + device: &candle::Device, + ) -> Result<(Tensor, Tensor)> { + let pad_id = match &self.config.pad_with { + Some(padding) => *self + .tokenizer + .get_vocab(true) + .get(padding.as_str()) + .ok_or(E::msg("Failed to tokenize CLIP padding."))?, + None => *self + .tokenizer + .get_vocab(true) + .get("<|endoftext|>") + .ok_or(E::msg("Failed to tokenize CLIP end-of-text."))?, + }; + + let mut tokens = self + .tokenizer + .encode(prompt, true) + .map_err(E::msg)? + .get_ids() + .to_vec(); + + let eos_position = tokens.len() - 1; + + while tokens.len() < self.max_position_embeddings { + tokens.push(pad_id) + } + let tokens = Tensor::new(tokens.as_slice(), device)?.unsqueeze(0)?; + let (text_embeddings, text_embeddings_penultimate) = self + .clip + .forward_until_encoder_layer(&tokens, usize::MAX, -2)?; + let text_embeddings_pooled = text_embeddings.i((0, eos_position, ..))?; + + Ok((text_embeddings_penultimate, text_embeddings_pooled)) + } +} + +struct T5WithTokenizer { + t5: t5::T5EncoderModel, + tokenizer: Tokenizer, + max_position_embeddings: usize, +} + +impl T5WithTokenizer { + fn new(vb: candle_nn::VarBuilder, max_position_embeddings: usize) -> Result { + let api = hf_hub::api::sync::Api::new()?; + let repo = api.repo(hf_hub::Repo::with_revision( + "google/t5-v1_1-xxl".to_string(), + hf_hub::RepoType::Model, + "refs/pr/2".to_string(), + )); + let config_filename = repo.get("config.json")?; + let config = std::fs::read_to_string(config_filename)?; + let config: t5::Config = serde_json::from_str(&config)?; + let model = t5::T5EncoderModel::load(vb, &config)?; + + let tokenizer_filename = api + .model("lmz/mt5-tokenizers".to_string()) + .get("t5-v1_1-xxl.tokenizer.json")?; + + let tokenizer = Tokenizer::from_file(tokenizer_filename).map_err(E::msg)?; + Ok(Self { + t5: model, + tokenizer, + max_position_embeddings, + }) + } + + fn encode_text_to_embedding( + &mut self, + prompt: &str, + device: &candle::Device, + ) -> Result { + let mut tokens = self + .tokenizer + .encode(prompt, true) + .map_err(E::msg)? 
+ .get_ids() + .to_vec(); + tokens.resize(self.max_position_embeddings, 0); + let input_token_ids = Tensor::new(&tokens[..], device)?.unsqueeze(0)?; + let embeddings = self.t5.forward(&input_token_ids)?; + Ok(embeddings) + } +} + +pub struct StableDiffusion3TripleClipWithTokenizer { + clip_l: ClipWithTokenizer, + clip_g: ClipWithTokenizer, + clip_g_text_projection: candle_nn::Linear, + t5: T5WithTokenizer, +} + +impl StableDiffusion3TripleClipWithTokenizer { + pub fn new(vb_fp16: candle_nn::VarBuilder, vb_fp32: candle_nn::VarBuilder) -> Result { + let max_position_embeddings = 77usize; + let clip_l = ClipWithTokenizer::new( + vb_fp16.pp("clip_l.transformer"), + stable_diffusion::clip::Config::sdxl(), + "openai/clip-vit-large-patch14", + max_position_embeddings, + )?; + + let clip_g = ClipWithTokenizer::new( + vb_fp16.pp("clip_g.transformer"), + stable_diffusion::clip::Config::sdxl2(), + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", + max_position_embeddings, + )?; + + let text_projection = candle_nn::linear_no_bias( + 1280, + 1280, + vb_fp16.pp("clip_g.transformer.text_projection"), + )?; + + // Current T5 implementation does not support fp16, so we use fp32 VarBuilder for T5. + // This is a temporary workaround until the T5 implementation is updated to support fp16. + // Also see: + // https://github.com/huggingface/candle/issues/2480 + // https://github.com/huggingface/candle/pull/2481 + let t5 = T5WithTokenizer::new(vb_fp32.pp("t5xxl.transformer"), max_position_embeddings)?; + + Ok(Self { + clip_l, + clip_g, + clip_g_text_projection: text_projection, + t5, + }) + } + + pub fn encode_text_to_embedding( + &mut self, + prompt: &str, + device: &candle::Device, + ) -> Result<(Tensor, Tensor)> { + let (clip_l_embeddings, clip_l_embeddings_pooled) = + self.clip_l.encode_text_to_embedding(prompt, device)?; + let (clip_g_embeddings, clip_g_embeddings_pooled) = + self.clip_g.encode_text_to_embedding(prompt, device)?; + + let clip_g_embeddings_pooled = self + .clip_g_text_projection + .forward(&clip_g_embeddings_pooled.unsqueeze(0)?)? + .squeeze(0)?; + + let y = Tensor::cat(&[&clip_l_embeddings_pooled, &clip_g_embeddings_pooled], 0)? + .unsqueeze(0)?; + let clip_embeddings_concat = Tensor::cat( + &[&clip_l_embeddings, &clip_g_embeddings], + D::Minus1, + )? + .pad_with_zeros(D::Minus1, 0, 2048)?; + + let t5_embeddings = self + .t5 + .encode_text_to_embedding(prompt, device)? + .to_dtype(DType::F16)?; + let context = Tensor::cat(&[&clip_embeddings_concat, &t5_embeddings], D::Minus2)?; + + Ok((context, y)) + } +} diff --git a/candle-examples/examples/stable-diffusion-3/main.rs b/candle-examples/examples/stable-diffusion-3/main.rs new file mode 100644 index 0000000000..164ae4205b --- /dev/null +++ b/candle-examples/examples/stable-diffusion-3/main.rs @@ -0,0 +1,185 @@ +mod clip; +mod sampling; +mod vae; + +use candle::{DType, IndexOp, Tensor}; +use candle_transformers::models::mmdit::model::{Config as MMDiTConfig, MMDiT}; + +use crate::clip::StableDiffusion3TripleClipWithTokenizer; +use crate::vae::{build_sd3_vae_autoencoder, sd3_vae_vb_rename}; + +use anyhow::{Ok, Result}; +use clap::Parser; + +#[derive(Parser)] +#[command(author, version, about, long_about = None)] +struct Args { + /// The prompt to be used for image generation. 
+ #[arg( + long, + default_value = "A cute rusty robot holding a candle torch in its hand, \ + with glowing neon text \"LETS GO RUSTY\" displayed on its chest, \ + bright background, high quality, 4k" + )] + prompt: String, + + #[arg(long, default_value = "")] + uncond_prompt: String, + + /// Run on CPU rather than on GPU. + #[arg(long)] + cpu: bool, + + /// The CUDA device ID to use. + #[arg(long, default_value = "0")] + cuda_device_id: usize, + + /// Enable tracing (generates a trace-timestamp.json file). + #[arg(long)] + tracing: bool, + + /// Use flash_attn to accelerate attention operation in the MMDiT. + #[arg(long)] + use_flash_attn: bool, + + /// The height in pixels of the generated image. + #[arg(long, default_value_t = 1024)] + height: usize, + + /// The width in pixels of the generated image. + #[arg(long, default_value_t = 1024)] + width: usize, + + /// The seed to use when generating random samples. + #[arg(long, default_value_t = 28)] + num_inference_steps: usize, + + // CFG scale. + #[arg(long, default_value_t = 4.0)] + cfg_scale: f64, + + // Time shift factor (alpha). + #[arg(long, default_value_t = 3.0)] + time_shift: f64, + + /// The seed to use when generating random samples. + #[arg(long)] + seed: Option, +} + +fn main() -> Result<()> { + let args = Args::parse(); + // Your main code here + run(args) +} + +fn run(args: Args) -> Result<()> { + use tracing_chrome::ChromeLayerBuilder; + use tracing_subscriber::prelude::*; + + let Args { + prompt, + uncond_prompt, + cpu, + cuda_device_id, + tracing, + use_flash_attn, + height, + width, + num_inference_steps, + cfg_scale, + time_shift, + seed, + } = args; + + let _guard = if tracing { + let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); + tracing_subscriber::registry().with(chrome_layer).init(); + Some(guard) + } else { + None + }; + + // TODO: Support and test on Metal. + let device = if cpu { + candle::Device::Cpu + } else { + candle::Device::cuda_if_available(cuda_device_id)? + }; + + let api = hf_hub::api::sync::Api::new()?; + let sai_repo = { + let name = "stabilityai/stable-diffusion-3-medium"; + api.repo(hf_hub::Repo::model(name.to_string())) + }; + let model_file = sai_repo.get("sd3_medium_incl_clips_t5xxlfp16.safetensors")?; + let vb_fp16 = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[model_file.clone()], DType::F16, &device)? + }; + + let (context, y) = { + let vb_fp32 = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors( + &[model_file.clone()], + DType::F32, + &device, + )? + }; + let mut triple = StableDiffusion3TripleClipWithTokenizer::new( + vb_fp16.pp("text_encoders"), + vb_fp32.pp("text_encoders"), + )?; + let (context, y) = triple.encode_text_to_embedding(prompt.as_str(), &device)?; + let (context_uncond, y_uncond) = + triple.encode_text_to_embedding(uncond_prompt.as_str(), &device)?; + ( + Tensor::cat(&[context, context_uncond], 0)?, + Tensor::cat(&[y, y_uncond], 0)?, + ) + }; + + let x = { + let mmdit = MMDiT::new( + &MMDiTConfig::sd3_medium(), + use_flash_attn, + vb_fp16.pp("model.diffusion_model"), + )?; + + if let Some(seed) = seed { + device.set_seed(seed)?; + } + let start_time = std::time::Instant::now(); + let x = sampling::euler_sample( + &mmdit, + &y, + &context, + num_inference_steps, + cfg_scale, + time_shift, + height, + width, + )?; + let dt = start_time.elapsed().as_secs_f32(); + println!( + "Sampling done. {num_inference_steps} steps. {:.2}s. 
Average rate: {:.2} iter/s", + dt, + num_inference_steps as f32 / dt + ); + x + }; + + let img = { + let vb_vae = vb_fp16 + .clone() + .rename_f(sd3_vae_vb_rename) + .pp("first_stage_model"); + let autoencoder = build_sd3_vae_autoencoder(vb_vae)?; + + // Apply TAESD3 scale factor. Seems to be significantly improving the quality of the image. + // https://github.com/comfyanonymous/ComfyUI/blob/3c60ecd7a83da43d694e26a77ca6b93106891251/nodes.py#L721-L723 + autoencoder.decode(&((x.clone() / 1.5305)? + 0.0609)?)? + }; + let img = ((img.clamp(-1f32, 1f32)? + 1.0)? * 127.5)?.to_dtype(candle::DType::U8)?; + candle_examples::save_image(&img.i(0)?, "out.jpg")?; + Ok(()) +} diff --git a/candle-examples/examples/stable-diffusion-3/sampling.rs b/candle-examples/examples/stable-diffusion-3/sampling.rs new file mode 100644 index 0000000000..147d8e7380 --- /dev/null +++ b/candle-examples/examples/stable-diffusion-3/sampling.rs @@ -0,0 +1,55 @@ +use anyhow::{Ok, Result}; +use candle::{DType, Tensor}; + +use candle_transformers::models::flux; +use candle_transformers::models::mmdit::model::MMDiT; // for the get_noise function + +#[allow(clippy::too_many_arguments)] +pub fn euler_sample( + mmdit: &MMDiT, + y: &Tensor, + context: &Tensor, + num_inference_steps: usize, + cfg_scale: f64, + time_shift: f64, + height: usize, + width: usize, +) -> Result { + let mut x = flux::sampling::get_noise(1, height, width, y.device())?.to_dtype(DType::F16)?; + let sigmas = (0..=num_inference_steps) + .map(|x| x as f64 / num_inference_steps as f64) + .rev() + .map(|x| time_snr_shift(time_shift, x)) + .collect::>(); + + for window in sigmas.windows(2) { + let (s_curr, s_prev) = match window { + [a, b] => (a, b), + _ => continue, + }; + + let timestep = (*s_curr) * 1000.0; + let noise_pred = mmdit.forward( + &Tensor::cat(&[x.clone(), x.clone()], 0)?, + &Tensor::full(timestep, (2,), x.device())?.contiguous()?, + y, + context, + )?; + x = (x + (apply_cfg(cfg_scale, &noise_pred)? * (*s_prev - *s_curr))?)?; + } + Ok(x) +} + +// The "Resolution-dependent shifting of timestep schedules" recommended in the SD3 tech report paper +// https://arxiv.org/pdf/2403.03206 +// Following the implementation in ComfyUI: +// https://github.com/comfyanonymous/ComfyUI/blob/3c60ecd7a83da43d694e26a77ca6b93106891251/ +// comfy/model_sampling.py#L181 +fn time_snr_shift(alpha: f64, t: f64) -> f64 { + alpha * t / (1.0 + (alpha - 1.0) * t) +} + +fn apply_cfg(cfg_scale: f64, noise_pred: &Tensor) -> Result { + Ok(((cfg_scale * noise_pred.narrow(0, 0, 1)?)? + - ((cfg_scale - 1.0) * noise_pred.narrow(0, 1, 1)?)?)?) +} diff --git a/candle-examples/examples/stable-diffusion-3/vae.rs b/candle-examples/examples/stable-diffusion-3/vae.rs new file mode 100644 index 0000000000..708e472eff --- /dev/null +++ b/candle-examples/examples/stable-diffusion-3/vae.rs @@ -0,0 +1,93 @@ +use anyhow::{Ok, Result}; +use candle_transformers::models::stable_diffusion::vae; + +pub fn build_sd3_vae_autoencoder(vb: candle_nn::VarBuilder) -> Result { + let config = vae::AutoEncoderKLConfig { + block_out_channels: vec![128, 256, 512, 512], + layers_per_block: 2, + latent_channels: 16, + norm_num_groups: 32, + use_quant_conv: false, + use_post_quant_conv: false, + }; + Ok(vae::AutoEncoderKL::new(vb, 3, 3, config)?) 
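+    // Note: the two `3`s here are the image-space input/output channel counts (RGB);
+    // the latent space itself is 16 channels, as set by `latent_channels` in the config above.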
+} + +pub fn sd3_vae_vb_rename(name: &str) -> String { + let parts: Vec<&str> = name.split('.').collect(); + let mut result = Vec::new(); + let mut i = 0; + + while i < parts.len() { + match parts[i] { + "down_blocks" => { + result.push("down"); + } + "mid_block" => { + result.push("mid"); + } + "up_blocks" => { + result.push("up"); + match parts[i + 1] { + // Reverse the order of up_blocks. + "0" => result.push("3"), + "1" => result.push("2"), + "2" => result.push("1"), + "3" => result.push("0"), + _ => {} + } + i += 1; // Skip the number after up_blocks. + } + "resnets" => { + if i > 0 && parts[i - 1] == "mid_block" { + match parts[i + 1] { + "0" => result.push("block_1"), + "1" => result.push("block_2"), + _ => {} + } + i += 1; // Skip the number after resnets. + } else { + result.push("block"); + } + } + "downsamplers" => { + result.push("downsample"); + i += 1; // Skip the 0 after downsamplers. + } + "conv_shortcut" => { + result.push("nin_shortcut"); + } + "attentions" => { + if parts[i + 1] == "0" { + result.push("attn_1") + } + i += 1; // Skip the number after attentions. + } + "group_norm" => { + result.push("norm"); + } + "query" => { + result.push("q"); + } + "key" => { + result.push("k"); + } + "value" => { + result.push("v"); + } + "proj_attn" => { + result.push("proj_out"); + } + "conv_norm_out" => { + result.push("norm_out"); + } + "upsamplers" => { + result.push("upsample"); + i += 1; // Skip the 0 after upsamplers. + } + part => result.push(part), + } + i += 1; + } + result.join(".") +} diff --git a/candle-transformers/src/models/mmdit/blocks.rs b/candle-transformers/src/models/mmdit/blocks.rs index e2b924a013..a1777f915b 100644 --- a/candle-transformers/src/models/mmdit/blocks.rs +++ b/candle-transformers/src/models/mmdit/blocks.rs @@ -194,10 +194,16 @@ pub struct JointBlock { x_block: DiTBlock, context_block: DiTBlock, num_heads: usize, + use_flash_attn: bool, } impl JointBlock { - pub fn new(hidden_size: usize, num_heads: usize, vb: nn::VarBuilder) -> Result { + pub fn new( + hidden_size: usize, + num_heads: usize, + use_flash_attn: bool, + vb: nn::VarBuilder, + ) -> Result { let x_block = DiTBlock::new(hidden_size, num_heads, vb.pp("x_block"))?; let context_block = DiTBlock::new(hidden_size, num_heads, vb.pp("context_block"))?; @@ -205,13 +211,15 @@ impl JointBlock { x_block, context_block, num_heads, + use_flash_attn, }) } pub fn forward(&self, context: &Tensor, x: &Tensor, c: &Tensor) -> Result<(Tensor, Tensor)> { let (context_qkv, context_interm) = self.context_block.pre_attention(context, c)?; let (x_qkv, x_interm) = self.x_block.pre_attention(x, c)?; - let (context_attn, x_attn) = joint_attn(&context_qkv, &x_qkv, self.num_heads)?; + let (context_attn, x_attn) = + joint_attn(&context_qkv, &x_qkv, self.num_heads, self.use_flash_attn)?; let context_out = self.context_block .post_attention(&context_attn, context, &context_interm)?; @@ -224,16 +232,23 @@ pub struct ContextQkvOnlyJointBlock { x_block: DiTBlock, context_block: QkvOnlyDiTBlock, num_heads: usize, + use_flash_attn: bool, } impl ContextQkvOnlyJointBlock { - pub fn new(hidden_size: usize, num_heads: usize, vb: nn::VarBuilder) -> Result { + pub fn new( + hidden_size: usize, + num_heads: usize, + use_flash_attn: bool, + vb: nn::VarBuilder, + ) -> Result { let x_block = DiTBlock::new(hidden_size, num_heads, vb.pp("x_block"))?; let context_block = QkvOnlyDiTBlock::new(hidden_size, num_heads, vb.pp("context_block"))?; Ok(Self { x_block, context_block, num_heads, + use_flash_attn, }) } @@ -241,7 +256,7 @@ impl 
ContextQkvOnlyJointBlock { let context_qkv = self.context_block.pre_attention(context, c)?; let (x_qkv, x_interm) = self.x_block.pre_attention(x, c)?; - let (_, x_attn) = joint_attn(&context_qkv, &x_qkv, self.num_heads)?; + let (_, x_attn) = joint_attn(&context_qkv, &x_qkv, self.num_heads, self.use_flash_attn)?; let x_out = self.x_block.post_attention(&x_attn, x, &x_interm)?; Ok(x_out) @@ -266,7 +281,28 @@ fn flash_compatible_attention( attn_scores.reshape(q_dims_for_matmul)?.transpose(1, 2) } -fn joint_attn(context_qkv: &Qkv, x_qkv: &Qkv, num_heads: usize) -> Result<(Tensor, Tensor)> { +#[cfg(feature = "flash-attn")] +fn flash_attn( + q: &Tensor, + k: &Tensor, + v: &Tensor, + softmax_scale: f32, + causal: bool, +) -> Result { + candle_flash_attn::flash_attn(q, k, v, softmax_scale, causal) +} + +#[cfg(not(feature = "flash-attn"))] +fn flash_attn(_: &Tensor, _: &Tensor, _: &Tensor, _: f32, _: bool) -> Result { + unimplemented!("compile with '--features flash-attn'") +} + +fn joint_attn( + context_qkv: &Qkv, + x_qkv: &Qkv, + num_heads: usize, + use_flash_attn: bool, +) -> Result<(Tensor, Tensor)> { let qkv = Qkv { q: Tensor::cat(&[&context_qkv.q, &x_qkv.q], 1)?, k: Tensor::cat(&[&context_qkv.k, &x_qkv.k], 1)?, @@ -282,8 +318,12 @@ fn joint_attn(context_qkv: &Qkv, x_qkv: &Qkv, num_heads: usize) -> Result<(Tenso let headdim = qkv.q.dim(D::Minus1)?; let softmax_scale = 1.0 / (headdim as f64).sqrt(); - // let attn: Tensor = candle_flash_attn::flash_attn(&qkv.q, &qkv.k, &qkv.v, softmax_scale as f32, false)?; - let attn = flash_compatible_attention(&qkv.q, &qkv.k, &qkv.v, softmax_scale as f32)?; + + let attn = if use_flash_attn { + flash_attn(&qkv.q, &qkv.k, &qkv.v, softmax_scale as f32, false)? + } else { + flash_compatible_attention(&qkv.q, &qkv.k, &qkv.v, softmax_scale as f32)? 
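+        // Note (illustrative): `flash_attn` above is only backed by a real kernel when the
+        // crate is built with `--features flash-attn` (CUDA-only); otherwise the stub panics
+        // via `unimplemented!`, so the flag should stay false on other backends. The
+        // `flash_compatible_attention` fallback computes the same scaled-dot-product
+        // attention, e.g. for head_dim = 64 both paths use a softmax scale of
+        // 1 / sqrt(64) = 0.125.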
+ }; let attn = attn.reshape((batch_size, seqlen, ()))?; let context_qkv_seqlen = context_qkv.q.dim(1)?; diff --git a/candle-transformers/src/models/mmdit/model.rs b/candle-transformers/src/models/mmdit/model.rs index 1523836c7f..864b662377 100644 --- a/candle-transformers/src/models/mmdit/model.rs +++ b/candle-transformers/src/models/mmdit/model.rs @@ -23,7 +23,7 @@ pub struct Config { } impl Config { - pub fn sd3() -> Self { + pub fn sd3_medium() -> Self { Self { patch_size: 2, in_channels: 16, @@ -49,7 +49,7 @@ pub struct MMDiT { } impl MMDiT { - pub fn new(cfg: &Config, vb: nn::VarBuilder) -> Result { + pub fn new(cfg: &Config, use_flash_attn: bool, vb: nn::VarBuilder) -> Result { let hidden_size = cfg.head_size * cfg.depth; let core = MMDiTCore::new( cfg.depth, @@ -57,6 +57,7 @@ impl MMDiT { cfg.depth, cfg.patch_size, cfg.out_channels, + use_flash_attn, vb.clone(), )?; let patch_embedder = PatchEmbedder::new( @@ -135,6 +136,7 @@ impl MMDiTCore { num_heads: usize, patch_size: usize, out_channels: usize, + use_flash_attn: bool, vb: nn::VarBuilder, ) -> Result { let mut joint_blocks = Vec::with_capacity(depth - 1); @@ -142,6 +144,7 @@ impl MMDiTCore { joint_blocks.push(JointBlock::new( hidden_size, num_heads, + use_flash_attn, vb.pp(format!("joint_blocks.{}", i)), )?); } @@ -151,6 +154,7 @@ impl MMDiTCore { context_qkv_only_joint_block: ContextQkvOnlyJointBlock::new( hidden_size, num_heads, + use_flash_attn, vb.pp(format!("joint_blocks.{}", depth - 1)), )?, final_layer: FinalLayer::new( diff --git a/candle-transformers/src/models/mmdit/projections.rs b/candle-transformers/src/models/mmdit/projections.rs index 1077398f5c..dc1e8ec941 100644 --- a/candle-transformers/src/models/mmdit/projections.rs +++ b/candle-transformers/src/models/mmdit/projections.rs @@ -42,7 +42,6 @@ pub struct QkvOnlyAttnProjections { impl QkvOnlyAttnProjections { pub fn new(dim: usize, num_heads: usize, vb: nn::VarBuilder) -> Result { - // {'dim': 1536, 'num_heads': 24} let head_dim = dim / num_heads; let qkv = nn::linear(dim, dim * 3, vb.pp("qkv"))?; Ok(Self { qkv, head_dim }) diff --git a/candle-transformers/src/models/stable_diffusion/attention.rs b/candle-transformers/src/models/stable_diffusion/attention.rs index 5cc59e8203..c04e6aa1ff 100644 --- a/candle-transformers/src/models/stable_diffusion/attention.rs +++ b/candle-transformers/src/models/stable_diffusion/attention.rs @@ -467,6 +467,24 @@ pub struct AttentionBlock { config: AttentionBlockConfig, } +// In the .safetensor weights of official Stable Diffusion 3 Medium Huggingface repo +// https://huggingface.co/stabilityai/stable-diffusion-3-medium +// Linear layer may use a different dimension for the weight in the linear, which is +// incompatible with the current implementation of the nn::linear constructor. +// This is a workaround to handle the different dimensions. +fn get_qkv_linear(channels: usize, vs: nn::VarBuilder) -> Result { + match vs.get((channels, channels), "weight") { + Ok(_) => nn::linear(channels, channels, vs), + Err(_) => { + let weight = vs + .get((channels, channels, 1, 1), "weight")? 
+ .reshape((channels, channels))?; + let bias = vs.get((channels,), "bias")?; + Ok(nn::Linear::new(weight, Some(bias))) + } + } +} + impl AttentionBlock { pub fn new(vs: nn::VarBuilder, channels: usize, config: AttentionBlockConfig) -> Result { let num_head_channels = config.num_head_channels.unwrap_or(channels); @@ -478,10 +496,10 @@ impl AttentionBlock { } else { ("query", "key", "value", "proj_attn") }; - let query = nn::linear(channels, channels, vs.pp(q_path))?; - let key = nn::linear(channels, channels, vs.pp(k_path))?; - let value = nn::linear(channels, channels, vs.pp(v_path))?; - let proj_attn = nn::linear(channels, channels, vs.pp(out_path))?; + let query = get_qkv_linear(channels, vs.pp(q_path))?; + let key = get_qkv_linear(channels, vs.pp(k_path))?; + let value = get_qkv_linear(channels, vs.pp(v_path))?; + let proj_attn = get_qkv_linear(channels, vs.pp(out_path))?; let span = tracing::span!(tracing::Level::TRACE, "attn-block"); Ok(Self { group_norm, diff --git a/candle-transformers/src/models/stable_diffusion/clip.rs b/candle-transformers/src/models/stable_diffusion/clip.rs index 5254818e60..2f631248bc 100644 --- a/candle-transformers/src/models/stable_diffusion/clip.rs +++ b/candle-transformers/src/models/stable_diffusion/clip.rs @@ -388,6 +388,37 @@ impl ClipTextTransformer { let xs = self.encoder.forward(&xs, &causal_attention_mask)?; self.final_layer_norm.forward(&xs) } + + pub fn forward_until_encoder_layer( + &self, + xs: &Tensor, + mask_after: usize, + until_layer: isize, + ) -> Result<(Tensor, Tensor)> { + let (bsz, seq_len) = xs.dims2()?; + let xs = self.embeddings.forward(xs)?; + let causal_attention_mask = + Self::build_causal_attention_mask(bsz, seq_len, mask_after, xs.device())?; + + let mut xs = xs.clone(); + let mut intermediate = xs.clone(); + + // Modified encoder.forward that returns the intermediate tensor along with final output. 
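+        // Negative `until_layer` values index from the end, mirroring the usual "clip skip"
+        // convention: e.g. for a hypothetical 12-layer encoder, `until_layer = -2` resolves
+        // to layer index 10, so `intermediate` holds the penultimate hidden state while the
+        // final output still passes through every layer and the final layer norm.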
+ let until_layer = if until_layer < 0 { + self.encoder.layers.len() as isize + until_layer + } else { + until_layer + } as usize; + + for (layer_id, layer) in self.encoder.layers.iter().enumerate() { + xs = layer.forward(&xs, &causal_attention_mask)?; + if layer_id == until_layer { + intermediate = xs.clone(); + } + } + + Ok((self.final_layer_norm.forward(&xs)?, intermediate)) + } } impl Module for ClipTextTransformer { diff --git a/candle-transformers/src/models/stable_diffusion/mod.rs b/candle-transformers/src/models/stable_diffusion/mod.rs index 30f239756c..37f4cdbf59 100644 --- a/candle-transformers/src/models/stable_diffusion/mod.rs +++ b/candle-transformers/src/models/stable_diffusion/mod.rs @@ -65,6 +65,8 @@ impl StableDiffusionConfig { layers_per_block: 2, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, }; let height = if let Some(height) = height { assert_eq!(height % 8, 0, "height has to be divisible by 8"); @@ -133,6 +135,8 @@ impl StableDiffusionConfig { layers_per_block: 2, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, }; let scheduler = Arc::new(ddim::DDIMSchedulerConfig { prediction_type, @@ -214,6 +218,8 @@ impl StableDiffusionConfig { layers_per_block: 2, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, }; let scheduler = Arc::new(ddim::DDIMSchedulerConfig { prediction_type, @@ -281,6 +287,8 @@ impl StableDiffusionConfig { layers_per_block: 2, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, }; let scheduler = Arc::new( euler_ancestral_discrete::EulerAncestralDiscreteSchedulerConfig { @@ -378,6 +386,8 @@ impl StableDiffusionConfig { layers_per_block: 2, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, }; let scheduler = Arc::new(ddim::DDIMSchedulerConfig { ..Default::default() diff --git a/candle-transformers/src/models/stable_diffusion/vae.rs b/candle-transformers/src/models/stable_diffusion/vae.rs index 670b3f5638..b3aba80277 100644 --- a/candle-transformers/src/models/stable_diffusion/vae.rs +++ b/candle-transformers/src/models/stable_diffusion/vae.rs @@ -275,6 +275,8 @@ pub struct AutoEncoderKLConfig { pub layers_per_block: usize, pub latent_channels: usize, pub norm_num_groups: usize, + pub use_quant_conv: bool, + pub use_post_quant_conv: bool, } impl Default for AutoEncoderKLConfig { @@ -284,6 +286,8 @@ impl Default for AutoEncoderKLConfig { layers_per_block: 1, latent_channels: 4, norm_num_groups: 32, + use_quant_conv: true, + use_post_quant_conv: true, } } } @@ -315,8 +319,8 @@ impl DiagonalGaussianDistribution { pub struct AutoEncoderKL { encoder: Encoder, decoder: Decoder, - quant_conv: nn::Conv2d, - post_quant_conv: nn::Conv2d, + quant_conv: Option, + post_quant_conv: Option, pub config: AutoEncoderKLConfig, } @@ -342,20 +346,33 @@ impl AutoEncoderKL { }; let decoder = Decoder::new(vs.pp("decoder"), latent_channels, out_channels, decoder_cfg)?; let conv_cfg = Default::default(); - let quant_conv = nn::conv2d( - 2 * latent_channels, - 2 * latent_channels, - 1, - conv_cfg, - vs.pp("quant_conv"), - )?; - let post_quant_conv = nn::conv2d( - latent_channels, - latent_channels, - 1, - conv_cfg, - vs.pp("post_quant_conv"), - )?; + + let quant_conv = { + if config.use_quant_conv { + Some(nn::conv2d( + 2 * latent_channels, + 2 * latent_channels, + 1, + conv_cfg, + vs.pp("quant_conv"), + )?) 
+ } else { + None + } + }; + let post_quant_conv = { + if config.use_post_quant_conv { + Some(nn::conv2d( + latent_channels, + latent_channels, + 1, + conv_cfg, + vs.pp("post_quant_conv"), + )?) + } else { + None + } + }; Ok(Self { encoder, decoder, @@ -368,13 +385,19 @@ impl AutoEncoderKL { /// Returns the distribution in the latent space. pub fn encode(&self, xs: &Tensor) -> Result { let xs = self.encoder.forward(xs)?; - let parameters = self.quant_conv.forward(&xs)?; + let parameters = match &self.quant_conv { + None => xs, + Some(quant_conv) => quant_conv.forward(&xs)?, + }; DiagonalGaussianDistribution::new(¶meters) } /// Takes as input some sampled values. pub fn decode(&self, xs: &Tensor) -> Result { - let xs = self.post_quant_conv.forward(xs)?; - self.decoder.forward(&xs) + let xs = match &self.post_quant_conv { + None => xs, + Some(post_quant_conv) => &post_quant_conv.forward(xs)?, + }; + self.decoder.forward(xs) } } diff --git a/candle-wasm-examples/yolo/Cargo.toml b/candle-wasm-examples/yolo/Cargo.toml index e03319a043..c492521005 100644 --- a/candle-wasm-examples/yolo/Cargo.toml +++ b/candle-wasm-examples/yolo/Cargo.toml @@ -35,7 +35,7 @@ yew-agent = "0.2.0" yew = { version = "0.20.0", features = ["csr"] } [dependencies.web-sys] -version = "0.3.70" +version = "=0.3.70" features = [ 'Blob', 'CanvasRenderingContext2d', diff --git a/candle-wasm-tests/tests/quantized_tests.rs b/candle-wasm-tests/tests/quantized_tests.rs index 8705df4219..ae448078f0 100644 --- a/candle-wasm-tests/tests/quantized_tests.rs +++ b/candle-wasm-tests/tests/quantized_tests.rs @@ -1,3 +1,4 @@ +#![allow(unused)] use candle::{ quantized::{self, k_quants, GgmlDType, GgmlType}, test_utils::to_vec2_round, From 6eab6b57f57b5e935460cce9a000d5029d3ed75a Mon Sep 17 00:00:00 2001 From: Czxck001 <10724409+Czxck001@users.noreply.github.com> Date: Sun, 13 Oct 2024 13:55:26 -0700 Subject: [PATCH 16/28] Fix the guide to gain access to Stable Diffusion 3 Medium (#2559) --- candle-examples/examples/stable-diffusion-3/README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/candle-examples/examples/stable-diffusion-3/README.md b/candle-examples/examples/stable-diffusion-3/README.md index 746a31fa1b..52ebfa55e1 100644 --- a/candle-examples/examples/stable-diffusion-3/README.md +++ b/candle-examples/examples/stable-diffusion-3/README.md @@ -12,9 +12,16 @@ Stable Diffusion 3 Medium is a text-to-image model based on Multimodal Diffusion ## Getting access to the weights -The weights of Stable Diffusion 3 Medium is released by Stability AI under the Stability Community License. You will need to accept the conditions and acquire a license by visiting the [repo on HuggingFace Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium) to gain access to the weights for your HuggingFace account. +The weights of Stable Diffusion 3 Medium is released by Stability AI under the Stability Community License. You will need to accept the conditions and acquire a license by visiting [the repo on HuggingFace Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium) to gain access to the weights for your HuggingFace account. -On the first run, the weights will be automatically downloaded from the Huggingface Hub. You might be prompted to configure a [Huggingface User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens) (recommended) on your computer if you haven't done that before. 
After the download, the weights will be [cached](https://huggingface.co/docs/datasets/en/cache) and remain accessible locally. +To allow your computer to gain access to the public-gated repos on HuggingFace, you might need to create a [HuggingFace User Access Tokens](https://huggingface.co/docs/hub/en/security-tokens) (recommended) and log in on your computer if you haven't done that before. A convenient way to do the login is to use [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli): + +```shell +huggingface-cli login +``` +and you will be prompted to enter your token. + +On the first run, the weights will be automatically downloaded from the Huggingface Hub. After the download, the weights will be [cached](https://huggingface.co/docs/datasets/en/cache) and remain accessible locally. ## Running the model From 41ade774e8606325572215b93ef2152432997fda Mon Sep 17 00:00:00 2001 From: Mikarific Date: Sun, 13 Oct 2024 15:05:50 -0600 Subject: [PATCH 17/28] fix: Allow marian configs to deserialize from json. (#2556) --- candle-transformers/src/models/marian.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/candle-transformers/src/models/marian.rs b/candle-transformers/src/models/marian.rs index 05804a1c1e..c4299da601 100644 --- a/candle-transformers/src/models/marian.rs +++ b/candle-transformers/src/models/marian.rs @@ -1,8 +1,9 @@ use super::with_tracing::{linear, Embedding, Linear}; use candle::{Result, Tensor}; use candle_nn::{layer_norm, LayerNorm, VarBuilder}; +use serde::Deserialize; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Deserialize)] pub struct Config { pub vocab_size: usize, pub decoder_vocab_size: Option, From f553ab5eb401cc3e1588db7fe987aae37f65d113 Mon Sep 17 00:00:00 2001 From: Anubhab Bandyopadhyay <4890833+AnubhabB@users.noreply.github.com> Date: Mon, 14 Oct 2024 02:39:12 +0530 Subject: [PATCH 18/28] Adds support for Stella_en_v5 embedding model - 1.5B variant (#2551) * Stella_en_1.5B_v5 * Separated creation. This is a critical step for numerical accuracy and would be documented in the readme * EmbedDim would require clone and copy * WIP: example * Examples added * a litte more in README --- .../examples/stella-en-v5/README.md | 45 ++ candle-examples/examples/stella-en-v5/main.rs | 359 ++++++++++++++++ candle-transformers/src/models/mod.rs | 1 + .../src/models/stella_en_v5.rs | 399 ++++++++++++++++++ 4 files changed, 804 insertions(+) create mode 100644 candle-examples/examples/stella-en-v5/README.md create mode 100644 candle-examples/examples/stella-en-v5/main.rs create mode 100644 candle-transformers/src/models/stella_en_v5.rs diff --git a/candle-examples/examples/stella-en-v5/README.md b/candle-examples/examples/stella-en-v5/README.md new file mode 100644 index 0000000000..5fcc67c351 --- /dev/null +++ b/candle-examples/examples/stella-en-v5/README.md @@ -0,0 +1,45 @@ +# candle-stella-en-v5: Implementation of [stella_en_1.5B_v5](https://huggingface.co/dunzhang/stella_en_1.5B_v5) embedding model + +As of 7th Oct 2024, *Stella_en_1.5B_v5* is one of the top ranking model on `retrieval` and `reranking` tasks in [MTEB](https://huggingface.co/spaces/mteb/leaderboard) leaderboard. + +[Model card](https://huggingface.co/dunzhang/stella_en_1.5B_v5) on the HuggingFace Hub. + +## Running the example + +Stella_en_1.5B_v5 is used to generate text embeddings embeddings for a prompt. The model weights +are downloaded from the hub on the first run. + +```bash +$ cargo run --example stella-en-v5 --release -- --query "What are safetensors?" 
+
+> [[ 0.3905, -0.0130, 0.2072, ..., -0.1100, -0.0086, 0.6002]]
+> Tensor[[1, 1024], f32]
+```
+
+Stella_en_1.5B_v5 is trained by [MRL](https://arxiv.org/abs/2205.13147) enabling multiple embedding dimensions.
+
+The following reproduces the example in the [model card](https://huggingface.co/dunzhang/stella_en_1.5B_v5) for a retrieval task (s2p). The sample queries and docs are hardcoded in the example.
+
+```bash
+$ cargo run --example stella-en-v5 --release --features 
+
+>
+> Score: 0.8178786
+> Query: What are some ways to reduce stress?
+> Answer: There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending
+> time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent
+> stress from building up.
+>
+>
+> Score: 0.7853528
+> Query: What are the benefits of drinking green tea?
+> Answer: Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage
+> caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types
+> of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.
+>
+```
+
+## Supported options:
+- `Stella_en_1.5B_v5` supports 256, 768, 1024, 2048, 4096, 6144 and 8192 embedding dimensions (though the model card mentions 512, I couldn't find weights for the same). In the example run this is supported with the `--embed-dim` option. E.g. `... --embed-dim 4096`. Defaults to `1024`.
+
+- As per the [model card](https://huggingface.co/dunzhang/stella_en_1.5B_v5), the model has been primarily trained on `s2s` (similarity) and `s2p` (retrieval) tasks. These require a slightly different `query` preprocessing (a different prompt template for each). In this example this is enabled through the `--task` option.
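+A possible invocation combining these options (the values shown are only examples; the flags come from the example's `Args` struct):
+
+```bash
+$ cargo run --example stella-en-v5 --release -- --embed-dim 256 --task s2s --query "What are safetensors?"
+```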
\ No newline at end of file diff --git a/candle-examples/examples/stella-en-v5/main.rs b/candle-examples/examples/stella-en-v5/main.rs new file mode 100644 index 0000000000..2408262b1a --- /dev/null +++ b/candle-examples/examples/stella-en-v5/main.rs @@ -0,0 +1,359 @@ +#[cfg(feature = "mkl")] +extern crate intel_mkl_src; + +#[cfg(feature = "accelerate")] +extern crate accelerate_src; + +use std::path::Path; + +use anyhow::{anyhow, Error as E, Result}; +use clap::Parser; + +use candle_transformers::models::stella_en_v5::{ + Config, EmbedDim as StellaEmbedDim, EmbeddingModel, +}; + +use candle::{DType, Device, Tensor}; +use candle_nn::VarBuilder; +use hf_hub::{api::sync::Api, Repo}; +use tokenizers::{PaddingDirection, PaddingParams, PaddingStrategy, Tokenizer}; + +struct Embedding { + model: EmbeddingModel, + device: Device, + tokenizer: Tokenizer, +} + +impl Embedding { + fn new(model: EmbeddingModel, tokenizer: Tokenizer, device: &Device) -> Self { + Self { + model, + tokenizer, + device: device.clone(), + } + } + + fn encode(&mut self, task: EncodeTask, text: Option) -> Result<()> { + // Just shocasing embeddings, this has no real value + if let Some(text) = text { + let qry = task.query_preproc(&[text]); + let encoding = self.tokenizer.encode(qry, true).map_err(|e| anyhow!(e))?; + + let shape = (1, encoding.len()); + let input = Tensor::from_slice(encoding.get_ids(), shape, &self.device)?; + let mask = Tensor::from_slice(encoding.get_attention_mask(), shape, &self.device)?; + + let result = self.model.forward(&input, &mask)?; + println!("embeddings: {result}"); + } else { + // Examples copied from [Model Card](https://huggingface.co/dunzhang/stella_en_1.5B_v5#transformers) + let queries = [ + "What are some ways to reduce stress?".to_string(), + "What are the benefits of drinking green tea?".to_string(), + ]; + + let docs = [ + "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.".to_string(), + "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.".to_string(), + ]; + + // We only encode the queries and not the data + let qry = task.query_preproc(&queries); + let mut qry_encoded = self + .tokenizer + .encode_batch(qry, true) + .map_err(|e| anyhow!(e))?; + + let mut docs_encoded = self + .tokenizer + .encode_batch(docs.to_vec(), true) + .map_err(|e| anyhow!(e))?; + + let qry_embed = { + // Now, we generate the tensors for the `input` and `mask` + let shape = (qry_encoded.len(), qry_encoded[1].len()); + let mut ids = Tensor::zeros(shape, DType::U32, &self.device)?; + let mut masks = Tensor::zeros(shape, DType::U8, &self.device)?; + + for (i, e) in qry_encoded.drain(..).enumerate() { + let input_id = + Tensor::from_iter(e.get_ids().to_vec(), &self.device)?.unsqueeze(0)?; + let mask = Tensor::from_iter(e.get_attention_mask().to_vec(), &self.device)? + .to_dtype(DType::U8)? 
+ .unsqueeze(0)?; + + ids = + ids.slice_assign(&[i..i + 1, 0..input_id.dims2().unwrap().1], &input_id)?; + masks = masks.slice_assign(&[i..i + 1, 0..mask.dims2().unwrap().1], &mask)?; + } + + // Let's generate the embeddings for the query, we are going to be normalizing the result. + // For larger datasets, you can call `.forward()` on batches and run a `l2 norm` pass on the entire data + self.model.forward_norm(&ids, &masks)? + }; + + let doc_embed = { + let shape = (docs_encoded.len(), docs_encoded[1].len()); + let mut ids = Tensor::zeros(shape, DType::U32, &self.device)?; + let mut masks = Tensor::zeros(shape, DType::U8, &self.device)?; + + for (i, e) in docs_encoded.drain(..).enumerate() { + let input_id = + Tensor::from_iter(e.get_ids().to_vec(), &self.device)?.unsqueeze(0)?; + let mask = Tensor::from_iter(e.get_attention_mask().to_vec(), &self.device)? + .to_dtype(DType::U8)? + .unsqueeze(0)?; + + ids = + ids.slice_assign(&[i..i + 1, 0..input_id.dims2().unwrap().1], &input_id)?; + masks = masks.slice_assign(&[i..i + 1, 0..mask.dims2().unwrap().1], &mask)?; + } + + // Let's generate the embeddings for the query, we are going to be normalizing the result. + // For larger datasets, you can call `.forward()` on batches and run a `l2 norm` pass on the entire data + self.model.forward_norm(&ids, &masks)? + }; + + println!( + "Embed shapes:\nQuery: {:?}\nDocs: {:?}", + qry_embed.shape(), + doc_embed.shape() + ); // [2, 1024] for head dim `1024` + + // a matmul to generate the `similarity` score + let res = qry_embed.matmul(&doc_embed.t()?)?; + for (k, v) in queries.iter().enumerate() { + let tnsr = res.get(k)?; + let max = tnsr.argmax(0)?.to_scalar::()?; + println!( + "\nScore: {}\nQuery: {}\nAnswer: {}\n\n", + tnsr.get(max as usize)?.to_scalar::()?, + v, + docs[k] + ); + } + } + + Ok(()) + } +} + +#[derive(Clone, Copy, Debug, clap::ValueEnum, PartialEq, Eq)] +enum EmbedDim { + #[value(name = "256")] + Dim256, + #[value(name = "768")] + Dim768, + #[value(name = "1024")] + Dim1024, + #[value(name = "2048")] + Dim2048, + #[value(name = "4096")] + Dim4096, + #[value(name = "6144")] + Dim6144, + #[value(name = "8192")] + Dim8192, +} + +impl EmbedDim { + /// Returns dir path to the embed head weights int he repo + pub fn embed_dim_default_dir(&self) -> &'static str { + match self { + Self::Dim256 => "2_Dense_256", + Self::Dim768 => "2_Dense_768", + Self::Dim1024 => "2_Dense_1024", + Self::Dim2048 => "2_Dense_2048", + Self::Dim4096 => "2_Dense_4096", + Self::Dim6144 => "2_Dense_6144", + Self::Dim8192 => "2_Dense_8192", + } + } + + /// Resolves the `EmbedDim` for given variant + pub fn embed_dim(&self) -> StellaEmbedDim { + match self { + Self::Dim256 => StellaEmbedDim::Dim256, + Self::Dim768 => StellaEmbedDim::Dim768, + Self::Dim1024 => StellaEmbedDim::Dim1024, + Self::Dim2048 => StellaEmbedDim::Dim2048, + Self::Dim4096 => StellaEmbedDim::Dim4096, + Self::Dim6144 => StellaEmbedDim::Dim6144, + Self::Dim8192 => StellaEmbedDim::Dim8192, + } + } +} + +#[derive(Clone, Copy, Debug, clap::ValueEnum, PartialEq, Eq)] +pub enum EncodeTask { + /// `s2p` is the `retrieval` task + /// Default in this example + #[value(name = "s2p")] + S2P, + /// `s2s` is the semantic similarity task + #[value(name = "s2s")] + S2S, +} + +impl EncodeTask { + /// Preprocess a set of inputs basef on a template suggested by the model authors + /// See: https://huggingface.co/dunzhang/stella_en_1.5B_v5#introduction + pub fn query_preproc(&self, txt: &[String]) -> Vec { + let instruct = match self { + Self::S2P => { + "Given a web 
search query, retrieve relevant passages that answer the query." + } + Self::S2S => "Retrieve semantically similar text.", + }; + + txt.iter() + .map(|s| format!("Instruct: {instruct}\nQuery: {s}")) + .collect::>() + } +} + +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Args { + /// Run on CPU rather than on GPU. + #[arg(long)] + cpu: bool, + + /// Enable tracing (generates a trace-timestamp.json file). + #[arg(long)] + tracing: bool, + + #[arg(long)] + use_flash_attn: bool, + + #[arg(long)] + query: Option, + + #[arg(long, default_value = "1024")] + embed_dim: Option, + + #[arg(long)] + tokenizer_file: Option, + + #[arg(long)] + base_weight_files: Option, + + #[arg(long)] + embed_head_weight_files: Option, + + /// `Stella` is trained on 2 tasks: See [`Model Card`](https://huggingface.co/dunzhang/stella_en_1.5B_v5) + /// `s2s`: Semantic textual similarity + /// `s2p`: Retrieval task - `Default` in this example + #[arg(long, default_value = "s2p")] + task: Option, +} + +// Tokenizer creation is super critical in our case. +// We are going to be `padding: Left` for each batch +fn create_tokenizer(tokenizer_file: &Path) -> Result { + let mut tokenizer = Tokenizer::from_file(tokenizer_file).map_err(E::msg)?; + let pad_id = if let Some(pad_id) = tokenizer.token_to_id("<|endoftext|>") { + pad_id + } else { + return Err(anyhow!( + "Tokenizer doesn't contain expected `<|endoftext|>` token" + )); + }; + + // This part is super important, we are padding the tokens to the *`left`* and not the usual *`right`* padding + tokenizer.with_padding(Some(PaddingParams { + strategy: PaddingStrategy::BatchLongest, + direction: PaddingDirection::Left, + pad_id, + pad_token: "<|endoftext|>".to_string(), + ..Default::default() + })); + + Ok(tokenizer) +} + +fn main() -> Result<()> { + use tracing_chrome::ChromeLayerBuilder; + use tracing_subscriber::prelude::*; + + let args = Args::parse(); + let _guard = if args.tracing { + let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); + tracing_subscriber::registry().with(chrome_layer).init(); + Some(guard) + } else { + None + }; + println!( + "avx: {}, neon: {}, simd128: {}, f16c: {}", + candle::utils::with_avx(), + candle::utils::with_neon(), + candle::utils::with_simd128(), + candle::utils::with_f16c() + ); + + let start = std::time::Instant::now(); + let api = Api::new()?; + let embed_dim = match args.embed_dim { + Some(d) => d, + None => EmbedDim::Dim1024, + }; + let repo = api.repo(Repo::model("dunzhang/stella_en_1.5B_v5".to_string())); + let tokenizer_filename = match args.tokenizer_file { + Some(file) => std::path::PathBuf::from(file), + None => repo.get("tokenizer.json")?, + }; + + // Note, if you are providing `weight_files`, ensure that the `--embed_dim` dimensions provided matches the weights + // E.g. if you are using `--embed_dim 1024`, the weight files should include the `.safetensors` file from `2_Dense_1024` dir of the repo + let base_weight_files = match args.base_weight_files { + Some(files) => files + .split(',') + .map(std::path::PathBuf::from) + .collect::>(), + None => { + vec![repo.get("model.safetensors")?] + } + }; + + let embed_weight_files = match args.embed_head_weight_files { + Some(files) => files + .split(',') + .map(std::path::PathBuf::from) + .collect::>(), + None => { + let head_w_path = format!("{}/model.safetensors", embed_dim.embed_dim_default_dir()); + vec![repo.get(&head_w_path)?] 
+ } + }; + + println!("retrieved the files in {:?}", start.elapsed()); + + // Initializing the tokenizer which would require us to add padding to the `left` for batch encoding + let tokenizer = create_tokenizer(tokenizer_filename.as_path())?; + + let start = std::time::Instant::now(); + + let device = candle_examples::device(args.cpu)?; + let dtype = DType::F32; + + let base_vb = + unsafe { VarBuilder::from_mmaped_safetensors(&base_weight_files, dtype, &device)? }; + // Embedding layer is always built on F32 for accuracy + let embed_vb = + unsafe { VarBuilder::from_mmaped_safetensors(&embed_weight_files, DType::F32, &device)? }; + + let model = EmbeddingModel::new( + &Config::new_1_5_b_v5(embed_dim.embed_dim()), + base_vb, + embed_vb, + )?; + + println!("loaded the model in {:?}", start.elapsed()); + + let mut embedding = Embedding::new(model, tokenizer, &device); + + let task = args.task.map_or(EncodeTask::S2P, |t| t); + + embedding.encode(task, args.query) +} diff --git a/candle-transformers/src/models/mod.rs b/candle-transformers/src/models/mod.rs index 6ed7a8b580..23edf349ad 100644 --- a/candle-transformers/src/models/mod.rs +++ b/candle-transformers/src/models/mod.rs @@ -84,6 +84,7 @@ pub mod siglip; pub mod stable_diffusion; pub mod stable_lm; pub mod starcoder2; +pub mod stella_en_v5; pub mod t5; pub mod trocr; pub mod vgg; diff --git a/candle-transformers/src/models/stella_en_v5.rs b/candle-transformers/src/models/stella_en_v5.rs new file mode 100644 index 0000000000..9d933fade5 --- /dev/null +++ b/candle-transformers/src/models/stella_en_v5.rs @@ -0,0 +1,399 @@ +use crate::models::with_tracing::{linear, linear_no_bias, Linear, RmsNorm}; +use candle::{DType, Device, IndexOp, Module, Result, Tensor}; +use candle_nn::{Activation, VarBuilder}; +use std::sync::Arc; + +// Same as `qwen2` family of models with the exception being the `embed_head` +// The final `output` causal modelling head is swapped with a learned `dense` layer, `embed_head` +#[derive(Debug, Clone, PartialEq, serde::Deserialize)] +pub struct Config { + pub vocab_size: usize, + pub hidden_size: usize, + pub intermediate_size: usize, + pub num_hidden_layers: usize, + pub num_attention_heads: usize, + pub num_key_value_heads: usize, + pub max_position_embeddings: usize, + pub max_window_layers: usize, + pub tie_word_embeddings: bool, + pub rope_theta: f64, + pub rms_norm_eps: f64, + pub hidden_act: Activation, + pub embed_head: EmbedHead, +} + +// Excerpt from `stella` model card: +// `Stella_en_1.5B_v5` models have been trained on [MRL](https://arxiv.org/abs/2205.13147) enabling multiple output dimensions +// Embed head represents the config for various embedding dims supported +#[derive(Debug, Clone, PartialEq, serde::Deserialize)] +pub struct EmbedHead { + pub in_features: usize, + pub out_features: usize, +} + +/// An enum variant representing the Embedding head dimensions `stella` is trained on +/// As the [model-card](https://huggingface.co/dunzhang/stella_en_1.5B_v5#introduction) suggests, D1024 is good enough for most cases +#[derive(Debug, Clone, Copy)] +pub enum EmbedDim { + Dim256, + Dim768, + Dim1024, + Dim2048, + Dim4096, + Dim6144, + Dim8192, +} + +impl Default for EmbedDim { + fn default() -> Self { + Self::Dim1024 + } +} + +impl EmbedDim { + pub fn config(&self) -> EmbedHead { + EmbedHead { + in_features: 1536, + out_features: match &self { + Self::Dim256 => 256, + Self::Dim768 => 768, + Self::Dim1024 => 1024, + Self::Dim2048 => 2048, + Self::Dim4096 => 4096, + Self::Dim6144 => 6144, + Self::Dim8192 => 
8192, + }, + } + } +} + +// Initialize a new `stella_en` model - with 400M variant or 1.5B variant +impl Config { + /// Initialize a new `stella_en_1.5B_v5`` model with given embedding dim + pub fn new_1_5_b_v5(embed_dim: EmbedDim) -> Self { + // Representing config.json at https://huggingface.co/dunzhang/stella_en_1.5B_v5/blob/main/config.json + // Removed `sliding_window` related config which is basically being carried forward from `qwen2` but not used here + Self { + hidden_act: candle_nn::Activation::Silu, + vocab_size: 151646, + hidden_size: 1536, + intermediate_size: 8960, + num_hidden_layers: 28, + num_attention_heads: 12, + num_key_value_heads: 2, + max_position_embeddings: 131072, + max_window_layers: 21, + tie_word_embeddings: false, + rope_theta: 1000000., + rms_norm_eps: 1e-06, + embed_head: embed_dim.config(), + } + } +} + +#[derive(Debug, Clone)] +struct RotaryEmbedding { + sin: Tensor, + cos: Tensor, +} + +impl RotaryEmbedding { + fn new(dtype: DType, cfg: &Config, dev: &Device) -> Result { + let dim = cfg.hidden_size / cfg.num_attention_heads; + let max_seq_len = cfg.max_position_embeddings; + let inv_freq: Vec<_> = (0..dim) + .step_by(2) + .map(|i| 1f32 / cfg.rope_theta.powf(i as f64 / dim as f64) as f32) + .collect(); + let inv_freq_len = inv_freq.len(); + let inv_freq = Tensor::from_vec(inv_freq, (1, inv_freq_len), dev)?.to_dtype(dtype)?; + let t = Tensor::arange(0u32, max_seq_len as u32, dev)? + .to_dtype(dtype)? + .reshape((max_seq_len, 1))?; + let freqs = t.matmul(&inv_freq)?; + Ok(Self { + sin: freqs.sin()?, + cos: freqs.cos()?, + }) + } + + fn apply_rotary_emb_qkv(&self, q: &Tensor, k: &Tensor) -> Result<(Tensor, Tensor)> { + let (_b_sz, _h, seq_len, _n_embd) = q.dims4()?; + let cos = self.cos.narrow(0, 0, seq_len)?; + let sin = self.sin.narrow(0, 0, seq_len)?; + let q_embed = candle_nn::rotary_emb::rope(&q.contiguous()?, &cos, &sin)?; + let k_embed = candle_nn::rotary_emb::rope(&k.contiguous()?, &cos, &sin)?; + Ok((q_embed, k_embed)) + } +} + +#[derive(Debug, Clone)] +#[allow(clippy::upper_case_acronyms)] +struct MLP { + gate_proj: Linear, + up_proj: Linear, + down_proj: Linear, + act_fn: Activation, +} + +impl MLP { + fn new(cfg: &Config, vb: VarBuilder) -> Result { + let hidden_sz = cfg.hidden_size; + let intermediate_sz = cfg.intermediate_size; + let gate_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("gate_proj"))?; + let up_proj = linear_no_bias(hidden_sz, intermediate_sz, vb.pp("up_proj"))?; + let down_proj = linear_no_bias(intermediate_sz, hidden_sz, vb.pp("down_proj"))?; + Ok(Self { + gate_proj, + up_proj, + down_proj, + act_fn: cfg.hidden_act, + }) + } +} + +impl Module for MLP { + fn forward(&self, xs: &Tensor) -> Result { + let lhs = xs.apply(&self.gate_proj)?.apply(&self.act_fn)?; + let rhs = xs.apply(&self.up_proj)?; + (lhs * rhs)?.apply(&self.down_proj) + } +} + +#[derive(Debug, Clone)] +struct Attention { + q_proj: Linear, + k_proj: Linear, + v_proj: Linear, + o_proj: Linear, + num_heads: usize, + num_kv_heads: usize, + num_kv_groups: usize, + head_dim: usize, + hidden_size: usize, + rotary_emb: Arc, +} + +impl Attention { + fn new(rotary_emb: Arc, cfg: &Config, vb: VarBuilder) -> Result { + let hidden_sz = cfg.hidden_size; + let num_heads = cfg.num_attention_heads; + let num_kv_heads = cfg.num_key_value_heads; + let num_kv_groups = num_heads / num_kv_heads; + let head_dim = hidden_sz / num_heads; + let q_proj = linear(hidden_sz, num_heads * head_dim, vb.pp("q_proj"))?; + let k_proj = linear(hidden_sz, num_kv_heads * head_dim, 
vb.pp("k_proj"))?; + let v_proj = linear(hidden_sz, num_kv_heads * head_dim, vb.pp("v_proj"))?; + let o_proj = linear_no_bias(num_heads * head_dim, hidden_sz, vb.pp("o_proj"))?; + Ok(Self { + q_proj, + k_proj, + v_proj, + o_proj, + num_heads, + num_kv_heads, + num_kv_groups, + head_dim, + hidden_size: hidden_sz, + rotary_emb, + }) + } + + fn forward(&mut self, xs: &Tensor, attention_mask: Option<&Tensor>) -> Result { + let (b_sz, q_len, _) = xs.dims3()?; + + let query_states = self.q_proj.forward(xs)?; + let key_states = self.k_proj.forward(xs)?; + let value_states = self.v_proj.forward(xs)?; + + let query_states = query_states + .reshape((b_sz, q_len, self.num_heads, self.head_dim))? + .transpose(1, 2)?; + let key_states = key_states + .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? + .transpose(1, 2)?; + let value_states = value_states + .reshape((b_sz, q_len, self.num_kv_heads, self.head_dim))? + .transpose(1, 2)?; + + let (query_states, key_states) = self + .rotary_emb + .apply_rotary_emb_qkv(&query_states, &key_states)?; + + let key_states = crate::utils::repeat_kv(key_states, self.num_kv_groups)?.contiguous()?; + let value_states = + crate::utils::repeat_kv(value_states, self.num_kv_groups)?.contiguous()?; + + let attn_output = { + let scale = 1f64 / f64::sqrt(self.head_dim as f64); + let attn_weights = (query_states.matmul(&key_states.transpose(2, 3)?)? * scale)?; + + let attn_weights = match attention_mask { + None => attn_weights, + Some(mask) => attn_weights.broadcast_add(mask)?, + }; + let attn_weights = candle_nn::ops::softmax_last_dim(&attn_weights)?; + attn_weights.matmul(&value_states)? + }; + attn_output + .transpose(1, 2)? + .reshape((b_sz, q_len, self.hidden_size))? + .apply(&self.o_proj) + } +} + +#[derive(Debug, Clone)] +struct DecoderLayer { + self_attn: Attention, + mlp: MLP, + input_layernorm: RmsNorm, + post_attention_layernorm: RmsNorm, +} + +impl DecoderLayer { + fn new(rotary_emb: Arc, cfg: &Config, vb: VarBuilder) -> Result { + let self_attn = Attention::new(rotary_emb, cfg, vb.pp("self_attn"))?; + let mlp = MLP::new(cfg, vb.pp("mlp"))?; + let input_layernorm = + RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb.pp("input_layernorm"))?; + let post_attention_layernorm = RmsNorm::new( + cfg.hidden_size, + cfg.rms_norm_eps, + vb.pp("post_attention_layernorm"), + )?; + Ok(Self { + self_attn, + mlp, + input_layernorm, + post_attention_layernorm, + }) + } + + fn forward(&mut self, xs: &Tensor, attention_mask: Option<&Tensor>) -> Result { + let residual = xs; + let xs = self.input_layernorm.forward(xs)?; + let xs = self.self_attn.forward(&xs, attention_mask)?; + let xs = (xs + residual)?; + let residual = &xs; + let xs = xs.apply(&self.post_attention_layernorm)?.apply(&self.mlp)?; + residual + xs + } +} + +#[derive(Debug, Clone)] +pub struct Model { + embed_tokens: candle_nn::Embedding, + layers: Vec, + norm: RmsNorm, + device: Device, + dtype: DType, +} + +impl Model { + pub fn new(cfg: &Config, vb: VarBuilder) -> Result { + let vb_m = vb.pp("model"); + let embed_tokens = + candle_nn::embedding(cfg.vocab_size, cfg.hidden_size, vb_m.pp("embed_tokens"))?; + let rotary_emb = Arc::new(RotaryEmbedding::new(vb.dtype(), cfg, vb_m.device())?); + let mut layers = Vec::with_capacity(cfg.num_hidden_layers); + let vb_l = vb_m.pp("layers"); + for layer_idx in 0..cfg.num_hidden_layers { + let layer = DecoderLayer::new(rotary_emb.clone(), cfg, vb_l.pp(layer_idx))?; + layers.push(layer) + } + let norm = RmsNorm::new(cfg.hidden_size, cfg.rms_norm_eps, vb_m.pp("norm"))?; + 
Ok(Self { + embed_tokens, + layers, + norm, + // sliding_window: 0, + device: vb.device().clone(), + dtype: vb.dtype(), + }) + } + + fn prepare_attention_mask(&self, attn_mask: &Tensor) -> Result { + let (b_sz, sql_len) = attn_mask.dims2()?; + let mut mask: Vec = vec![]; + for b in 0..b_sz { + mask.push(attn_mask.i((b, ..))?.expand((1, 1, sql_len, sql_len))?); + } + let mask = Tensor::cat(&mask, 0)?; + let on_true = mask.zeros_like()?.to_dtype(self.dtype)?; + let on_false = Tensor::new(f32::NEG_INFINITY, &self.device)? + .broadcast_as(mask.shape())? + .to_dtype(self.dtype)?; + mask.where_cond(&on_true, &on_false) + } + + pub fn forward(&mut self, input_ids: &Tensor, mask: &Tensor) -> Result { + let (_, seq_len) = input_ids.dims2()?; + let attention_mask = if seq_len <= 1 { + None + } else { + // This is not a `causal language modelling` task, we'll need to prepare a `non-causal` attention + Some(self.prepare_attention_mask(mask)?) + }; + + let mut xs = self.embed_tokens.forward(input_ids)?; + for layer in self.layers.iter_mut() { + xs = layer.forward(&xs, attention_mask.as_ref())? + } + xs.apply(&self.norm) + } +} + +#[derive(Debug, Clone)] +pub struct EmbeddingModel { + base_model: Model, + lm_head: Linear, +} + +impl EmbeddingModel { + pub fn new(cfg: &Config, base_vb: VarBuilder, embed_vb: VarBuilder) -> Result { + let base_model = Model::new(cfg, base_vb.clone())?; + let lm_head = linear( + cfg.embed_head.in_features, + cfg.embed_head.out_features, + embed_vb.pp("linear"), + )?; + + Ok(Self { + base_model, + lm_head, + }) + } + + pub fn forward(&mut self, input_ids: &Tensor, mask: &Tensor) -> Result { + let x = self.base_model.forward(input_ids, mask)?; + let x = self.pool(&x, mask)?; + + // No matter what keeping the final activations as F32 helps with the accuracy + self.lm_head.forward(&x.to_dtype(DType::F32)?) // [B_sz, dim_size] + } + + /// Same as forward pass but normalizes the output + pub fn forward_norm(&mut self, input_ids: &Tensor, mask: &Tensor) -> Result { + let x = self.forward(input_ids, mask)?; + // Normalize + x.broadcast_div(&x.sqr()?.sum_keepdim(1)?.sqrt()?) + } + + fn pool(&self, x: &Tensor, mask: &Tensor) -> Result { + let mask = mask.to_dtype(x.dtype())?; // [B_Sz, Seq_len] + let (batch_size, seq_len, hidden_dim) = x.dims3()?; + // expanding the shape of the mask from [B_Sz, Seq_len] -> [B_Sz, Seq_len, Hidden_size] + let mask_expanded = mask + .unsqueeze(2)? + .broadcast_as((batch_size, seq_len, hidden_dim))?; // [B_Sz, Seq_len, Hidden_dim] + + let x = (x * &mask_expanded)?; + + // Sum + let sum_mask = mask + .sum(1)? + .unsqueeze(1)? + .expand((batch_size, hidden_dim))?; + x.sum(1)? / sum_mask + } +} From 3d1dc06cdb44e2e012559aadd8da7342da9c2ed5 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Mon, 14 Oct 2024 08:59:12 +0200 Subject: [PATCH 19/28] Enable stable-diffusion 3 on metal. 
(#2560) --- candle-examples/Cargo.toml | 3 --- .../examples/stable-diffusion-3/main.rs | 15 +++++++++------ .../examples/stable-diffusion-3/sampling.rs | 2 +- candle-transformers/src/models/marian.rs | 3 +-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index d3e23b922c..0c1219d760 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -122,6 +122,3 @@ required-features = ["onnx"] [[example]] name = "colpali" required-features = ["pdf2image"] - -[[example]] -name = "stable-diffusion-3" \ No newline at end of file diff --git a/candle-examples/examples/stable-diffusion-3/main.rs b/candle-examples/examples/stable-diffusion-3/main.rs index 164ae4205b..ee467839e8 100644 --- a/candle-examples/examples/stable-diffusion-3/main.rs +++ b/candle-examples/examples/stable-diffusion-3/main.rs @@ -30,9 +30,9 @@ struct Args { #[arg(long)] cpu: bool, - /// The CUDA device ID to use. - #[arg(long, default_value = "0")] - cuda_device_id: usize, + /// The GPU device ID to use. + #[arg(long, default_value_t = 0)] + gpu_device_id: usize, /// Enable tracing (generates a trace-timestamp.json file). #[arg(long)] @@ -81,7 +81,7 @@ fn run(args: Args) -> Result<()> { prompt, uncond_prompt, cpu, - cuda_device_id, + gpu_device_id, tracing, use_flash_attn, height, @@ -100,11 +100,14 @@ fn run(args: Args) -> Result<()> { None }; - // TODO: Support and test on Metal. let device = if cpu { candle::Device::Cpu + } else if candle::utils::cuda_is_available() { + candle::Device::new_cuda(gpu_device_id)? + } else if candle::utils::metal_is_available() { + candle::Device::new_metal(gpu_device_id)? } else { - candle::Device::cuda_if_available(cuda_device_id)? + candle::Device::Cpu }; let api = hf_hub::api::sync::Api::new()?; diff --git a/candle-examples/examples/stable-diffusion-3/sampling.rs b/candle-examples/examples/stable-diffusion-3/sampling.rs index 147d8e7380..0efd160eba 100644 --- a/candle-examples/examples/stable-diffusion-3/sampling.rs +++ b/candle-examples/examples/stable-diffusion-3/sampling.rs @@ -31,7 +31,7 @@ pub fn euler_sample( let timestep = (*s_curr) * 1000.0; let noise_pred = mmdit.forward( &Tensor::cat(&[x.clone(), x.clone()], 0)?, - &Tensor::full(timestep, (2,), x.device())?.contiguous()?, + &Tensor::full(timestep as f32, (2,), x.device())?.contiguous()?, y, context, )?; diff --git a/candle-transformers/src/models/marian.rs b/candle-transformers/src/models/marian.rs index c4299da601..e93370c23e 100644 --- a/candle-transformers/src/models/marian.rs +++ b/candle-transformers/src/models/marian.rs @@ -1,9 +1,8 @@ use super::with_tracing::{linear, Embedding, Linear}; use candle::{Result, Tensor}; use candle_nn::{layer_norm, LayerNorm, VarBuilder}; -use serde::Deserialize; -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, serde::Deserialize)] pub struct Config { pub vocab_size: usize, pub decoder_vocab_size: Option, From a01aa897991fbc3da2dfda568b4254f697fdd598 Mon Sep 17 00:00:00 2001 From: Anubhab Bandyopadhyay <4890833+AnubhabB@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:04:07 +0530 Subject: [PATCH 20/28] onnx: ReduceMin/Max Ops (#2563) * Stella_en_1.5B_v5 * Separated creation. 
This is a critical step for numerical accuracy and would be documented in the readme * EmbedDim would require clone and copy * WIP: example * Examples added * a litte more in README * WIP: ONNX Reduce-max ops * WIP: tests for ReduceMin * Reduce min/ max v18+ * Reformatting tests for better review readability * Error on empty set, backward compatibility (13 and below) with 'axes' --- candle-onnx/src/eval.rs | 174 ++++++- candle-onnx/tests/ops.rs | 1038 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1211 insertions(+), 1 deletion(-) diff --git a/candle-onnx/src/eval.rs b/candle-onnx/src/eval.rs index de3e1010ac..629b3f93d5 100644 --- a/candle-onnx/src/eval.rs +++ b/candle-onnx/src/eval.rs @@ -2,7 +2,7 @@ use crate::onnx::attribute_proto::AttributeType; use crate::onnx::tensor_proto::DataType; use crate::onnx::{self, GraphProto}; use candle::{bail, DType, Device, Result, Tensor}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; pub type Value = Tensor; @@ -1189,6 +1189,92 @@ fn simple_eval_( } values.insert(node.output[0].clone(), out); } + // https://onnx.ai/onnx/operators/onnx__ReduceMax.html#reducemax + "ReduceMax" => { + let input = get(&node.input[0])?; + let axes = get_opt(1); + let keepdims = get_attr_opt::(node, "keepdims")?.copied().unwrap_or(1) == 1; + + let axes = if let Some(Ok(axes)) = axes { + // Satisfies version 18+ + axes.to_vec1::().ok() + } else if let Ok(Some(axes)) = get_attr_opt::<[i64]>(node, "axes") { + // Backward compatiblity with version 13 and below + Some(axes.to_vec()) + } else { + None + }; + + let axes = if let Some(axes) = axes { + let rank = input.rank(); + let mut axes_set = HashSet::new(); + + let mut axes = axes + .iter() + .map(|a| { + let axis = if *a < 0 { + (rank as i64 + *a) as usize + } else { + *a as usize + }; + + axes_set.insert(axis); + axis + }) + .collect::>(); + + if axes_set.len() < axes.len() { + bail!("Duplicate value in 'axes'"); + } + + if axes.len() > 1 { + axes.sort(); + } + + Some(axes) + } else { + None + }; + + // TODO: Handle empty set + // Definition: + // "Reduction over an empty set of values yields minus infinity (if supported by the datatype) or the minimum value of the data type otherwise" + // For now, this will throw an error + if input.elem_count() == 0 { + bail!("reduction over zero-size tensor not supported"); + } + + let output = if let Some(axes) = axes { + let mut result = input.clone(); + for &axis in axes.iter().rev() { + result = if keepdims { + result.max_keepdim(axis)? + } else { + result.max(axis)? + } + } + + result + } else { + // If `axes` is empty and `noop_with_empty_axes` is set to `true (1)` + // ""input tensor will not be reduced,and the output tensor would be equivalent to input tensor."" + if get_attr_opt::(node, "noop_with_empty_axes")?.copied() == Some(1) { + input.clone() + } else { + let mut result = input.flatten_all()?; + if keepdims { + result = result.max_keepdim(0)?; + // If keepdims is true, reshape to match input dimensions + let shape = vec![1; input.rank()]; + result.reshape(shape)? + } else { + result.max(0)? + } + } + }; + + values.insert(node.output[0].clone(), output); + } // https://onnx.ai/onnx/operators/onnx__ReduceMean.html#reducemean-13 // TODO: This version is only compatible with ReduceMean V13 and below. 
"ReduceMean" => { @@ -1212,6 +1298,92 @@ fn simple_eval_( }; values.insert(node.output[0].clone(), output); } + // https://onnx.ai/onnx/operators/onnx__ReduceMin.html#reducemin + "ReduceMin" => { + let input = get(&node.input[0])?; + let axes = get_opt(1); + let keepdims = get_attr_opt::(node, "keepdims")?.copied().unwrap_or(1) == 1; + + let axes = if let Some(Ok(axes)) = axes { + // Satisfies version 18+ + axes.to_vec1::().ok() + } else if let Ok(Some(axes)) = get_attr_opt::<[i64]>(node, "axes") { + // Backward compatiblity with version 13 and below + Some(axes.to_vec()) + } else { + None + }; + + let axes = if let Some(axes) = axes { + let rank = input.rank(); + let mut axes_set = HashSet::new(); + + let mut axes = axes + .iter() + .map(|a| { + let axis = if *a < 0 { + (rank as i64 + *a) as usize + } else { + *a as usize + }; + + axes_set.insert(axis); + axis + }) + .collect::>(); + + if axes_set.len() < axes.len() { + bail!("Duplicate value in 'axes'"); + } + + if axes.len() > 1 { + axes.sort(); + } + + Some(axes) + } else { + None + }; + + // TODO: Handle empty set + // Definition: + // "Reduction over an empty set of values yields positive infinity (if supported by the datatype) or the max value of the data type otherwise" + // For now, this will throw an error + if input.elem_count() == 0 { + bail!("reduction over zero-size tensor not supported"); + } + + let output = if let Some(axes) = axes { + let mut result = input.clone(); + for &axis in axes.iter().rev() { + result = if keepdims { + result.min_keepdim(axis)? + } else { + result.min(axis)? + } + } + + result + } else { + // If `axes` is empty and `noop_with_empty_axes` is set to `true (1)` + // ""input tensor will not be reduced,and the output tensor would be equivalent to input tensor."" + if get_attr_opt::(node, "noop_with_empty_axes")?.copied() == Some(1) { + input.clone() + } else { + let mut result = input.flatten_all()?; + if keepdims { + result = result.min_keepdim(0)?; + // If keepdims is true, reshape to match input dimensions + let shape = vec![1; input.rank()]; + result.reshape(shape)? + } else { + result.min(0)? 
+ } + } + }; + + values.insert(node.output[0].clone(), output); + } //https://github.com/onnx/onnx/blob/main/docs/Operators.md#Split // Version 18 impl "Split" => { diff --git a/candle-onnx/tests/ops.rs b/candle-onnx/tests/ops.rs index 2a138131b2..450a9879e6 100644 --- a/candle-onnx/tests/ops.rs +++ b/candle-onnx/tests/ops.rs @@ -1695,6 +1695,1044 @@ fn test_relu_operation() -> Result<()> { // "Cast" // #[test] +// "ReduceMax" +#[test] +fn test_reduce_max() -> Result<()> { + // Tests with random data generated with `np.random.uniform` + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-119 bool_inputs + // No special treatment reqired for bool + // `np.maximum.reduce(data, axis=axes, keepdims=True)` + test( + &[[1_u8, 1], [1, 0], [0, 1], [0, 0]], + Some(vec![1]), + 1, + None, + &[[1_u8], [1], [1], [0]], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-119 default_axes_keepdims + // `np.maximum.reduce(data, axis=None, keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + None, + 1, + None, + &[[[60.]]], + false, + )?; + // same as above but with random + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 1, + None, + &[[[9.587318]]], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-119 default_axes_donot_keep_dims + // `np.maximum.reduce(data, axis=None, keepdims=False)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + None, + 0, + None, + 60., + false, + )?; + // same as above but with random + // `np.maximum.reduce(data, axis=None, keepdims=False)` + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 0, + None, + 9.587318, + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-119 keepdims + // `np.maximum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![1]), + 1, + None, + &[[[20., 2.]], [[40., 2.]], [[60., 2.]]], + false, + )?; + // keepdims with random data + // `np.maximum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + Some(vec![1]), + 1, + None, + &[ + [[-7.318765, 7.2374434]], + [[6.304022, 4.939862]], + [[9.587318, 8.008944]], + ], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-119 negative_axes_keepdims + // axes = np.array([-1], dtype=np.int64) + // `np.maximum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1]), + 1, + None, + &[[[5.], [20.]], [[30.], [40.]], [[55.], [60.]]], + false, + )?; + // axes = np.array([-2], dtype=np.int64) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-2]), + 1, + None, + &[[[20., 2.]], [[40., 2.]], [[60., 2.]]], + false, + )?; + // with random + test( + &[ + [[-4.1676497, -2.7603748], [-4.5138783, -0.762791]], + [[-6.3792877, 7.1619177], [-9.958144, 6.3753467]], + [[9.046973, 3.4554052], 
[-5.4674335, 5.4642754]], + ], + Some(vec![-2]), + 1, + None, + &[ + [[-4.1676497, -0.762791]], + [[-6.3792877, 7.1619177]], + [[9.046973, 5.4642754]], + ], + false, + )?; + + // Multiple axes - keepdims=1 (true) + // axes = np.array([0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 1]), + 1, + None, + &[[[60., 2.]]], + false, + )?; + // axes = np.array([0, 2], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 2]), + 1, + None, + &[[[55.], [60.]]], + false, + )?; + // axes = np.array([2, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 1]), + 1, + None, + &[[[20.]], [[40.]], [[60.]]], + false, + )?; + // axes = np.array([2, 0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 0, 1]), + 1, + None, + &[[[60.]]], + false, + )?; + // Multiple axes - keepdims=0 (false) + // axes = np.array([0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 1]), + 0, + None, + &[60., 2.], + false, + )?; + // axes = np.array([0, 2], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 2]), + 0, + None, + &[55., 60.], + false, + )?; + // axes = np.array([2, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 1]), + 0, + None, + &[20., 40., 60.], + false, + )?; + // axes = np.array([2, 0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 0, 1]), + 0, + None, + 60., + false, + )?; + + // Multiple axes - negative `axes` - keepdims=1 (true) + // axes = np.array([-1, 0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 1, + None, + &[[[60.]]], + false, + )?; + // Multiple axes - negative `axes` - keepdims=0 (false) + // axes = np.array([-1, 0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 0, + None, + 60., + false, + )?; + + // `noop_with_empty_axes = true (1)` should yield tensor equivallent to the input tensor + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 0, + Some(1), + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + false, + )?; + + // Rank-0 arrays are 
also valid + test(42., None, 0, None, 42., false)?; + test(42., None, 1, None, 42., false)?; + + // Negative test - expect error + // axes = np.array([-2, 0, 1], dtype=np.int64) + // np.maximum.reduce(data, axis=tuple(axes), keepdims=True) + // Should error out with `duplicate value in "axes"` + assert!(test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-2, 0, 1]), + 1, + None, + &[[[60.]]], + false + ) + .is_err()); + + // Negative test - expect error + // Should error out on empty set + assert!(test(&[[1_u8; 0]], Some(vec![-2, 0, 1]), 1, None, &[0.], false).is_err()); + + // Backward compatibility + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 0, + None, + 60., + true, + )?; + + fn test( + data: impl NdArray, + axes: Option>, + keepdims: i64, + noop_with_empty_axes: Option, + expected: impl NdArray, + backward_comp: bool, + ) -> Result<()> { + let has_axes = axes.is_some(); + + let att_keepdims = AttributeProto { + name: "keepdims".to_string(), + ref_attr_name: "keepdims".to_string(), + i: keepdims, + doc_string: "keepdims".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + let mut attribute = vec![att_keepdims]; + if let Some(noop) = noop_with_empty_axes { + if !has_axes { + let att_no_op_empty_axes = AttributeProto { + name: "noop_with_empty_axes".to_string(), + ref_attr_name: "noop_with_empty_axes".to_string(), + i: noop, + doc_string: "noop_with_empty_axes".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + attribute.push(att_no_op_empty_axes); + } + } + if has_axes && backward_comp { + attribute.push(AttributeProto { + name: "axes".to_string(), + ref_attr_name: "axes".to_string(), + i: 0, + doc_string: "axes".to_string(), + r#type: 7, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: axes.clone().unwrap_or_default(), + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }); + } + + let manual_graph = create_model_proto_with_graph(Some(GraphProto { + node: vec![NodeProto { + op_type: "ReduceMax".to_string(), + domain: "".to_string(), + attribute, + input: if has_axes && !backward_comp { + vec![INPUT_X.to_string(), INPUT_Y.to_string()] + } else { + vec![INPUT_X.to_string()] + }, + output: vec![OUTPUT_Z.to_string()], + name: "".to_string(), + doc_string: "".to_string(), + }], + name: "".to_string(), + initializer: vec![], + input: vec![], + output: vec![ValueInfoProto { + name: OUTPUT_Z.to_string(), + doc_string: "".to_string(), + r#type: None, + }], + value_info: vec![], + doc_string: "".to_string(), + sparse_initializer: vec![], + quantization_annotation: vec![], + })); + + let mut inputs: HashMap = HashMap::new(); + let input_tensor = Tensor::new(data, &Device::Cpu)?; + let input_dtype = input_tensor.dtype(); + inputs.insert(INPUT_X.to_string(), input_tensor); + if !backward_comp { + if let Some(a) = axes { + inputs.insert(INPUT_Y.to_string(), Tensor::new(a, &Device::Cpu)?); + } + } + + let eval = candle_onnx::simple_eval(&manual_graph, inputs)?; + 
assert_eq!(eval.len(), 1); + + let z = eval.get(OUTPUT_Z).expect("Output 'z' not found"); + + let expected = Tensor::new(expected, &Device::Cpu)?; + + match expected.dims().len() { + 0 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec0::()?, expected.to_vec0::()?) + } else { + assert_eq!(z.to_vec0::()?, expected.to_vec0::()?) + } + } + 1 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec1::()?, expected.to_vec1::()?) + } else { + assert_eq!(z.to_vec1::()?, expected.to_vec1::()?) + } + } + 2 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec2::()?, expected.to_vec2::()?) + } else { + assert_eq!(z.to_vec2::()?, expected.to_vec2::()?) + } + } + 3 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec3::()?, expected.to_vec3::()?) + } else { + assert_eq!(z.to_vec3::()?, expected.to_vec3::()?) + } + } + _ => unreachable!(), + }; + + Ok(()) + } + Ok(()) +} + +// "ReduceMin" +#[test] +fn test_reduce_min() -> Result<()> { + // Tests with random data generated with `np.random.uniform` + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-121 bool_inputs + // No special treatment reqired for bool + // `np.minimum.reduce(data, axis=axes, keepdims=True)` + test( + &[[1_u8, 1], [1, 0], [0, 1], [0, 0]], + Some(vec![1]), + 1, + None, + &[[1_u8], [0], [0], [0]], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-121 default_axes_keepdims + // `np.minimum.reduce(data, axis=None, keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + None, + 1, + None, + &[[[1.]]], + false, + )?; + // same as above but with random + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 1, + None, + &[[[-8.794852]]], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-121 default_axes_donot_keep_dims + // `np.minimum.reduce(data, axis=None, keepdims=False)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + None, + 0, + None, + 1., + false, + )?; + // same as above but with random + // `np.minimum.reduce(data, axis=None, keepdims=False)` + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 0, + None, + -8.794852, + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-121 keepdims + // `np.minimum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![1]), + 1, + None, + &[[[5., 1.]], [[30., 1.]], [[55., 1.]]], + false, + )?; + // keepdims with random data + // `np.minimum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + Some(vec![1]), + 1, + None, + &[ + [[-7.648377, -5.4018507]], + [[4.5435624, 3.072864]], + [[-2.5058026, -8.794852]], + ], + false, + )?; + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-121 negative_axes_keepdims + // axes = np.array([-1], dtype=np.int64) + // `np.minimum.reduce(data, axis=tuple(axes), keepdims=True)` + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + 
Some(vec![-1]), + 1, + None, + &[[[1.], [2.]], [[1.], [2.]], [[1.], [2.]]], + false, + )?; + // axes = np.array([-2], dtype=np.int64) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-2]), + 1, + None, + &[[[5., 1.]], [[30., 1.]], [[55., 1.]]], + false, + )?; + // with random + test( + &[ + [[-4.1676497, -2.7603748], [-4.5138783, -0.762791]], + [[-6.3792877, 7.1619177], [-9.958144, 6.3753467]], + [[9.046973, 3.4554052], [-5.4674335, 5.4642754]], + ], + Some(vec![-2]), + 1, + None, + &[ + [[-4.5138783, -2.7603748]], + [[-9.958144, 6.3753467]], + [[-5.4674335, 3.4554052]], + ], + false, + )?; + + // Multiple axes - keepdims=1 (true) + // axes = np.array([0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 1]), + 1, + None, + &[[[5., 1.]]], + false, + )?; + // axes = np.array([0, 2], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 2]), + 1, + None, + &[[[1.], [2.]]], + false, + )?; + // axes = np.array([2, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 1]), + 1, + None, + &[[[1.]], [[1.]], [[1.]]], + false, + )?; + // axes = np.array([2, 0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 0, 1]), + 1, + None, + &[[[1.]]], + false, + )?; + // Multiple axes - keepdims=0 (false) + // axes = np.array([0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 1]), + 0, + None, + &[5., 1.], + false, + )?; + // axes = np.array([0, 2], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![0, 2]), + 0, + None, + &[1., 2.], + false, + )?; + // axes = np.array([2, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 1]), + 0, + None, + &[1., 1., 1.], + false, + )?; + // axes = np.array([2, 0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=False) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![2, 0, 1]), + 0, + None, + 1., + false, + )?; + + // Multiple axes - negative `axes` - keepdims=1 (true) + // axes = np.array([-1, 0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 1, + None, + &[[[1.]]], + false, + )?; + // Multiple axes - negative `axes` - keepdims=0 (false) + // axes = np.array([-1, 0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 0, + None, + 1., + false, + )?; + + // 
`noop_with_empty_axes = true (1)` should yield tensor equivallent to the input tensor + test( + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + None, + 0, + Some(1), + &[ + [[-7.648377, -5.4018507], [-7.318765, 7.2374434]], + [[6.304022, 4.939862], [4.5435624, 3.072864]], + [[-2.5058026, 8.008944], [9.587318, -8.794852]], + ], + false, + )?; + + // Rank-0 tensors are also valid + test(42., None, 0, None, 42., false)?; + test(42., None, 1, None, 42., false)?; + + // Negative test - expect error + // axes = np.array([-2, 0, 1], dtype=np.int64) + // np.minimum.reduce(data, axis=tuple(axes), keepdims=True) + // Should error out with `duplicate value in "axes"` + assert!(test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-2, 0, 1]), + 1, + None, + &[0.], + false + ) + .is_err()); + + // Negative test - expect error + // Should error out on empty set + assert!(test(&[[1_u8; 0]], Some(vec![-2, 0, 1]), 1, None, &[0.], false).is_err()); + + // Backward compatibility + test( + &[ + [[5., 1.], [20., 2.]], + [[30., 1.], [40., 2.]], + [[55., 1.], [60., 2.]], + ], + Some(vec![-1, 0, 1]), + 0, + None, + 1., + true, + )?; + + fn test( + data: impl NdArray, + axes: Option>, + keepdims: i64, + noop_with_empty_axes: Option, + expected: impl NdArray, + backward_comp: bool, + ) -> Result<()> { + let has_axes = axes.is_some(); + + let att_keepdims = AttributeProto { + name: "keepdims".to_string(), + ref_attr_name: "keepdims".to_string(), + i: keepdims, + doc_string: "keepdims".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + let mut attribute = vec![att_keepdims]; + if let Some(noop) = noop_with_empty_axes { + if !has_axes { + let att_no_op_empty_axes = AttributeProto { + name: "noop_with_empty_axes".to_string(), + ref_attr_name: "noop_with_empty_axes".to_string(), + i: noop, + doc_string: "noop_with_empty_axes".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + attribute.push(att_no_op_empty_axes); + } + } + if has_axes && backward_comp { + attribute.push(AttributeProto { + name: "axes".to_string(), + ref_attr_name: "axes".to_string(), + i: 0, + doc_string: "axes".to_string(), + r#type: 7, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: axes.clone().unwrap_or_default(), + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }); + } + + let manual_graph = create_model_proto_with_graph(Some(GraphProto { + node: vec![NodeProto { + op_type: "ReduceMin".to_string(), + domain: "".to_string(), + attribute, + input: if has_axes && !backward_comp { + vec![INPUT_X.to_string(), INPUT_Y.to_string()] + } else { + vec![INPUT_X.to_string()] + }, + output: vec![OUTPUT_Z.to_string()], + name: "".to_string(), + doc_string: "".to_string(), + }], + name: "".to_string(), + initializer: vec![], + input: vec![], + output: vec![ValueInfoProto { + name: OUTPUT_Z.to_string(), + doc_string: "".to_string(), + r#type: None, + }], + value_info: vec![], + doc_string: 
"".to_string(), + sparse_initializer: vec![], + quantization_annotation: vec![], + })); + + let mut inputs: HashMap = HashMap::new(); + let input_tensor = Tensor::new(data, &Device::Cpu)?; + let input_dtype = input_tensor.dtype(); + inputs.insert(INPUT_X.to_string(), input_tensor); + if !backward_comp { + if let Some(a) = axes { + inputs.insert(INPUT_Y.to_string(), Tensor::new(a, &Device::Cpu)?); + } + } + + let eval = candle_onnx::simple_eval(&manual_graph, inputs)?; + assert_eq!(eval.len(), 1); + + let z = eval.get(OUTPUT_Z).expect("Output 'z' not found"); + + let expected = Tensor::new(expected, &Device::Cpu)?; + + match expected.dims().len() { + 0 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec0::()?, expected.to_vec0::()?) + } else { + assert_eq!(z.to_vec0::()?, expected.to_vec0::()?) + } + } + 1 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec1::()?, expected.to_vec1::()?) + } else { + assert_eq!(z.to_vec1::()?, expected.to_vec1::()?) + } + } + 2 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec2::()?, expected.to_vec2::()?) + } else { + assert_eq!(z.to_vec2::()?, expected.to_vec2::()?) + } + } + 3 => { + if input_dtype == DType::U8 { + assert_eq!(z.to_vec3::()?, expected.to_vec3::()?) + } else { + assert_eq!(z.to_vec3::()?, expected.to_vec3::()?) + } + } + _ => unreachable!(), + }; + + Ok(()) + } + Ok(()) +} + // "ReduceMean" #[test] fn test_reduce_mean() -> Result<()> { From dcd83336b68049763973709733bf2721a687507d Mon Sep 17 00:00:00 2001 From: Anubhab Bandyopadhyay <4890833+AnubhabB@users.noreply.github.com> Date: Thu, 17 Oct 2024 16:30:45 +0530 Subject: [PATCH 21/28] Testcases (#2567) --- candle-core/src/tensor.rs | 7 +- candle-core/tests/tensor_tests.rs | 274 ++++++++++++++++++++++++++++++ 2 files changed, 278 insertions(+), 3 deletions(-) diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs index 7dd24abf9b..e7355aadc5 100644 --- a/candle-core/src/tensor.rs +++ b/candle-core/src/tensor.rs @@ -1520,14 +1520,15 @@ impl Tensor { /// # Arguments /// /// * `self` - The input tensor. - /// * `indexes` - The indices of elements to gather, this should have the same shape as `self` - /// but can have a different number of elements on the target dimension. + /// * `indexes` - The indices of elements to gather, this should have same number of dimensions as `self` + /// and indexes.dims()[d] <= self.dims()[d] for all dimensions d != dim /// * `dim` - the target dimension. /// /// The resulting tensor has the same shape as `indexes` and use values from `self` indexed on /// dimension `dim` by the values in `indexes`. 
pub fn gather(&self, indexes: &Self, dim: D) -> Result { let dim = dim.to_index(self.shape(), "gather")?; + let self_dims = self.dims(); let indexes_dims = indexes.dims(); let mismatch = if indexes_dims.len() != self_dims.len() { @@ -1535,7 +1536,7 @@ impl Tensor { } else { let mut mismatch = false; for (i, (&d1, &d2)) in self_dims.iter().zip(indexes_dims.iter()).enumerate() { - if i != dim && d1 != d2 { + if i != dim && d1 < d2 { mismatch = true; break; } diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs index e0cea15c61..e3246a33a5 100644 --- a/candle-core/tests/tensor_tests.rs +++ b/candle-core/tests/tensor_tests.rs @@ -1047,6 +1047,280 @@ fn gather(device: &Device) -> Result<()> { let ids = Tensor::new(&[[0u32, 2u32, 0u32], [0u32, 1u32, 1u32]], device)?; let hs = t.gather(&ids, 0)?; assert_eq!(hs.to_vec2::()?, &[[0.0, 7.0, 2.0], [0.0, 4.0, 5.0]]); + + // Random data + + // Dim: 0 + let t = Tensor::new( + &[ + [ + [108_f32, -47., 16., -56., -83., -130., 210.], + [253., 95., 151., 228., -210., -123., -127.], + [-9., -217., 2., -78., 163., 245., -204.], + [-246., 79., -238., 88., -226., -184., 171.], + [8., -48., -153., 234., -34., 166., -153.], + [124., 0., -10., -61., -242., -15., -238.], + ], + [ + [12., -64., -199., 244., -240., 156., -128.], + [173., -57., 4., -198., 233., -110., 238.], + [95., 82., 0., 240., 53., -211., 209.], + [-122., 167., -212., 227., -144., 61., 118.], + [-63., -146., 200., 244., 168., -167., 116.], + [-125., -147., 110., -253., -178., -250., -18.], + ], + [ + [57., 86., -50., 56., 92., 205., -78.], + [-137., -156., -18., 248., -61., -239., 14.], + [-248., -30., -50., -70., -251., 250., -83.], + [-221., 67., 72., 59., -24., -154., 232.], + [-144., -23., -74., 5., 93., 171., 205.], + [46., -77., -38., -226., 246., 161., -17.], + ], + [ + [-153., -231., -236., 161., 126., 2., -22.], + [-229., -41., 209., 164., 234., 160., 57.], + [223., 254., -186., -162., -46., -160., -102.], + [65., 30., 213., -253., 59., 224., -154.], + [-82., -203., -177., 17., 31., -256., -246.], + [176., -135., -65., 54., -56., 210., 76.], + ], + [ + [-10., -245., 168., 124., -14., -33., -178.], + [25., -43., -39., 132., -89., 169., 179.], + [187., -215., 32., -133., 87., -7., -168.], + [-224., -215., -5., -230., -58., -162., 128.], + [158., -137., -122., -100., -202., -83., 136.], + [30., -185., -144., 250., 209., -40., 127.], + ], + [ + [-196., 108., -245., 122., 146., -228., 62.], + [-1., -66., 160., 137., 13., -172., -21.], + [244., 199., -164., 28., 119., -175., 198.], + [-62., 253., -162., 195., -95., -230., -211.], + [123., -72., -26., -107., -139., 64., 245.], + [11., -126., -182., 108., -12., 184., -127.], + ], + [ + [-159., 126., 176., 161., 73., -111., -138.], + [-187., 214., -217., -33., -223., -201., -212.], + [-61., -120., -166., -172., -95., 53., 196.], + [-33., 86., 134., -152., 154., -53., 74.], + [186., -28., -154., -174., 141., -109., 217.], + [82., 35., 252., 145., 181., 74., -87.], + ], + ], + device, + )?; + + let ids = Tensor::new( + &[ + [ + [6_u32, 6, 4, 3, 4, 4, 6], + [3, 3, 2, 4, 4, 4, 6], + [3, 3, 0, 2, 4, 6, 4], + [2, 5, 1, 2, 6, 6, 1], + [2, 1, 6, 5, 3, 2, 3], + [6, 1, 0, 1, 0, 2, 6], + ], + [ + [4, 6, 4, 3, 3, 3, 2], + [4, 3, 2, 4, 4, 4, 6], + [2, 3, 0, 2, 4, 6, 4], + [6, 5, 1, 2, 6, 6, 1], + [4, 1, 6, 5, 3, 2, 3], + [1, 1, 0, 1, 0, 2, 6], + ], + [ + [3, 6, 4, 3, 3, 3, 2], + [2, 3, 2, 4, 4, 4, 6], + [4, 3, 0, 2, 4, 6, 4], + [0, 5, 1, 2, 6, 6, 1], + [6, 1, 6, 5, 3, 2, 3], + [4, 1, 0, 1, 0, 2, 6], + ], + [ + [0, 6, 4, 3, 3, 3, 
2], + [5, 3, 2, 4, 4, 4, 6], + [0, 3, 0, 2, 4, 6, 4], + [3, 5, 1, 2, 6, 6, 1], + [0, 1, 6, 5, 3, 2, 3], + [3, 1, 0, 1, 0, 2, 6], + ], + ], + device, + )?; + + let hs = t.gather(&ids, 0)?; + assert_eq!( + hs.to_vec3::()?, + &[ + [ + [-159_f32, 126., 168., 161., -14., -33., -138.], + [-229., -41., -18., 132., -89., 169., -212.], + [223., 254., 2., -70., 87., 53., -168.], + [-221., 253., -212., 59., 154., -53., 118.], + [-144., -146., -154., -107., 31., 171., -246.], + [82., -147., -10., -253., -242., 161., -87.] + ], + [ + [-10., 126., 168., 161., 126., 2., -78.], + [25., -41., -18., 132., -89., 169., -212.], + [-248., 254., 2., -70., 87., 53., -168.], + [-33., 253., -212., 59., 154., -53., 118.], + [158., -146., -154., -107., 31., 171., -246.], + [-125., -147., -10., -253., -242., 161., -87.] + ], + [ + [-153., 126., 168., 161., 126., 2., -78.], + [-137., -41., -18., 132., -89., 169., -212.], + [187., 254., 2., -70., 87., 53., -168.], + [-246., 253., -212., 59., 154., -53., 118.], + [186., -146., -154., -107., 31., 171., -246.], + [30., -147., -10., -253., -242., 161., -87.] + ], + [ + [108., 126., 168., 161., 126., 2., -78.], + [-1., -41., -18., 132., -89., 169., -212.], + [-9., 254., 2., -70., 87., 53., -168.], + [65., 253., -212., 59., 154., -53., 118.], + [8., -146., -154., -107., 31., 171., -246.], + [176., -147., -10., -253., -242., 161., -87.] + ] + ] + ); + + // Dim: 1 + let t = Tensor::new( + &[ + [ + [-117_f32, -175., 69., -163.], + [200., 242., -21., -67.], + [179., 150., -126., -75.], + [-118., 38., -138., -13.], + [-221., 136., -185., 180.], + [58., 182., -204., -149.], + ], + [ + [3., -148., -58., -154.], + [-43., 45., -108., 4.], + [-69., -249., -71., -21.], + [80., 110., -152., -235.], + [-88., 7., 92., -250.], + [-186., 207., -242., 98.], + ], + [ + [238., 19., 64., -242.], + [-150., -97., 218., 58.], + [111., -233., 204., -212.], + [-242., -232., 83., 42.], + [153., 62., -251., 219.], + [-117., 36., -119., 10.], + ], + [ + [215., 159., -169., -27.], + [-83., 101., -88., 169.], + [-205., 93., 225., -64.], + [-162., 240., 214., 23.], + [-112., 6., 21., 245.], + [-38., 113., 93., 215.], + ], + [ + [91., -188., -148., 101.], + [74., 203., -35., 55.], + [-116., -130., -153., -96.], + [58., 22., -45., -194.], + [-221., -134., 73., 159.], + [-203., -254., 31., 235.], + ], + [ + [105., -53., 61., 186.], + [-195., 234., 75., -1.], + [51., 139., 160., -108.], + [-173., -167., 161., 19.], + [83., -246., 156., -222.], + [109., 39., -149., 137.], + ], + ], + device, + )?; + + let ids = Tensor::new( + &[ + [[4_u32, 4, 4, 2]], + [[0, 4, 4, 3]], + [[1, 5, 3, 4]], + [[0, 3, 3, 2]], + [[1, 1, 5, 2]], + [[1, 4, 5, 4]], + ], + device, + )?; + + let hs = t.gather(&ids, 1)?; + assert_eq!( + hs.to_vec3::()?, + &[ + [[-221., 136., -185., -75.]], + [[3., 7., 92., -235.]], + [[-150., 36., 83., 219.]], + [[215., 240., 214., -64.]], + [[74., 203., 31., -96.]], + [[-195., -246., -149., -222.]] + ] + ); + + // Dim: 2 + let t = Tensor::new( + &[ + [[-162_f32, 202.], [-126., -39.], [35., -65.], [1., 80.]], + [[37., 248.], [-191., 89.], [117., -40.], [-217., 220.]], + ], + device, + )?; + + let ids = Tensor::new(&[[[1_u32], [0], [1], [1]], [[0], [1], [0], [1]]], device)?; + + let hs = t.gather(&ids, 2)?; + assert_eq!( + hs.to_vec3::()?, + &[ + [[202.], [-126.], [-65.], [80.]], + [[37.], [89.], [117.], [220.]] + ] + ); + + let t = Tensor::new( + &[ + [[-21_f32, -197.], [194., 122.]], + [[255., -106.], [-191., 250.]], + [[33., -117.], [43., 10.]], + [[-130., 238.], [-217., -92.]], + ], + device, + )?; + + 
let ids = Tensor::new( + &[ + [[0_u32, 1], [1, 0]], + [[1, 0], [0, 1]], + [[0, 1], [0, 1]], + [[1, 0], [1, 0]], + ], + device, + )?; + + let hs = t.gather(&ids, 2)?; + assert_eq!( + hs.to_vec3::()?, + &[ + [[-21., -197.], [122., 194.]], + [[-106., 255.], [-191., 250.]], + [[33., -117.], [43., 10.]], + [[238., -130.], [-92., -217.]] + ] + ); + Ok(()) } From 7c09215ef443256523d2de2579db56d1b59fd683 Mon Sep 17 00:00:00 2001 From: Anubhab Bandyopadhyay <4890833+AnubhabB@users.noreply.github.com> Date: Thu, 17 Oct 2024 23:52:35 +0530 Subject: [PATCH 22/28] ONNX: GatherElements, Xor (#2568) --- candle-onnx/src/eval.rs | 53 ++++ candle-onnx/tests/ops.rs | 529 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 582 insertions(+) diff --git a/candle-onnx/src/eval.rs b/candle-onnx/src/eval.rs index 629b3f93d5..358af7acff 100644 --- a/candle-onnx/src/eval.rs +++ b/candle-onnx/src/eval.rs @@ -670,6 +670,49 @@ fn simple_eval_( }; values.insert(node.output[0].clone(), xs); } + // https://onnx.ai/onnx/operators/onnx__GatherElements.html#gatherelements + // A Note to fellow lurkers: + // The numpy based `gather_elements` implementation in `onnx` tests [here](https://github.com/onnx/onnx/blob/main/onnx/backend/test/case/node/gatherelements.py) + // and examples is incorrect. + // Use `torch.gather` for the validating/ verifying against the proper behaviour + "GatherElements" => { + let data = get(&node.input[0])?; + let indices = get(&node.input[1])?; + + let rank = data.rank(); + if rank != indices.rank() { + bail!("indices must have same rank as input data. Data rank [{}] != indices rank [{}]", data.rank(), indices.rank()); + } + + let axis = { + let axis_i64 = get_attr_opt::(node, "axis")?.copied().unwrap_or(0); + let axis = data.normalize_axis(axis_i64)?; + + if axis >= rank { + bail!( + "axis ({}) out of accepted range [-rank, rank-1] which was [-{rank}, {}]", + axis_i64, + rank - 1 + ) + } + + axis + }; + + // index_select does not support negative indices, so normalize them + // to positive indices. + let indices = &{ + let zeros = Tensor::zeros(indices.shape(), indices.dtype(), indices.device())?; + let max = Tensor::new(data.dims()[axis] as i64, indices.device())? + .to_dtype(indices.dtype())?; + let mask = indices.lt(&zeros)?; + mask.to_dtype(indices.dtype())? + .broadcast_mul(&max)? + .add(indices)? 
+ }; + + values.insert(node.output[0].clone(), data.gather(indices, axis)?); + } "Shape" => { // https://github.com/onnx/onnx/blob/main/docs/Operators.md#Shape let xs = get(&node.input[0])?; @@ -1891,6 +1934,16 @@ fn simple_eval_( ); } } + // https://onnx.ai/onnx/operators/onnx__Xor.html + "Xor" => { + // Since we don't have a `DType::Bool` yet, this ensures that we are working with `0`(False) & `1`(True) + let a = get(&node.input[0])?.gt(0_u8)?; + let b = get(&node.input[1])?.gt(0_u8)?; + + let out = a.broadcast_add(&b)?.eq(1_u8)?; + + values.insert(node.output[0].clone(), out); + } op_type => bail!("unsupported op_type {op_type} for op {node:?}"), } } diff --git a/candle-onnx/tests/ops.rs b/candle-onnx/tests/ops.rs index 450a9879e6..a84ba481ee 100644 --- a/candle-onnx/tests/ops.rs +++ b/candle-onnx/tests/ops.rs @@ -1159,6 +1159,163 @@ fn test_gather_operation() -> Result<()> { Ok(()) } +// GatherElements +#[test] +fn test_gather_elements() -> Result<()> { + // all the tests below are verified against `torch.gather()` + + // Rank 1 index + test(&[1.0, 2.0, 3.0, 4.0], &[3i64], 0, &[4.0])?; + + // Rank 2 index + test(&[[1.0, 2.0, 3.0, 4.0]], &[[3i64]], 1, &[[4.0]])?; + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-57 gather_elements_0 + test( + &[[1., 2.], [3., 4.]], + &[[0i64, 0], [1, 0]], + 1, + &[[1., 1.], [4., 3.]], + )?; + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-57 gather_elements_1 + test( + &[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]], + &[[1i64, 2, 0], [2, 0, 0]], + 0, + &[[4., 8., 3.], [7., 2., 3.]], + )?; + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#examples-57 gather_elements_negative_indices + test( + &[[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]], + &[[-1_i64, -2, 0], [-2, 0, 0]], + 0, + &[[7., 5., 3.], [4., 2., 3.]], + )?; + test( + &[[1.0], [2.0], [3.0], [4.0]], + &[[3i64], [2]], + 0, + &[[4.], [3.]], + )?; + + // Rank 3 + test( + &[ + [[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]], + [[9.0, 10.0], [11.0, 12.0]], + [[13.0, 14.0], [15.0, 16.0]], + ], + &[[[1i64]]], + 0, + &[[[5.]]], + )?; + + test( + &[ + [[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]], + [[9.0, 10.0], [11.0, 12.0]], + [[13.0, 14.0], [15.0, 16.0]], + ], + &[[[1i64]]], + 1, + &[[[3.]]], + )?; + + test( + &[ + [[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]], + [[9.0, 10.0], [11.0, 12.0]], + [[13.0, 14.0], [15.0, 16.0]], + ], + &[[[1i64], [0]]], + 2, + &[[[2.], [3.]]], + )?; + + // Error cases + // Invalid index + assert!(test(&[[1.0, 2.0, 3.0, 4.0]], &[[3i64]], 0, &[[1., 2., 3., 4.]]).is_err()); + // Invalid axis/ dim + assert!(test(&[[1.0, 2.0, 3.0, 4.0]], &[[3i64]], 2, &[[1., 2., 3., 4.]]).is_err()); + // Invalid rank + assert!(test(&[[1.0, 2.0, 3.0, 4.0]], &[3i64], 0, &[[1.]]).is_err()); + + fn test( + data: impl NdArray, + indices: impl NdArray, + axis: i64, + expected: impl NdArray, + ) -> Result<()> { + let att_axis = AttributeProto { + name: "axis".to_string(), + ref_attr_name: "axis".to_string(), + i: axis, + doc_string: "axis".to_string(), + r#type: 2, + f: 0.0, + s: vec![], + t: None, + g: None, + sparse_tensor: None, + tp: None, + floats: vec![], + ints: vec![], + strings: vec![], + tensors: vec![], + graphs: vec![], + sparse_tensors: vec![], + type_protos: vec![], + }; + + let manual_graph = create_model_proto_with_graph(Some(GraphProto { + node: vec![NodeProto { + op_type: "GatherElements".to_string(), + domain: "".to_string(), + attribute: vec![att_axis], + input: vec![INPUT_X.to_string(), INPUT_Y.to_string()], + 
output: vec![OUTPUT_Z.to_string()], + name: "".to_string(), + doc_string: "".to_string(), + }], + name: "".to_string(), + initializer: vec![], + input: vec![], + output: vec![ValueInfoProto { + name: OUTPUT_Z.to_string(), + doc_string: "".to_string(), + r#type: None, + }], + value_info: vec![], + doc_string: "".to_string(), + sparse_initializer: vec![], + quantization_annotation: vec![], + })); + + let mut inputs: HashMap = HashMap::new(); + inputs.insert(INPUT_X.to_string(), Tensor::new(data, &Device::Cpu)?); + inputs.insert(INPUT_Y.to_string(), Tensor::new(indices, &Device::Cpu)?); + + let eval = candle_onnx::simple_eval(&manual_graph, inputs)?; + assert_eq!(eval.len(), 1); + + let z = eval.get(OUTPUT_Z).expect("Output 'z' not found"); + let expected = Tensor::new(expected, &Device::Cpu)?; + match expected.dims().len() { + 0 => assert_eq!(z.to_vec0::()?, expected.to_vec0::()?), + 1 => assert_eq!(z.to_vec1::()?, expected.to_vec1::()?), + 2 => assert_eq!(z.to_vec2::()?, expected.to_vec2::()?), + 3 => assert_eq!(z.to_vec3::()?, expected.to_vec3::()?), + _ => unreachable!(), + }; + + Ok(()) + } + + Ok(()) +} + // "Size" #[test] fn test_size_operation() -> Result<()> { @@ -5340,3 +5497,375 @@ fn test_reduce_sum_do_not_keep_dims() -> Result<()> { Ok(()) } + +// Xor +#[test] +fn test_xor() -> Result<()> { + // tests based on: https://github.com/onnx/onnx/blob/main/docs/Operators.md#Xor xor + + // 2d + test( + &[[0_u8, 1, 0, 0], [0, 0, 1, 1], [0, 1, 1, 1]], + &[[1_u8, 1, 0, 0], [1, 0, 0, 1], [1, 1, 1, 0]], + &[[1_u8, 0, 0, 0], [1, 0, 1, 0], [1, 0, 0, 1]], + )?; + + // 3d + test( + &[ + [ + [0_u8, 1, 1, 1, 1], + [0, 1, 1, 0, 0], + [1, 1, 1, 1, 1], + [0, 0, 0, 0, 1], + ], + [ + [0, 0, 1, 1, 1], + [1, 0, 1, 1, 1], + [1, 1, 0, 0, 1], + [1, 0, 0, 1, 0], + ], + [ + [1, 0, 0, 1, 1], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 1], + [1, 0, 0, 0, 1], + ], + ], + &[ + [ + [1_u8, 0, 0, 1, 1], + [0, 0, 1, 0, 1], + [1, 0, 0, 1, 0], + [0, 0, 0, 0, 0], + ], + [ + [1, 0, 0, 1, 1], + [1, 0, 1, 1, 1], + [0, 1, 0, 1, 1], + [1, 1, 1, 0, 0], + ], + [ + [0, 1, 1, 1, 0], + [1, 1, 0, 1, 0], + [0, 1, 1, 1, 0], + [1, 1, 0, 1, 0], + ], + ], + &[ + [ + [1_u8, 1, 1, 0, 0], + [0, 1, 0, 0, 1], + [0, 1, 1, 0, 1], + [0, 0, 0, 0, 1], + ], + [ + [1, 0, 1, 0, 0], + [0, 0, 0, 0, 0], + [1, 0, 0, 1, 0], + [0, 1, 1, 1, 0], + ], + [ + [1, 1, 1, 0, 1], + [0, 0, 1, 1, 0], + [1, 0, 1, 1, 1], + [0, 1, 0, 1, 1], + ], + ], + )?; + + // 4d + test( + &[ + [ + [[0_u8, 1, 1, 0], [1, 0, 0, 0], [1, 1, 0, 1]], + [[1, 1, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1]], + ], + [ + [[1, 1, 0, 0], [1, 0, 1, 0], [1, 0, 0, 0]], + [[1, 0, 0, 1], [1, 0, 1, 1], [1, 1, 0, 1]], + ], + ], + &[ + [ + [[1_u8, 0, 1, 0], [0, 0, 1, 1], [1, 0, 1, 0]], + [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]], + ], + [ + [[1, 1, 1, 0], [0, 0, 0, 1], [0, 0, 1, 0]], + [[0, 0, 0, 0], [1, 0, 0, 0], [1, 1, 1, 1]], + ], + ], + &[ + [ + [[1_u8, 1, 0, 0], [1, 0, 1, 1], [0, 1, 1, 1]], + [[1, 0, 0, 1], [1, 0, 0, 1], [0, 0, 0, 0]], + ], + [ + [[0, 0, 1, 0], [1, 0, 1, 1], [1, 0, 1, 0]], + [[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 1, 0]], + ], + ], + )?; + + // tests based on: https://github.com/onnx/onnx/blob/main/docs/Operators.md#Xor xor_broadcast + // 3d vs 1d + test( + // Shape (3, 4, 5) + &[ + [ + [0_u8, 0, 0, 0, 1], + [0, 1, 0, 1, 1], + [1, 0, 0, 1, 1], + [0, 0, 1, 0, 1], + ], + [ + [0, 1, 0, 1, 1], + [1, 1, 0, 0, 1], + [0, 1, 1, 1, 0], + [0, 0, 0, 0, 1], + ], + [ + [1, 1, 0, 1, 1], + [0, 0, 0, 1, 1], + [0, 1, 1, 0, 1], + [1, 1, 0, 1, 1], + ], + ], + // shape (5) + &[1_u8, 0, 0, 1, 1], + // shape (3, 4, 5) + &[ + 
[ + [1_u8, 0, 0, 1, 0], + [1, 1, 0, 0, 0], + [0, 0, 0, 0, 0], + [1, 0, 1, 1, 0], + ], + [ + [1, 1, 0, 0, 0], + [0, 1, 0, 1, 0], + [1, 1, 1, 0, 1], + [1, 0, 0, 1, 0], + ], + [ + [0, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + [1, 1, 1, 1, 0], + [0, 1, 0, 0, 0], + ], + ], + )?; + + // 3d vs 2d + test( + // Shape (3, 4, 5) + &[ + [ + [0_u8, 0, 0, 0, 1], + [0, 1, 0, 1, 1], + [1, 0, 0, 1, 1], + [0, 0, 1, 0, 1], + ], + [ + [0, 1, 0, 1, 1], + [1, 1, 0, 0, 1], + [0, 1, 1, 1, 0], + [0, 0, 0, 0, 1], + ], + [ + [1, 1, 0, 1, 1], + [0, 0, 0, 1, 1], + [0, 1, 1, 0, 1], + [1, 1, 0, 1, 1], + ], + ], + // shape (4, 5) + &[ + [0_u8, 1, 0, 1, 0], + [0, 0, 1, 0, 0], + [1, 1, 0, 1, 1], + [1, 1, 0, 1, 0], + ], + // shape (3, 4, 5) + &[ + [ + [0_u8, 1, 0, 1, 1], + [0, 1, 1, 1, 1], + [0, 1, 0, 0, 0], + [1, 1, 1, 1, 1], + ], + [ + [0, 0, 0, 0, 1], + [1, 1, 1, 0, 1], + [1, 0, 1, 0, 1], + [1, 1, 0, 1, 1], + ], + [ + [1, 0, 0, 0, 1], + [0, 0, 1, 1, 1], + [1, 0, 1, 1, 0], + [0, 0, 0, 0, 1], + ], + ], + )?; + + // 4d vs 2d + test( + // Shape (2, 3, 3, 4) + &[ + [ + [[1_u8, 0, 0, 1], [1, 1, 0, 0], [0, 1, 0, 0]], + [[1, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 1]], + [[1, 0, 0, 0], [1, 1, 1, 0], [0, 0, 1, 1]], + ], + [ + [[0, 1, 0, 1], [1, 1, 0, 1], [1, 0, 1, 1]], + [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 1]], + [[1, 0, 0, 0], [1, 1, 0, 0], [0, 1, 0, 1]], + ], + ], + // shape (3, 4) + &[[0_u8, 0, 1, 1], [1, 1, 1, 1], [0, 1, 0, 1]], + // shape (2, 3, 3, 4) + &[ + [ + [[1_u8, 0, 1, 0], [0, 0, 1, 1], [0, 0, 0, 1]], + [[1, 1, 1, 1], [1, 0, 1, 1], [1, 1, 0, 0]], + [[1, 0, 1, 1], [0, 0, 0, 1], [0, 1, 1, 0]], + ], + [ + [[0, 1, 1, 0], [0, 0, 1, 0], [1, 1, 1, 0]], + [[1, 1, 1, 1], [0, 1, 1, 1], [0, 1, 1, 0]], + [[1, 0, 1, 1], [0, 0, 1, 1], [0, 0, 0, 0]], + ], + ], + )?; + + // 4d vs 3d + test( + // Shape (2, 3, 3, 4) + &[ + [ + [[1_u8, 0, 0, 1], [1, 1, 0, 0], [0, 1, 0, 0]], + [[1, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 1]], + [[1, 0, 0, 0], [1, 1, 1, 0], [0, 0, 1, 1]], + ], + [ + [[0, 1, 0, 1], [1, 1, 0, 1], [1, 0, 1, 1]], + [[1, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 1]], + [[1, 0, 0, 0], [1, 1, 0, 0], [0, 1, 0, 1]], + ], + ], + // shape (3, 3, 4) + &[ + [[1_u8, 1, 0, 0], [0, 0, 1, 1], [0, 1, 0, 0]], + [[0, 1, 0, 1], [0, 0, 0, 0], [0, 1, 0, 1]], + [[0, 1, 1, 0], [1, 0, 1, 1], [1, 1, 0, 1]], + ], + // shape (2, 3, 3, 4) + &[ + [ + [[0_u8, 1, 0, 1], [1, 1, 1, 1], [0, 0, 0, 0]], + [[1, 0, 0, 1], [0, 1, 0, 0], [1, 1, 0, 0]], + [[1, 1, 1, 0], [0, 1, 0, 1], [1, 1, 1, 0]], + ], + [ + [[1, 0, 0, 1], [1, 1, 1, 0], [1, 1, 1, 1]], + [[1, 0, 0, 1], [1, 0, 0, 0], [0, 1, 1, 0]], + [[1, 1, 1, 0], [0, 1, 1, 1], [1, 0, 0, 0]], + ], + ], + )?; + + // 4d vs 4d + test( + // Shape (1, 4, 1, 2) + &[[[[1_u8, 0]], [[1, 0]], [[1, 0]], [[1, 1]]]], + // shape (2, 1, 4, 2) + &[ + [[[0_u8, 0], [1, 1], [1, 1], [1, 1]]], + [[[0, 1], [1, 0], [0, 1], [0, 0]]], + ], + // shape (2, 4, 4, 2) + &[ + [ + [[1_u8, 0], [0, 1], [0, 1], [0, 1]], + [[1, 0], [0, 1], [0, 1], [0, 1]], + [[1, 0], [0, 1], [0, 1], [0, 1]], + [[1, 1], [0, 0], [0, 0], [0, 0]], + ], + [ + [[1, 1], [0, 0], [1, 1], [1, 0]], + [[1, 1], [0, 0], [1, 1], [1, 0]], + [[1, 1], [0, 0], [1, 1], [1, 0]], + [[1, 0], [0, 1], [1, 0], [1, 1]], + ], + ], + )?; + + fn test(input: impl NdArray, other: impl NdArray, expected: impl NdArray) -> Result<()> { + let manual_graph = create_model_proto_with_graph(Some(GraphProto { + node: vec![NodeProto { + op_type: "Xor".to_string(), + domain: "".to_string(), + attribute: vec![], + input: vec![INPUT_X.to_string(), INPUT_Y.to_string()], + output: vec![OUTPUT_Z.to_string()], + name: "".to_string(), + 
doc_string: "".to_string(), + }], + name: "".to_string(), + initializer: vec![], + input: vec![], + output: vec![ValueInfoProto { + name: OUTPUT_Z.to_string(), + doc_string: "".to_string(), + r#type: None, + }], + value_info: vec![], + doc_string: "".to_string(), + sparse_initializer: vec![], + quantization_annotation: vec![], + })); + + let inputs: HashMap = HashMap::from([ + (INPUT_X.to_string(), Tensor::new(input, &Device::Cpu)?), + (INPUT_Y.to_string(), Tensor::new(other, &Device::Cpu)?), + ]); + + let eval = candle_onnx::simple_eval(&manual_graph, inputs)?; + assert_eq!(eval.len(), 1); + + let z = eval.get(OUTPUT_Z).expect("Output 'z' not found"); + + let expected = Tensor::new(expected, &Device::Cpu)?; + + match expected.dims().len() { + 0 => { + assert_eq!(z.to_vec0::()?, expected.to_vec0::()?) + } + 1 => { + assert_eq!(z.to_vec1::()?, expected.to_vec1::()?) + } + 2 => { + assert_eq!(z.to_vec2::()?, expected.to_vec2::()?) + } + 3 => { + assert_eq!(z.to_vec3::()?, expected.to_vec3::()?) + } + 4 => { + // Candle has no method equivallent to `to_vec4()` + // So, as a hack, we flatten it to a single dim vec to test the results + assert_eq!( + z.flatten_all()?.to_vec1::()?, + expected.flatten_all()?.to_vec1::()? + ) + } + _ => unreachable!(), + }; + + Ok(()) + } + Ok(()) +} From a2e9d41b2062be5b45c84d24fe2bf4527ec27cee Mon Sep 17 00:00:00 2001 From: Zack Angelo Date: Wed, 23 Oct 2024 11:07:09 -0700 Subject: [PATCH 23/28] use softmax_last_dim (metal and cuda kernel) in llama attention layer (#2572) --- candle-transformers/src/models/llama.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/candle-transformers/src/models/llama.rs b/candle-transformers/src/models/llama.rs index a7bef099d6..e77697340e 100644 --- a/candle-transformers/src/models/llama.rs +++ b/candle-transformers/src/models/llama.rs @@ -341,7 +341,8 @@ impl CausalSelfAttention { let mask = cache.mask(seq_len)?.broadcast_as(att.shape())?; masked_fill(&att, &mask, f32::NEG_INFINITY)? }; - let att = candle_nn::ops::softmax(&att, D::Minus1)?; + + let att = candle_nn::ops::softmax_last_dim(&att)?; // Convert to contiguous as matmul doesn't support strided vs for now. att.matmul(&v.contiguous()?)?.to_dtype(in_dtype)? }; From 3699c1a053c2789775837552b2eec37afd436c7d Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sat, 26 Oct 2024 11:25:04 +0200 Subject: [PATCH 24/28] Fix the repo name for llama 3.1. (#2576) * Fix the repo name for llama 3.1. * Fix the book. 
--- Cargo.toml | 2 +- candle-book/src/inference/hub.md | 8 ++++---- candle-examples/examples/llama/main.rs | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d6cf18614f..bd6e1a856b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,7 +46,7 @@ criterion = { version = "0.5.1", default-features=false } cudarc = { version = "0.12.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false } fancy-regex = "0.13.0" gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] } -hf-hub = "0.3.0" +hf-hub = { version = "0.3.3", package = "candle-hf-hub" } half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] } hound = "3.5.1" image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] } diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md index e8d8b267db..fb6f9e51f6 100644 --- a/candle-book/src/inference/hub.md +++ b/candle-book/src/inference/hub.md @@ -11,8 +11,8 @@ Then let's start by downloading the [model file](https://huggingface.co/bert-bas ```rust # extern crate candle_core; -# extern crate hf_hub; -use hf_hub::api::sync::Api; +# extern crate candle_hf_hub; +use candle_hf_hub::api::sync::Api; use candle_core::Device; let api = Api::new().unwrap(); @@ -50,8 +50,8 @@ Now that we have our weights, we can use them in our bert architecture: ```rust # extern crate candle_core; # extern crate candle_nn; -# extern crate hf_hub; -# use hf_hub::api::sync::Api; +# extern crate candle_hf_hub; +# use candle_hf_hub::api::sync::Api; # # let api = Api::new().unwrap(); # let repo = api.model("bert-base-uncased".to_string()); diff --git a/candle-examples/examples/llama/main.rs b/candle-examples/examples/llama/main.rs index 7a555b00af..cc99b6c191 100644 --- a/candle-examples/examples/llama/main.rs +++ b/candle-examples/examples/llama/main.rs @@ -139,8 +139,8 @@ fn main() -> Result<()> { Which::V2 => "meta-llama/Llama-2-7b-hf".to_string(), Which::V3 => "meta-llama/Meta-Llama-3-8B".to_string(), Which::V3Instruct => "meta-llama/Meta-Llama-3-8B-Instruct".to_string(), - Which::V31 => "meta-llama/Meta-Llama-3.1-8B".to_string(), - Which::V31Instruct => "meta-llama/Meta-Llama-3.1-8B-Instruct".to_string(), + Which::V31 => "meta-llama/Llama-3.1-8B".to_string(), + Which::V31Instruct => "meta-llama/Llama-3.1-8B-Instruct".to_string(), Which::V32_1b => "meta-llama/Llama-3.2-1B".to_string(), Which::V32_1bInstruct => "meta-llama/Llama-3.2-1B-Instruct".to_string(), Which::V32_3b => "meta-llama/Llama-3.2-3B".to_string(), From 07849aa595c65309ed9230a4c97035f471c6afb1 Mon Sep 17 00:00:00 2001 From: sashaphmn Date: Sat, 26 Oct 2024 19:23:52 +0300 Subject: [PATCH 25/28] Update README.md (#2577) --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4c84a09185..246e2844ad 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ [![discord server](https://dcbadge.vercel.app/api/server/hugging-face-879548962464493619)](https://discord.gg/hugging-face-879548962464493619) [![Latest version](https://img.shields.io/crates/v/candle-core.svg)](https://crates.io/crates/candle-core) [![Documentation](https://docs.rs/candle-core/badge.svg)](https://docs.rs/candle-core) -![License](https://img.shields.io/crates/l/candle-core.svg) 
+[![License](https://img.shields.io/github/license/base-org/node?color=blue)](https://github.com/huggingface/candle/blob/main/LICENSE-MIT) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE) Candle is a minimalist ML framework for Rust with a focus on performance (including GPU support) and ease of use. Try our online demos: From 37e0ab8c64eb8219e32cf546ac2aa570ed3d1f82 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 27 Oct 2024 10:01:04 +0100 Subject: [PATCH 26/28] Stable diffusion 3.5 support. (#2578) * Stable diffusion 3.5 support. * Clippy fixes. * CFG fix. * Remove some unnecessary clones. * Avoid duplicating some of the code. --- .../examples/stable-diffusion-3/clip.rs | 50 ++++- .../examples/stable-diffusion-3/main.rs | 198 +++++++++++------- .../examples/stable-diffusion-3/sampling.rs | 2 +- candle-transformers/src/models/mmdit/model.rs | 14 ++ .../src/models/mmdit/projections.rs | 30 ++- 5 files changed, 209 insertions(+), 85 deletions(-) diff --git a/candle-examples/examples/stable-diffusion-3/clip.rs b/candle-examples/examples/stable-diffusion-3/clip.rs index 77263d968c..d198366a83 100644 --- a/candle-examples/examples/stable-diffusion-3/clip.rs +++ b/candle-examples/examples/stable-diffusion-3/clip.rs @@ -1,6 +1,7 @@ use anyhow::{Error as E, Ok, Result}; use candle::{DType, IndexOp, Module, Tensor, D}; use candle_transformers::models::{stable_diffusion, t5}; +use std::path::PathBuf; use tokenizers::tokenizer::Tokenizer; struct ClipWithTokenizer { @@ -130,6 +131,53 @@ pub struct StableDiffusion3TripleClipWithTokenizer { } impl StableDiffusion3TripleClipWithTokenizer { + pub fn new_split( + clip_g_file: &PathBuf, + clip_l_file: &PathBuf, + t5xxl_file: &PathBuf, + device: &candle::Device, + ) -> Result { + let vb_clip_g = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[clip_g_file], DType::F16, device)? + }; + let vb_clip_l = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[clip_l_file], DType::F16, device)? + }; + let vb_t5 = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[t5xxl_file], DType::F32, device)? + }; + let max_position_embeddings = 77usize; + let clip_l = ClipWithTokenizer::new( + vb_clip_l, + stable_diffusion::clip::Config::sdxl(), + "openai/clip-vit-large-patch14", + max_position_embeddings, + )?; + + let text_projection = + candle_nn::linear_no_bias(1280, 1280, vb_clip_g.pp("text_projection"))?; + + let clip_g = ClipWithTokenizer::new( + vb_clip_g, + stable_diffusion::clip::Config::sdxl2(), + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", + max_position_embeddings, + )?; + + // Current T5 implementation does not support fp16, so we use fp32 VarBuilder for T5. + // This is a temporary workaround until the T5 implementation is updated to support fp16. 
+ // Also see: + // https://github.com/huggingface/candle/issues/2480 + // https://github.com/huggingface/candle/pull/2481 + let t5 = T5WithTokenizer::new(vb_t5, max_position_embeddings)?; + Ok(Self { + clip_l, + clip_g, + clip_g_text_projection: text_projection, + t5, + }) + } + pub fn new(vb_fp16: candle_nn::VarBuilder, vb_fp32: candle_nn::VarBuilder) -> Result { let max_position_embeddings = 77usize; let clip_l = ClipWithTokenizer::new( @@ -158,7 +206,6 @@ impl StableDiffusion3TripleClipWithTokenizer { // https://github.com/huggingface/candle/issues/2480 // https://github.com/huggingface/candle/pull/2481 let t5 = T5WithTokenizer::new(vb_fp32.pp("t5xxl.transformer"), max_position_embeddings)?; - Ok(Self { clip_l, clip_g, @@ -195,7 +242,6 @@ impl StableDiffusion3TripleClipWithTokenizer { .encode_text_to_embedding(prompt, device)? .to_dtype(DType::F16)?; let context = Tensor::cat(&[&clip_embeddings_concat, &t5_embeddings], D::Minus2)?; - Ok((context, y)) } } diff --git a/candle-examples/examples/stable-diffusion-3/main.rs b/candle-examples/examples/stable-diffusion-3/main.rs index ee467839e8..702d8eec16 100644 --- a/candle-examples/examples/stable-diffusion-3/main.rs +++ b/candle-examples/examples/stable-diffusion-3/main.rs @@ -11,6 +11,25 @@ use crate::vae::{build_sd3_vae_autoencoder, sd3_vae_vb_rename}; use anyhow::{Ok, Result}; use clap::Parser; +#[derive(Clone, Debug, Copy, PartialEq, Eq, clap::ValueEnum)] +enum Which { + #[value(name = "3-medium")] + V3Medium, + #[value(name = "3.5-large")] + V3_5Large, + #[value(name = "3.5-large-turbo")] + V3_5LargeTurbo, +} + +impl Which { + fn is_3_5(&self) -> bool { + match self { + Self::V3Medium => false, + Self::V3_5Large | Self::V3_5LargeTurbo => true, + } + } +} + #[derive(Parser)] #[command(author, version, about, long_about = None)] struct Args { @@ -30,10 +49,6 @@ struct Args { #[arg(long)] cpu: bool, - /// The GPU device ID to use. - #[arg(long, default_value_t = 0)] - gpu_device_id: usize, - /// Enable tracing (generates a trace-timestamp.json file). #[arg(long)] tracing: bool, @@ -50,13 +65,17 @@ struct Args { #[arg(long, default_value_t = 1024)] width: usize, + /// The model to use. + #[arg(long, default_value = "3-medium")] + which: Which, + /// The seed to use when generating random samples. - #[arg(long, default_value_t = 28)] - num_inference_steps: usize, + #[arg(long)] + num_inference_steps: Option, // CFG scale. - #[arg(long, default_value_t = 4.0)] - cfg_scale: f64, + #[arg(long)] + cfg_scale: Option, // Time shift factor (alpha). #[arg(long, default_value_t = 3.0)] @@ -68,12 +87,6 @@ struct Args { } fn main() -> Result<()> { - let args = Args::parse(); - // Your main code here - run(args) -} - -fn run(args: Args) -> Result<()> { use tracing_chrome::ChromeLayerBuilder; use tracing_subscriber::prelude::*; @@ -81,7 +94,6 @@ fn run(args: Args) -> Result<()> { prompt, uncond_prompt, cpu, - gpu_device_id, tracing, use_flash_attn, height, @@ -90,7 +102,8 @@ fn run(args: Args) -> Result<()> { cfg_scale, time_shift, seed, - } = args; + which, + } = Args::parse(); let _guard = if tracing { let (chrome_layer, guard) = ChromeLayerBuilder::new().build(); @@ -100,87 +113,110 @@ fn run(args: Args) -> Result<()> { None }; - let device = if cpu { - candle::Device::Cpu - } else if candle::utils::cuda_is_available() { - candle::Device::new_cuda(gpu_device_id)? - } else if candle::utils::metal_is_available() { - candle::Device::new_metal(gpu_device_id)? 
- } else { - candle::Device::Cpu + let device = candle_examples::device(cpu)?; + let default_inference_steps = match which { + Which::V3_5Large => 28, + Which::V3_5LargeTurbo => 4, + Which::V3Medium => 28, + }; + let num_inference_steps = num_inference_steps.unwrap_or(default_inference_steps); + let default_cfg_scale = match which { + Which::V3_5Large => 4.0, + Which::V3_5LargeTurbo => 1.0, + Which::V3Medium => 4.0, }; + let cfg_scale = cfg_scale.unwrap_or(default_cfg_scale); let api = hf_hub::api::sync::Api::new()?; - let sai_repo = { - let name = "stabilityai/stable-diffusion-3-medium"; - api.repo(hf_hub::Repo::model(name.to_string())) - }; - let model_file = sai_repo.get("sd3_medium_incl_clips_t5xxlfp16.safetensors")?; - let vb_fp16 = unsafe { - candle_nn::VarBuilder::from_mmaped_safetensors(&[model_file.clone()], DType::F16, &device)? - }; + let (mmdit_config, mut triple, vb) = if which.is_3_5() { + let sai_repo = { + let name = match which { + Which::V3_5Large => "stabilityai/stable-diffusion-3.5-large", + Which::V3_5LargeTurbo => "stabilityai/stable-diffusion-3.5-large-turbo", + Which::V3Medium => unreachable!(), + }; + api.repo(hf_hub::Repo::model(name.to_string())) + }; + let clip_g_file = sai_repo.get("text_encoders/clip_g.safetensors")?; + let clip_l_file = sai_repo.get("text_encoders/clip_l.safetensors")?; + let t5xxl_file = sai_repo.get("text_encoders/t5xxl_fp16.safetensors")?; + let model_file = { + let model_file = match which { + Which::V3_5Large => "sd3.5_large.safetensors", + Which::V3_5LargeTurbo => "sd3.5_large_turbo.safetensors", + Which::V3Medium => unreachable!(), + }; + sai_repo.get(model_file)? + }; + let triple = StableDiffusion3TripleClipWithTokenizer::new_split( + &clip_g_file, + &clip_l_file, + &t5xxl_file, + &device, + )?; + let vb = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[model_file], DType::F16, &device)? + }; + (MMDiTConfig::sd3_5_large(), triple, vb) + } else { + let sai_repo = { + let name = "stabilityai/stable-diffusion-3-medium"; + api.repo(hf_hub::Repo::model(name.to_string())) + }; + let model_file = sai_repo.get("sd3_medium_incl_clips_t5xxlfp16.safetensors")?; + let vb_fp16 = unsafe { + candle_nn::VarBuilder::from_mmaped_safetensors(&[&model_file], DType::F16, &device)? + }; - let (context, y) = { let vb_fp32 = unsafe { - candle_nn::VarBuilder::from_mmaped_safetensors( - &[model_file.clone()], - DType::F32, - &device, - )? + candle_nn::VarBuilder::from_mmaped_safetensors(&[model_file], DType::F32, &device)? }; - let mut triple = StableDiffusion3TripleClipWithTokenizer::new( + let triple = StableDiffusion3TripleClipWithTokenizer::new( vb_fp16.pp("text_encoders"), vb_fp32.pp("text_encoders"), )?; - let (context, y) = triple.encode_text_to_embedding(prompt.as_str(), &device)?; - let (context_uncond, y_uncond) = - triple.encode_text_to_embedding(uncond_prompt.as_str(), &device)?; - ( - Tensor::cat(&[context, context_uncond], 0)?, - Tensor::cat(&[y, y_uncond], 0)?, - ) - }; - - let x = { - let mmdit = MMDiT::new( - &MMDiTConfig::sd3_medium(), - use_flash_attn, - vb_fp16.pp("model.diffusion_model"), - )?; - - if let Some(seed) = seed { - device.set_seed(seed)?; - } - let start_time = std::time::Instant::now(); - let x = sampling::euler_sample( - &mmdit, - &y, - &context, - num_inference_steps, - cfg_scale, - time_shift, - height, - width, - )?; - let dt = start_time.elapsed().as_secs_f32(); - println!( - "Sampling done. {num_inference_steps} steps. {:.2}s. 
Average rate: {:.2} iter/s", - dt, - num_inference_steps as f32 / dt - ); - x + (MMDiTConfig::sd3_medium(), triple, vb_fp16) }; + let (context, y) = triple.encode_text_to_embedding(prompt.as_str(), &device)?; + let (context_uncond, y_uncond) = + triple.encode_text_to_embedding(uncond_prompt.as_str(), &device)?; + let context = Tensor::cat(&[context, context_uncond], 0)?; + let y = Tensor::cat(&[y, y_uncond], 0)?; + + let mmdit = MMDiT::new( + &mmdit_config, + use_flash_attn, + vb.pp("model.diffusion_model"), + )?; + + if let Some(seed) = seed { + device.set_seed(seed)?; + } + let start_time = std::time::Instant::now(); + let x = sampling::euler_sample( + &mmdit, + &y, + &context, + num_inference_steps, + cfg_scale, + time_shift, + height, + width, + )?; + let dt = start_time.elapsed().as_secs_f32(); + println!( + "Sampling done. {num_inference_steps} steps. {:.2}s. Average rate: {:.2} iter/s", + dt, + num_inference_steps as f32 / dt + ); let img = { - let vb_vae = vb_fp16 - .clone() - .rename_f(sd3_vae_vb_rename) - .pp("first_stage_model"); + let vb_vae = vb.rename_f(sd3_vae_vb_rename).pp("first_stage_model"); let autoencoder = build_sd3_vae_autoencoder(vb_vae)?; // Apply TAESD3 scale factor. Seems to be significantly improving the quality of the image. // https://github.com/comfyanonymous/ComfyUI/blob/3c60ecd7a83da43d694e26a77ca6b93106891251/nodes.py#L721-L723 - autoencoder.decode(&((x.clone() / 1.5305)? + 0.0609)?)? + autoencoder.decode(&((x / 1.5305)? + 0.0609)?)? }; let img = ((img.clamp(-1f32, 1f32)? + 1.0)? * 127.5)?.to_dtype(candle::DType::U8)?; candle_examples::save_image(&img.i(0)?, "out.jpg")?; diff --git a/candle-examples/examples/stable-diffusion-3/sampling.rs b/candle-examples/examples/stable-diffusion-3/sampling.rs index 0efd160eba..cd881b6a2f 100644 --- a/candle-examples/examples/stable-diffusion-3/sampling.rs +++ b/candle-examples/examples/stable-diffusion-3/sampling.rs @@ -30,7 +30,7 @@ pub fn euler_sample( let timestep = (*s_curr) * 1000.0; let noise_pred = mmdit.forward( - &Tensor::cat(&[x.clone(), x.clone()], 0)?, + &Tensor::cat(&[&x, &x], 0)?, &Tensor::full(timestep as f32, (2,), x.device())?.contiguous()?, y, context, diff --git a/candle-transformers/src/models/mmdit/model.rs b/candle-transformers/src/models/mmdit/model.rs index 864b662377..5b5c90b0c3 100644 --- a/candle-transformers/src/models/mmdit/model.rs +++ b/candle-transformers/src/models/mmdit/model.rs @@ -36,6 +36,20 @@ impl Config { frequency_embedding_size: 256, } } + + pub fn sd3_5_large() -> Self { + Self { + patch_size: 2, + in_channels: 16, + out_channels: 16, + depth: 38, + head_size: 64, + adm_in_channels: 2048, + pos_embed_max_size: 192, + context_embed_size: 4096, + frequency_embedding_size: 256, + } + } } pub struct MMDiT { diff --git a/candle-transformers/src/models/mmdit/projections.rs b/candle-transformers/src/models/mmdit/projections.rs index dc1e8ec941..2775328596 100644 --- a/candle-transformers/src/models/mmdit/projections.rs +++ b/candle-transformers/src/models/mmdit/projections.rs @@ -56,6 +56,8 @@ impl QkvOnlyAttnProjections { pub struct AttnProjections { head_dim: usize, qkv: nn::Linear, + ln_k: Option, + ln_q: Option, proj: nn::Linear, } @@ -64,16 +66,42 @@ impl AttnProjections { let head_dim = dim / num_heads; let qkv = nn::linear(dim, dim * 3, vb.pp("qkv"))?; let proj = nn::linear(dim, dim, vb.pp("proj"))?; + let (ln_k, ln_q) = if vb.contains_tensor("ln_k.weight") { + let ln_k = candle_nn::rms_norm(head_dim, 1e-6, vb.pp("ln_k"))?; + let ln_q = candle_nn::rms_norm(head_dim, 1e-6, 
vb.pp("ln_q"))?; + (Some(ln_k), Some(ln_q)) + } else { + (None, None) + }; Ok(Self { head_dim, qkv, proj, + ln_k, + ln_q, }) } pub fn pre_attention(&self, x: &Tensor) -> Result { let qkv = self.qkv.forward(x)?; - split_qkv(&qkv, self.head_dim) + let Qkv { q, k, v } = split_qkv(&qkv, self.head_dim)?; + let q = match self.ln_q.as_ref() { + None => q, + Some(l) => { + let (b, t, h) = q.dims3()?; + l.forward(&q.reshape((b, t, (), self.head_dim))?)? + .reshape((b, t, h))? + } + }; + let k = match self.ln_k.as_ref() { + None => k, + Some(l) => { + let (b, t, h) = k.dims3()?; + l.forward(&k.reshape((b, t, (), self.head_dim))?)? + .reshape((b, t, h))? + } + }; + Ok(Qkv { q, k, v }) } pub fn post_attention(&self, x: &Tensor) -> Result { From 594d984f9cf79207f3beb6114ddf73cbc8427b56 Mon Sep 17 00:00:00 2001 From: Laurent Mazare Date: Sun, 27 Oct 2024 13:37:19 +0100 Subject: [PATCH 27/28] Support for UG kernels. (#2579) * Support for UG kernels. * Add a dedicated test. --- Cargo.toml | 2 + candle-core/Cargo.toml | 4 +- candle-core/src/cuda_backend/device.rs | 21 ++++++++ candle-core/src/custom_op.rs | 67 ++++++++++++++++++++++++++ candle-core/src/device.rs | 8 +++ candle-core/src/error.rs | 7 +++ candle-core/src/lib.rs | 2 +- candle-core/tests/custom_op_tests.rs | 30 ++++++++++++ 8 files changed, 139 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bd6e1a856b..64e1460ebe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,6 +70,8 @@ tokenizers = { version = "0.19.1", default-features = false } tracing = "0.1.37" tracing-chrome = "0.7.1" tracing-subscriber = "0.3.7" +ug = "0.0.2" +ug-cuda = "0.0.2" yoke = { version = "0.7.2", features = ["derive"] } zip = { version = "1.1.1", default-features = false } metal = { version = "0.27.0", features = ["mps"]} diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml index cbf8f2007f..8ea2b08c03 100644 --- a/candle-core/Cargo.toml +++ b/candle-core/Cargo.toml @@ -28,6 +28,8 @@ rand_distr = { workspace = true } rayon = { workspace = true } safetensors = { workspace = true } thiserror = { workspace = true } +ug = { workspace = true } +ug-cuda = { workspace = true, optional = true } yoke = { workspace = true } zip = { workspace = true } @@ -39,7 +41,7 @@ criterion = { workspace = true } [features] default = [] -cuda = ["cudarc", "dep:candle-kernels"] +cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"] cudnn = ["cuda", "cudarc/cudnn"] mkl = ["dep:libc", "dep:intel-mkl-src"] accelerate = ["dep:libc", "dep:accelerate-src"] diff --git a/candle-core/src/cuda_backend/device.rs b/candle-core/src/cuda_backend/device.rs index 89fe44a6e6..d3bd29030e 100644 --- a/candle-core/src/cuda_backend/device.rs +++ b/candle-core/src/cuda_backend/device.rs @@ -51,6 +51,27 @@ impl CudaDevice { self.device.clone() } + pub fn compile( + &self, + func_name: &'static str, + kernel: ug::lang::ssa::Kernel, + ) -> Result { + let mut buf = vec![]; + ug_cuda::code_gen::gen(&mut buf, func_name, &kernel)?; + let cuda_code = String::from_utf8(buf)?; + let opts = cudarc::nvrtc::CompileOptions { + use_fast_math: Some(true), + ..Default::default() + }; + let ptx = cudarc::nvrtc::safe::compile_ptx_with_opts(cuda_code, opts).w()?; + self.device.load_ptx(ptx, "ug", &[func_name]).w()?; + let func = match self.device.get_func("ug", func_name) { + Some(func) => func, + None => crate::bail!("unknown function ug::{func_name}"), + }; + Ok(func) + } + pub fn id(&self) -> DeviceId { self.id } diff --git a/candle-core/src/custom_op.rs b/candle-core/src/custom_op.rs index 
--- a/candle-core/src/custom_op.rs
+++ b/candle-core/src/custom_op.rs
@@ -375,3 +375,70 @@ impl Tensor {
         )
     }
 }
+
+pub struct UgIOp1 {
+    name: &'static str,
+    #[cfg(feature = "cuda")]
+    func: cudarc::driver::CudaFunction,
+}
+
+impl UgIOp1 {
+    #[allow(unused)]
+    pub fn new(
+        name: &'static str,
+        kernel: ug::lang::ssa::Kernel,
+        device: &crate::Device,
+    ) -> Result<Self> {
+        #[cfg(feature = "cuda")]
+        {
+            let device = device.as_cuda_device()?;
+            let func = device.compile(name, kernel)?;
+            Ok(Self { name, func })
+        }
+        #[cfg(not(feature = "cuda"))]
+        {
+            Ok(Self { name })
+        }
+    }
+}
+
+impl InplaceOp1 for UgIOp1 {
+    fn name(&self) -> &'static str {
+        self.name
+    }
+
+    fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
+        crate::bail!("ug ops are only supported on cuda at the moment")
+    }
+
+    fn metal_fwd(&self, _: &mut MetalStorage, _: &Layout) -> Result<()> {
+        crate::bail!("ug ops are only supported on cuda at the moment")
+    }
+
+    #[cfg(feature = "cuda")]
+    fn cuda_fwd(&self, sto: &mut CudaStorage, layout: &Layout) -> Result<()> {
+        use crate::cuda_backend::WrapErr;
+        use cudarc::driver::LaunchAsync;
+
+        let elem_count = layout.shape().elem_count();
+        // TODO: support more dtypes.
+        let sto = sto.as_cuda_slice::<f32>()?;
+        let sto = match layout.contiguous_offsets() {
+            None => crate::bail!("input has to be contiguous"),
+            Some((o1, o2)) => sto.slice(o1..o2),
+        };
+        let params = (&sto,);
+        let (g, b) = if elem_count % 32 == 0 {
+            (elem_count / 32, 32)
+        } else {
+            (elem_count, 1)
+        };
+        let cfg = cudarc::driver::LaunchConfig {
+            grid_dim: (g as u32, 1, 1),
+            block_dim: (b as u32, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        unsafe { self.func.clone().launch(cfg, params) }.w()?;
+        Ok(())
+    }
+}
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
index c4a8e9361e..91925b5781 100644
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -130,6 +130,14 @@ impl Device {
         Ok(Self::Cuda(crate::CudaDevice::new(ordinal)?))
     }
 
+    pub fn as_cuda_device(&self) -> Result<&crate::CudaDevice> {
+        match self {
+            Self::Cuda(d) => Ok(d),
+            Self::Cpu => crate::bail!("expected a cuda device, got cpu"),
+            Self::Metal(_) => crate::bail!("expected a cuda device, got Metal"),
+        }
+    }
+
     pub fn new_cuda_with_stream(ordinal: usize) -> Result<Self> {
         Ok(Self::Cuda(crate::CudaDevice::new_with_stream(ordinal)?))
     }
diff --git a/candle-core/src/error.rs b/candle-core/src/error.rs
index e7112e2e61..a35bec3cbe 100644
--- a/candle-core/src/error.rs
+++ b/candle-core/src/error.rs
@@ -165,6 +165,9 @@ pub enum Error {
     #[error("Metal error {0}")]
     Metal(#[from] MetalError),
 
+    #[error(transparent)]
+    Ug(#[from] ug::Error),
+
     #[error(transparent)]
     TryFromIntError(#[from] core::num::TryFromIntError),
 
@@ -179,6 +182,10 @@ pub enum Error {
     #[error(transparent)]
     ParseInt(#[from] std::num::ParseIntError),
 
+    /// Utf8 parse error.
+    #[error(transparent)]
+    FromUtf8(#[from] std::string::FromUtf8Error),
+
     /// I/O error.
     #[error(transparent)]
     Io(#[from] std::io::Error),
diff --git a/candle-core/src/lib.rs b/candle-core/src/lib.rs
index d8d6253213..39ca909d88 100644
--- a/candle-core/src/lib.rs
+++ b/candle-core/src/lib.rs
@@ -77,7 +77,7 @@ mod variable;
 pub use cuda_backend::cudnn;
 pub use cpu_backend::{CpuStorage, CpuStorageRef};
-pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3};
+pub use custom_op::{CustomOp1, CustomOp2, CustomOp3, InplaceOp1, InplaceOp2, InplaceOp3, UgIOp1};
 pub use device::{Device, DeviceLocation, NdArray};
 pub use dtype::{DType, DTypeParseError, FloatDType, IntDType, WithDType};
 pub use error::{Error, Result};
diff --git a/candle-core/tests/custom_op_tests.rs b/candle-core/tests/custom_op_tests.rs
index be59e0c0c3..f2c01aca8e 100644
--- a/candle-core/tests/custom_op_tests.rs
+++ b/candle-core/tests/custom_op_tests.rs
@@ -143,3 +143,33 @@ fn inplace_op1() -> Result<()> {
     );
     Ok(())
 }
+
+#[cfg(feature = "cuda")]
+#[allow(clippy::approx_constant)]
+#[test]
+fn ug_op() -> Result<()> {
+    let kernel = {
+        use ug::lang::op;
+
+        let layout = ug::Layout::from_shape(&[12]);
+        let ptr = op::Arg::ptr(ug::DType::F32);
+        let src = op::load(ptr.id(), layout.clone(), ug::DType::F32)?;
+        let src = op::unary(op::UnaryOp::Exp, src)?;
+        let st = op::store(ptr.id(), layout, src)?;
+        let kernel = op::Kernel::new("exp".to_string(), vec![ptr], vec![st]);
+        let opts: ug::lower_op::Opts = Default::default();
+        kernel.lower(&opts.with_global(0, 12))?
+    };
+    let device = Device::new_cuda(0)?;
+    let op = candle_core::UgIOp1::new("test", kernel, &device)?;
+    let t = Tensor::arange(0u32, 12u32, &device)?.to_dtype(DType::F32)?;
+    t.inplace_op1(&op)?;
+    assert_eq!(
+        to_vec1_round(&t, 4)?,
+        &[
+            1.0, 2.7183, 7.3891, 20.0855, 54.5982, 148.4132, 403.4287, 1096.6334, 2980.9578,
+            8103.0806, 22026.469, 59874.133
+        ]
+    );
+    Ok(())
+}

From 0e2c8c17fba0c1ba720e3bb50d2d4dec19cef07c Mon Sep 17 00:00:00 2001
From: Laurent Mazare
Date: Sun, 27 Oct 2024 15:20:37 +0100
Subject: [PATCH 28/28] UG metal integration. (#2580)
---
 Cargo.toml                              |  1 +
 candle-core/Cargo.toml                  |  3 +-
 candle-core/src/custom_op.rs            | 48 ++++++++++++++++++++++---
 candle-core/src/device.rs               |  8 +++++
 candle-core/src/metal_backend/device.rs | 22 ++++++++++++
 candle-core/tests/custom_op_tests.rs    | 16 ++++++---
 candle-metal-kernels/src/lib.rs         |  2 +-
 candle-metal-kernels/src/utils.rs       | 10 ++----
 8 files changed, 92 insertions(+), 18 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 64e1460ebe..f27ec93326 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -72,6 +72,7 @@ tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
 ug = "0.0.2"
 ug-cuda = "0.0.2"
+ug-metal = "0.0.2"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "1.1.1", default-features = false }
 metal = { version = "0.27.0", features = ["mps"]}
diff --git a/candle-core/Cargo.toml b/candle-core/Cargo.toml
index 8ea2b08c03..4ffc869ff8 100644
--- a/candle-core/Cargo.toml
+++ b/candle-core/Cargo.toml
@@ -30,6 +30,7 @@ safetensors = { workspace = true }
 thiserror = { workspace = true }
 ug = { workspace = true }
 ug-cuda = { workspace = true, optional = true }
+ug-metal = { workspace = true, optional = true }
 yoke = { workspace = true }
 zip = { workspace = true }
 
@@ -45,7 +46,7 @@ cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"]
 cudnn = ["cuda", "cudarc/cudnn"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:metal", "dep:candle-metal-kernels"]
+metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]
 
 [[bench]]
 name = "bench_main"
diff --git a/candle-core/src/custom_op.rs b/candle-core/src/custom_op.rs
index 276e3658e7..c0d97d670a 100644
--- a/candle-core/src/custom_op.rs
+++ b/candle-core/src/custom_op.rs
@@ -380,6 +380,8 @@ pub struct UgIOp1 {
     name: &'static str,
     #[cfg(feature = "cuda")]
     func: cudarc::driver::CudaFunction,
+    #[cfg(feature = "metal")]
+    func: metal::ComputePipelineState,
 }
 
 impl UgIOp1 {
@@ -395,7 +397,13 @@ impl UgIOp1 {
             let func = device.compile(name, kernel)?;
             Ok(Self { name, func })
         }
-        #[cfg(not(feature = "cuda"))]
+        #[cfg(feature = "metal")]
+        {
+            let device = device.as_metal_device()?;
+            let func = device.compile(name, kernel)?;
+            Ok(Self { name, func })
+        }
+        #[cfg(not(any(feature = "cuda", feature = "metal")))]
         {
             Ok(Self { name })
         }
@@ -408,11 +416,43 @@ impl InplaceOp1 for UgIOp1 {
     }
 
     fn cpu_fwd(&self, _: &mut CpuStorage, _: &Layout) -> Result<()> {
-        crate::bail!("ug ops are only supported on cuda at the moment")
+        crate::bail!("ug ops are only supported on metal/cuda at the moment")
    }
 
-    fn metal_fwd(&self, _: &mut MetalStorage, _: &Layout) -> Result<()> {
-        crate::bail!("ug ops are only supported on cuda at the moment")
+    #[cfg(feature = "metal")]
+    fn metal_fwd(&self, sto: &mut MetalStorage, layout: &Layout) -> Result<()> {
+        use crate::backend::BackendStorage;
+        use candle_metal_kernels::utils::EncoderProvider;
+
+        let elem_count = layout.shape().elem_count();
+        if sto.dtype() != crate::DType::F32 {
+            // TODO: support more dtypes.
+            crate::bail!("input is not a f32 tensor")
+        }
+        let device = sto.device();
+        println!("here");
+        let command_buffer = device.command_buffer()?;
+        let command_buffer = &command_buffer;
+        let encoder = command_buffer.encoder();
+        let encoder = encoder.as_ref();
+        encoder.set_compute_pipeline_state(&self.func);
+        let (g, b) = if elem_count % 32 == 0 {
+            (elem_count / 32, 32)
+        } else {
+            (elem_count, 1)
+        };
+        let grid_dims = metal::MTLSize {
+            width: g as u64,
+            height: 1,
+            depth: 1,
+        };
+        let group_dims = candle_metal_kernels::utils::get_block_dims(b as u64, 1, 1);
+        candle_metal_kernels::utils::set_param(encoder, 0, (sto.buffer(), 0usize));
+
+        encoder.use_resource(sto.buffer(), metal::MTLResourceUsage::Write);
+        encoder.dispatch_threads(grid_dims, group_dims);
+
+        Ok(())
     }
 
     #[cfg(feature = "cuda")]
diff --git a/candle-core/src/device.rs b/candle-core/src/device.rs
index 91925b5781..18aa61aff7 100644
--- a/candle-core/src/device.rs
+++ b/candle-core/src/device.rs
@@ -138,6 +138,14 @@ impl Device {
         }
     }
 
+    pub fn as_metal_device(&self) -> Result<&crate::MetalDevice> {
+        match self {
+            Self::Cuda(_) => crate::bail!("expected a metal device, got cuda"),
+            Self::Cpu => crate::bail!("expected a metal device, got cpu"),
+            Self::Metal(d) => Ok(d),
+        }
+    }
+
     pub fn new_cuda_with_stream(ordinal: usize) -> Result<Self> {
         Ok(Self::Cuda(crate::CudaDevice::new_with_stream(ordinal)?))
     }
diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs
index 29b8995bc9..46be6ce4bb 100644
--- a/candle-core/src/metal_backend/device.rs
+++ b/candle-core/src/metal_backend/device.rs
@@ -144,6 +144,28 @@ impl MetalDevice {
         self.use_mlx_mm = use_mlx_mm
     }
 
+    pub fn compile(
+        &self,
+        func_name: &'static str,
+        kernel: ug::lang::ssa::Kernel,
+    ) -> Result<ComputePipelineState> {
+        let mut buf = vec![];
+        ug_metal::code_gen::gen(&mut buf, func_name, &kernel)?;
+        let metal_code = String::from_utf8(buf)?;
+        let lib = self
+            .device
+            .new_library_with_source(&metal_code, &metal::CompileOptions::new())
+            .map_err(MetalError::from)?;
+        let func = lib
+            .get_function(func_name, None)
+            .map_err(MetalError::from)?;
+        let pl = self
+            .device
+            .new_compute_pipeline_state_with_function(&func)
+            .map_err(MetalError::from)?;
+        Ok(pl)
+    }
+
     pub fn id(&self) -> DeviceId {
         self.id
     }
diff --git a/candle-core/tests/custom_op_tests.rs b/candle-core/tests/custom_op_tests.rs
index f2c01aca8e..3572a4c9b2 100644
--- a/candle-core/tests/custom_op_tests.rs
+++ b/candle-core/tests/custom_op_tests.rs
@@ -144,7 +144,7 @@ fn inplace_op1() -> Result<()> {
     Ok(())
 }
 
-#[cfg(feature = "cuda")]
+#[cfg(any(feature = "cuda", feature = "metal"))]
 #[allow(clippy::approx_constant)]
 #[test]
 fn ug_op() -> Result<()> {
@@ -160,15 +160,21 @@ fn ug_op() -> Result<()> {
         let opts: ug::lower_op::Opts = Default::default();
         kernel.lower(&opts.with_global(0, 12))?
     };
-    let device = Device::new_cuda(0)?;
+    let device = if candle_core::utils::cuda_is_available() {
+        Device::new_cuda(0)?
+    } else if candle_core::utils::metal_is_available() {
+        Device::new_metal(0)?
+    } else {
+        candle_core::bail!("metal/cuda is mandatory for this test")
+    };
     let op = candle_core::UgIOp1::new("test", kernel, &device)?;
     let t = Tensor::arange(0u32, 12u32, &device)?.to_dtype(DType::F32)?;
     t.inplace_op1(&op)?;
     assert_eq!(
-        to_vec1_round(&t, 4)?,
+        to_vec1_round(&t, 2)?,
         &[
-            1.0, 2.7183, 7.3891, 20.0855, 54.5982, 148.4132, 403.4287, 1096.6334, 2980.9578,
-            8103.0806, 22026.469, 59874.133
+            1.0, 2.72, 7.39, 20.09, 54.6, 148.41, 403.43, 1096.63, 2980.96, 8103.08, 22026.47,
+            59874.13
         ]
     );
     Ok(())
diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs
index be6160093e..222ae8ad85 100644
--- a/candle-metal-kernels/src/lib.rs
+++ b/candle-metal-kernels/src/lib.rs
@@ -6,7 +6,7 @@ use std::collections::HashMap;
 use std::ffi::c_void;
 use std::sync::RwLock;
 
-mod utils;
+pub mod utils;
 pub use utils::BufferOffset;
 use utils::{get_block_dims, linear_split, EncoderProvider};
 
diff --git a/candle-metal-kernels/src/utils.rs b/candle-metal-kernels/src/utils.rs
index d2cc09f495..0092ecfa58 100644
--- a/candle-metal-kernels/src/utils.rs
+++ b/candle-metal-kernels/src/utils.rs
@@ -24,7 +24,7 @@ pub(crate) fn linear_split(pipeline: &ComputePipelineState, length: usize) -> (MTLSize, MTLSize) {
 }
 
 // https://github.com/ml-explore/mlx/blob/bddf23f175726a57f0e443cd45518c0757daa166/mlx/backend/metal/utils.h#L96
-pub(crate) fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize {
+pub fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize {
     let mut pows0 = 0u64;
     let mut pows1 = 0u64;
     let mut pows2 = 0u64;
@@ -61,18 +61,14 @@ pub(crate) fn get_block_dims(dim0: u64, dim1: u64, dim2: u64) -> MTLSize {
     }
 }
 
-pub(crate) fn set_param<P: EncoderParam>(
-    encoder: &ComputeCommandEncoderRef,
-    position: u64,
-    data: P,
-) {
+pub fn set_param<P: EncoderParam>(encoder: &ComputeCommandEncoderRef, position: u64, data: P) {
     <P as EncoderParam>::set_param(encoder, position, data)
 }
 
 /// Helper functions to create the various objects on the compute command encoder
 /// on a single line.
 /// Prevents getting wrong some arguments number and mixing length and size in bytes.
-pub(crate) trait EncoderParam {
+pub trait EncoderParam {
     fn set_param(encoder: &ComputeCommandEncoderRef, position: u64, data: Self);
 }
 macro_rules! primitive {