From 0d1ec0bb4a5a7059825c2fae9a83465a50093cad Mon Sep 17 00:00:00 2001 From: likholat Date: Tue, 24 Dec 2024 16:19:36 +0100 Subject: [PATCH 1/3] [Image Generation] Image2Image for FLUX --- .../image_generation/image2image_pipeline.hpp | 8 + .../src/image_generation/flux_pipeline.hpp | 142 +++++++++++------- .../image_generation/image2image_pipeline.cpp | 21 +++ .../src/image_generation/image_processor.cpp | 2 + .../schedulers/euler_ancestral_discrete.cpp | 2 +- .../schedulers/flow_match_euler_discrete.cpp | 45 ++++++ .../schedulers/flow_match_euler_discrete.hpp | 11 +- .../schedulers/ischeduler.hpp | 10 ++ 8 files changed, 182 insertions(+), 59 deletions(-) diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp index b086bd2aad..a945aa377d 100644 --- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp +++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp @@ -49,6 +49,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline { const UNet2DConditionModel& unet, const AutoencoderKL& vae); + // creates Flux pipeline from building blocks + static Image2ImagePipeline flux( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const T5EncoderModel t5_encoder_model, + const FluxTransformer2DModel& transformer, + const AutoencoderKL& vae); + ImageGenerationConfig get_generation_config() const; void set_generation_config(const ImageGenerationConfig& generation_config); diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index fb6888b191..547efa4415 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -27,17 +27,16 @@ ov::Tensor pack_latents(const ov::Tensor latents, size_t batch_size, size_t num_ // Permute to (0, 2, 4, 1, 3, 5) for (size_t b = 0; b < batch_size; ++b) { - for (size_t h2 = 0; h2 < h_half; ++h2) { - for (size_t w2 = 0; w2 < w_half; ++w2) { - for (size_t c = 0; c < num_channels_latents; ++c) { - for (size_t h3 = 0; h3 < 2; ++h3) { - for (size_t w3 = 0; w3 < 2; ++w3) { - size_t src_index = ((b * num_channels_latents + c) * h_half + h2) * 2 * w_half * 2 + (h3 * w_half + w2) * 2 + w3; - size_t dst_index = ((b * h_half + h2) * w_half + w2) * num_channels_latents * 4 + (c * 4 + h3 * 2 + w3); - - dst_data[dst_index] = src_data[src_index]; - } - } + for (size_t c = 0; c < num_channels_latents; ++c) { + for (size_t h2 = 0; h2 < h_half; ++h2) { + for (size_t w2 = 0; w2 < w_half; ++w2) { + size_t base_src_index = (b * num_channels_latents + c) * height * width + (h2 * 2 * width + w2 * 2); + size_t base_dst_index = (b * h_half * w_half + h2 * w_half + w2) * num_channels_latents * 4 + c * 4; + + dst_data[base_dst_index] = src_data[base_src_index]; + dst_data[base_dst_index + 1] = src_data[base_src_index + 1]; + dst_data[base_dst_index + 2] = src_data[base_src_index + width]; + dst_data[base_dst_index + 3] = src_data[base_src_index + width + 1]; } } } @@ -71,15 +70,14 @@ ov::Tensor unpack_latents(const ov::Tensor& latents, size_t height, size_t width for (size_t b = 0; b < batch_size; ++b) { for (size_t c4 = 0; c4 < c_quarter; ++c4) { for (size_t h2 = 0; h2 < h_half; ++h2) { - for (size_t h3 = 0; h3 < 2; ++h3) { - for (size_t w2 = 0; w2 < w_half; ++w2) { - for (size_t w3 = 0; w3 < 2; ++w3) { - size_t reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4 + h3 * 2 + w3; - size_t final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 + h3) * width + (w2 * 2 + w3); - - dst_data[final_index] = src_data[reshaped_index]; - } - } + for (size_t w2 = 0; w2 < w_half; ++w2) { + size_t base_reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4; + size_t base_final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 * width + w2 * 2); + + dst_data[base_final_index] = src_data[base_reshaped_index]; + dst_data[base_final_index + 1] = src_data[base_reshaped_index + 1]; + dst_data[base_final_index + width] = src_data[base_reshaped_index + 2]; + dst_data[base_final_index + width + 1] = src_data[base_reshaped_index + 3]; } } } @@ -111,7 +109,18 @@ namespace genai { class FluxPipeline : public DiffusionPipeline { public: - FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : DiffusionPipeline(pipeline_type) { + explicit FluxPipeline(PipelineType pipeline_type) : DiffusionPipeline(pipeline_type) { + // TODO: support GPU as well + const std::string device = "CPU"; + + if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { + const bool do_normalize = true, do_binarize = false, gray_scale_source = false; + m_image_processor = std::make_shared(device, do_normalize, do_binarize, gray_scale_source); + m_image_resizer = std::make_shared(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW); + } + } + + FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : FluxPipeline(pipeline_type) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); @@ -163,7 +172,7 @@ class FluxPipeline : public DiffusionPipeline { const std::filesystem::path& root_dir, const std::string& device, const ov::AnyMap& properties) - : DiffusionPipeline(pipeline_type) { + : FluxPipeline(pipeline_type) { const std::filesystem::path model_index_path = root_dir / "model_index.json"; std::ifstream file(model_index_path); OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path); @@ -216,11 +225,11 @@ class FluxPipeline : public DiffusionPipeline { const T5EncoderModel& t5_text_model, const FluxTransformer2DModel& transformer, const AutoencoderKL& vae) - : DiffusionPipeline(pipeline_type), - m_clip_text_encoder(std::make_shared(clip_text_model)), - m_t5_text_encoder(std::make_shared(t5_text_model)), - m_vae(std::make_shared(vae)), - m_transformer(std::make_shared(transformer)) { + : FluxPipeline(pipeline_type) { + m_clip_text_encoder = std::make_shared(clip_text_model); + m_t5_text_encoder = std::make_shared(t5_text_model); + m_vae = std::make_shared(vae); + m_transformer = std::make_shared(transformer); initialize_generation_config("FluxPipeline"); } @@ -288,6 +297,20 @@ class FluxPipeline : public DiffusionPipeline { m_transformer->set_hidden_states("img_ids", latent_image_ids); } + std::vector get_timesteps(size_t num_inference_steps, float strength) { + float init_timestep = std::min(static_cast(num_inference_steps) * strength, static_cast(num_inference_steps)); + size_t t_start = static_cast(std::max(static_cast(num_inference_steps) - init_timestep, 0.0f)); + + std::vector timesteps, m_scheduler_timesteps = m_scheduler->get_float_timesteps(); + for (size_t i = t_start; i < m_scheduler_timesteps.size(); ++i) { + timesteps.push_back(m_scheduler_timesteps[i]); + } + + m_scheduler->set_begin_index(t_start); + + return timesteps; + } + std::tuple prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); @@ -299,16 +322,22 @@ class FluxPipeline : public DiffusionPipeline { num_channels_latents, height, width}; - ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise; + ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latents, noise; if (initial_image) { - OPENVINO_THROW("StableDiffusion3 image to image is not implemented"); + proccesed_image = m_image_resizer->execute(initial_image, generation_config.height, generation_config.width); + proccesed_image = m_image_processor->execute(proccesed_image); + + image_latents = m_vae->encode(proccesed_image, generation_config.generator); + noise = generation_config.generator->randn_tensor(latent_shape); + m_scheduler->scale_noise(image_latents, m_latent_timestep, noise); + latent = pack_latents(image_latents, generation_config.num_images_per_prompt, num_channels_latents, height, width); } else { noise = generation_config.generator->randn_tensor(latent_shape); latent = pack_latents(noise, generation_config.num_images_per_prompt, num_channels_latents, height, width); } - return std::make_tuple(latent, proccesed_image, image_latent, noise); + return std::make_tuple(latent, proccesed_image, image_latents, noise); } void set_lora_adapters(std::optional adapters) override { @@ -341,24 +370,30 @@ class FluxPipeline : public DiffusionPipeline { compute_hidden_states(positive_prompt, m_custom_generation_config); - ov::Tensor latents, processed_image, image_latent, noise; - std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config); - - size_t image_seq_len = latents.get_shape()[1]; + size_t image_seq_len = (m_custom_generation_config.height / vae_scale_factor / 2) * + (m_custom_generation_config.width / vae_scale_factor / 2); float mu = m_scheduler->calculate_shift(image_seq_len); - float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps; std::vector sigmas = numpy_utils::linspace(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true); - m_scheduler->set_timesteps_with_sigma(sigmas, mu); - std::vector timesteps = m_scheduler->get_float_timesteps(); + + std::vector timesteps; + if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { + timesteps = m_scheduler->get_float_timesteps(); + m_latent_timestep = timesteps[0]; + } else { + timesteps = get_timesteps(m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength); + } + + ov::Tensor latents, processed_image, image_latent, noise; + std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config); // 6. Denoising loop ov::Tensor timestep(ov::element::f32, {1}); float* timestep_data = timestep.data(); for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) { - timestep_data[0] = timesteps[inference_step] / 1000; + timestep_data[0] = timesteps[inference_step] / 1000.0f; ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep); @@ -371,14 +406,15 @@ class FluxPipeline : public DiffusionPipeline { } latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor); + return m_vae->decode(latents); } ov::Tensor decode(const ov::Tensor latent) override { ov::Tensor unpacked_latent = unpack_latents(latent, - m_custom_generation_config.height, - m_custom_generation_config.width, - m_vae->get_vae_scale_factor()); + m_custom_generation_config.height, + m_custom_generation_config.width, + m_vae->get_vae_scale_factor()); return m_vae->decode(unpacked_latent); } @@ -415,20 +451,17 @@ class FluxPipeline : public DiffusionPipeline { m_generation_config = ImageGenerationConfig(); - if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) { - m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; - m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; - } - if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { m_generation_config.guidance_scale = 3.5f; m_generation_config.num_inference_steps = 28; m_generation_config.strength = 1.0f; } else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { - m_generation_config.guidance_scale = 3.5f; + m_generation_config.guidance_scale = 7.0f; m_generation_config.num_inference_steps = 28; - m_generation_config.strength = 1.0f; + m_generation_config.strength = 0.6f; + m_generation_config.height = 1024; + m_generation_config.width = 1024; } m_generation_config.max_sequence_length = 512; } else { @@ -438,7 +471,6 @@ class FluxPipeline : public DiffusionPipeline { void check_image_size(const int height, const int width) const override { assert(m_transformer != nullptr); - // const size_t vae_scale_factor = m_transformer->get_vae_scale_factor(); const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && (width % vae_scale_factor == 0 || width < 0), "Both 'width' and 'height' must be divisible by ", @@ -456,14 +488,6 @@ class FluxPipeline : public DiffusionPipeline { OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by FluxPipeline"); if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) { - ov::Shape initial_image_shape = initial_image.get_shape(); - size_t height = initial_image_shape[1], width = initial_image_shape[2]; - - OPENVINO_ASSERT(generation_config.height == height, - "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same"); - OPENVINO_ASSERT(generation_config.width == width, - "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same"); - OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f, "'Strength' generation parameter must be withion [0, 1] range"); } else { @@ -476,7 +500,11 @@ class FluxPipeline : public DiffusionPipeline { std::shared_ptr m_clip_text_encoder = nullptr; std::shared_ptr m_t5_text_encoder = nullptr; std::shared_ptr m_vae = nullptr; + std::shared_ptr m_image_processor = nullptr, m_mask_processor_rgb = nullptr, m_mask_processor_gray = nullptr; + std::shared_ptr m_image_resizer = nullptr, m_mask_resizer = nullptr; ImageGenerationConfig m_custom_generation_config; + + float m_latent_timestep = -1; }; } // namespace genai diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp index 8537e56ad5..7b00df678f 100644 --- a/src/cpp/src/image_generation/image2image_pipeline.cpp +++ b/src/cpp/src/image_generation/image2image_pipeline.cpp @@ -9,6 +9,7 @@ #include "image_generation/stable_diffusion_pipeline.hpp" #include "image_generation/stable_diffusion_xl_pipeline.hpp" +#include "image_generation/flux_pipeline.hpp" #include "utils.hpp" @@ -22,6 +23,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir) m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir); } else if (class_name == "StableDiffusionXLPipeline") { m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir); + } else if (class_name == "FluxPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir); } else { OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'"); } @@ -34,6 +37,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir, m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties); } else if (class_name == "StableDiffusionXLPipeline") { m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties); + } else if (class_name == "FluxPipeline") { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties); } else { OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'"); } @@ -44,6 +49,8 @@ Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) { m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl); } else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) { m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion); + } else if (auto flux = std::dynamic_pointer_cast(pipe.m_impl); flux != nullptr) { + m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *flux); } else { OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline"); } @@ -94,6 +101,20 @@ Image2ImagePipeline Image2ImagePipeline::stable_diffusion_xl( return Image2ImagePipeline(impl); } +Image2ImagePipeline Image2ImagePipeline::flux( + const std::shared_ptr& scheduler, + const CLIPTextModel& clip_text_model, + const T5EncoderModel t5_encoder_model, + const FluxTransformer2DModel& transformer, + const AutoencoderKL& vae){ + auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, t5_encoder_model, transformer, vae); + + assert(scheduler != nullptr); + impl->set_scheduler(scheduler); + + return Image2ImagePipeline(impl); +} + ImageGenerationConfig Image2ImagePipeline::get_generation_config() const { return m_impl->get_generation_config(); } diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp index 3dabf888ab..bd06c9b893 100644 --- a/src/cpp/src/image_generation/image_processor.cpp +++ b/src/cpp/src/image_generation/image_processor.cpp @@ -32,6 +32,7 @@ IImageProcessor::IImageProcessor(const std::string& device) : } ov::Tensor IImageProcessor::execute(ov::Tensor image) { + OPENVINO_ASSERT(m_request, "ImageProcessor model must be compiled first. Cannot infer non-compiled model"); m_request.set_input_tensor(image); m_request.infer(); return m_request.get_output_tensor(); @@ -124,6 +125,7 @@ ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov } ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) { + OPENVINO_ASSERT(m_request, "ImageResizer model must be compiled first. Cannot infer non-compiled model"); ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2}); target_spatial_tensor.data()[0] = dst_height; target_spatial_tensor.data()[1] = dst_width; diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp index 383fece163..5f711f29ac 100644 --- a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp @@ -208,7 +208,7 @@ std::map EulerAncestralDiscreteScheduler::step(ov::Tens return {{"latent", prev_sample}, {"denoised", pred_original_sample}}; } -size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{ +size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const { for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) { if (timestep == m_schedule_timesteps[i]) { return i; diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp index 265a561869..17e50ddc04 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp @@ -146,6 +146,43 @@ void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tens OPENVINO_THROW("Not implemented"); } +size_t FlowMatchEulerDiscreteScheduler::_index_for_timestep(float timestep) { + if (m_schedule_timesteps.empty()) { + m_schedule_timesteps = m_timesteps; + } + + for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) { + if (timestep == m_schedule_timesteps[i]) { + return i; + } + } + + OPENVINO_THROW("Failed to find index for timestep ", timestep); +} + +void FlowMatchEulerDiscreteScheduler::scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) { + OPENVINO_ASSERT(timestep == -1, "Timestep is not computed yet"); + + size_t index_for_timestep; + if (m_begin_index == -1) { + index_for_timestep = _index_for_timestep(timestep); + } else if (m_step_index != -1) { + index_for_timestep = m_step_index; + } else { + index_for_timestep = m_begin_index; + } + + const float sigma = m_sigmas[index_for_timestep]; + + float * sample_data = sample.data(); + const float * noise_data = noise.data(); + + for (size_t i = 0; i < sample.get_size(); ++i) { + sample_data[i] = sigma * noise_data[i] + (1.0f - sigma) * sample_data[i]; + } + +} + void FlowMatchEulerDiscreteScheduler::set_timesteps_with_sigma(std::vector sigma, float mu) { m_timesteps.clear(); m_sigmas.clear(); @@ -184,5 +221,13 @@ float FlowMatchEulerDiscreteScheduler::calculate_shift(size_t image_seq_len) { return mu; } +void FlowMatchEulerDiscreteScheduler::set_begin_index(size_t begin_index) { + m_begin_index = begin_index; +} + +size_t FlowMatchEulerDiscreteScheduler::get_begin_index() { + return m_begin_index; +} + } // namespace genai } // namespace ov diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp index 6399290ff3..b087a8cb86 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp @@ -42,13 +42,21 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler { void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override; + void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) override; + float calculate_shift(size_t image_seq_len) override; + void set_begin_index(size_t begin_index) override; + + size_t get_begin_index() override; + + + private: Config m_config; std::vector m_sigmas; - std::vector m_timesteps; + std::vector m_timesteps, m_schedule_timesteps; float m_sigma_min, m_sigma_max; size_t m_step_index, m_begin_index; @@ -56,6 +64,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler { void init_step_index(); double sigma_to_t(double simga); + size_t _index_for_timestep(float timestep); }; } // namespace genai diff --git a/src/cpp/src/image_generation/schedulers/ischeduler.hpp b/src/cpp/src/image_generation/schedulers/ischeduler.hpp index 2dadd59b1b..ff6807d2f8 100644 --- a/src/cpp/src/image_generation/schedulers/ischeduler.hpp +++ b/src/cpp/src/image_generation/schedulers/ischeduler.hpp @@ -43,6 +43,16 @@ class IScheduler : public Scheduler { virtual std::vector get_float_timesteps() const { OPENVINO_THROW("Scheduler doesn't support float timesteps"); } + + virtual void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) { + OPENVINO_THROW("Scheduler doesn't support `scale_noise` method"); + } + + virtual void set_begin_index(size_t begin_index) {}; + + virtual size_t get_begin_index() { + OPENVINO_THROW("Scheduler doesn't support `get_begin_index` method"); + } }; } // namespace genai From b06c5678ebdeb3967682adb3b2bbab94041a3119 Mon Sep 17 00:00:00 2001 From: likholat Date: Thu, 23 Jan 2025 14:58:10 +0100 Subject: [PATCH 2/3] review fixes --- SUPPORTED_MODELS.md | 2 +- src/cpp/src/image_generation/flux_pipeline.hpp | 14 +++----------- .../schedulers/flow_match_euler_discrete.hpp | 2 -- src/python/openvino_genai/py_openvino_genai.pyi | 3 +++ src/python/py_image_generation_pipelines.cpp | 1 + tools/who_what_benchmark/tests/test_cli_image.py | 7 +++++-- 6 files changed, 13 insertions(+), 16 deletions(-) diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md index 3064fb58c1..c5c55b8d73 100644 --- a/SUPPORTED_MODELS.md +++ b/SUPPORTED_MODELS.md @@ -242,7 +242,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel` Flux Supported - Not supported + Supported Not supported
    diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp index 547efa4415..83acdeb2fb 100644 --- a/src/cpp/src/image_generation/flux_pipeline.hpp +++ b/src/cpp/src/image_generation/flux_pipeline.hpp @@ -429,15 +429,6 @@ class FluxPipeline : public DiffusionPipeline { const size_t vae_scale_factor = m_vae->get_vae_scale_factor(); const auto& transformer_config = m_transformer->get_config(); - // in case of image to image generation_config_value is just ignored and computed based on initial image - if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) { - OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline"); - ov::Shape shape = initial_image.get_shape(); - int64_t dim_val = shape[dim_idx]; - - generation_config_value = dim_val - (dim_val % vae_scale_factor); - } - if (generation_config_value < 0) generation_config_value = transformer_config.m_default_sample_size * vae_scale_factor; } @@ -451,6 +442,9 @@ class FluxPipeline : public DiffusionPipeline { m_generation_config = ImageGenerationConfig(); + m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor; + m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor; + if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) { if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) { m_generation_config.guidance_scale = 3.5f; @@ -460,8 +454,6 @@ class FluxPipeline : public DiffusionPipeline { m_generation_config.guidance_scale = 7.0f; m_generation_config.num_inference_steps = 28; m_generation_config.strength = 0.6f; - m_generation_config.height = 1024; - m_generation_config.width = 1024; } m_generation_config.max_sequence_length = 512; } else { diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp index b087a8cb86..e4c9fb2d87 100644 --- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp +++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp @@ -50,8 +50,6 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler { size_t get_begin_index() override; - - private: Config m_config; diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index bba366401e..9408220a64 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -774,6 +774,9 @@ class Image2ImagePipeline: This class is used for generation with image-to-image models. """ @staticmethod + def flux(scheduler: Scheduler, clip_text_model: CLIPTextModel, t5_encoder_model: T5EncoderModel, transformer: FluxTransformer2DModel, vae: AutoencoderKL) -> Image2ImagePipeline: + ... + @staticmethod def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline: ... @staticmethod diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp index b011aee878..dcc50234ed 100644 --- a/src/python/py_image_generation_pipelines.cpp +++ b/src/python/py_image_generation_pipelines.cpp @@ -330,6 +330,7 @@ void init_image_generation_pipelines(py::module_& m) { .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("latent_consistency_model", &ov::genai::Image2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae")) .def_static("stable_diffusion_xl", &ov::genai::Image2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae")) + .def_static("flux", &ov::genai::Image2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae")) .def( "compile", [](ov::genai::Image2ImagePipeline& pipe, diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index 1ad8236058..156d87e05c 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -103,8 +103,11 @@ def test_image_model_types(model_id, model_type, backend): ])), ) def test_image_model_genai(model_id, model_type): - if ("flux" in model_id or "stable-diffusion-3" in model_id) and model_type != "text-to-image": - pytest.skip(reason="FLUX or SD3 are supported as text to image only") + if ("stable-diffusion-3" in model_id) and model_type != "text-to-image": + pytest.skip(reason="SD3 is supported as text to image only") + + if ("flux" in model_id) and model_type != "image-inpainting": + pytest.skip(reason="FLUX is not yet supported as image inpainting") with tempfile.TemporaryDirectory() as temp_dir: GT_FILE = os.path.join(temp_dir, "gt.csv") From e8c95b7bcf9ede52e4affdcc02b5872b0bb1f84b Mon Sep 17 00:00:00 2001 From: Ilya Lavrenov Date: Fri, 24 Jan 2025 00:12:53 +0400 Subject: [PATCH 3/3] Update tools/who_what_benchmark/tests/test_cli_image.py --- tools/who_what_benchmark/tests/test_cli_image.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py index 156d87e05c..3edcc70636 100644 --- a/tools/who_what_benchmark/tests/test_cli_image.py +++ b/tools/who_what_benchmark/tests/test_cli_image.py @@ -106,7 +106,7 @@ def test_image_model_genai(model_id, model_type): if ("stable-diffusion-3" in model_id) and model_type != "text-to-image": pytest.skip(reason="SD3 is supported as text to image only") - if ("flux" in model_id) and model_type != "image-inpainting": + if ("flux" in model_id) and model_type == "image-inpainting": pytest.skip(reason="FLUX is not yet supported as image inpainting") with tempfile.TemporaryDirectory() as temp_dir: