From 0d1ec0bb4a5a7059825c2fae9a83465a50093cad Mon Sep 17 00:00:00 2001
From: likholat <anna.likholat@intel.com>
Date: Tue, 24 Dec 2024 16:19:36 +0100
Subject: [PATCH 1/3] [Image Generation] Image2Image for FLUX

---
 .../image_generation/image2image_pipeline.hpp |   8 +
 .../src/image_generation/flux_pipeline.hpp    | 142 +++++++++++-------
 .../image_generation/image2image_pipeline.cpp |  21 +++
 .../src/image_generation/image_processor.cpp  |   2 +
 .../schedulers/euler_ancestral_discrete.cpp   |   2 +-
 .../schedulers/flow_match_euler_discrete.cpp  |  45 ++++++
 .../schedulers/flow_match_euler_discrete.hpp  |  11 +-
 .../schedulers/ischeduler.hpp                 |  10 ++
 8 files changed, 182 insertions(+), 59 deletions(-)
diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
index b086bd2aad..a945aa377d 100644
--- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
@@ -49,6 +49,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
         const UNet2DConditionModel& unet,
         const AutoencoderKL& vae);
 
+    // creates Flux image-to-image pipeline from building blocks (scheduler, CLIP / T5 text encoders, transformer, VAE)
+    static Image2ImagePipeline flux(
+        const std::shared_ptr<Scheduler>& scheduler,
+        const CLIPTextModel& clip_text_model,
+        const T5EncoderModel t5_encoder_model,  // NOTE(review): passed by value unlike its siblings - confirm a const reference was not intended
+        const FluxTransformer2DModel& transformer,
+        const AutoencoderKL& vae);
+
     ImageGenerationConfig get_generation_config() const;
     void set_generation_config(const ImageGenerationConfig& generation_config);
 
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index fb6888b191..547efa4415 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -27,17 +27,16 @@ ov::Tensor pack_latents(const ov::Tensor latents, size_t batch_size, size_t num_
 
     // Permute to (0, 2, 4, 1, 3, 5)
     for (size_t b = 0; b < batch_size; ++b) {
-        for (size_t h2 = 0; h2 < h_half; ++h2) {
-            for (size_t w2 = 0; w2 < w_half; ++w2) {
-                for (size_t c = 0; c < num_channels_latents; ++c) {
-                    for (size_t h3 = 0; h3 < 2; ++h3) {
-                        for (size_t w3 = 0; w3 < 2; ++w3) {
-                            size_t src_index = ((b * num_channels_latents + c) * h_half + h2) * 2 * w_half * 2 + (h3 * w_half + w2) * 2 + w3;
-                            size_t dst_index = ((b * h_half + h2) * w_half + w2) * num_channels_latents * 4 + (c * 4 + h3 * 2 + w3);
-
-                            dst_data[dst_index] = src_data[src_index];
-                        }
-                    }
+        for (size_t c = 0; c < num_channels_latents; ++c) {
+            for (size_t h2 = 0; h2 < h_half; ++h2) {
+                for (size_t w2 = 0; w2 < w_half; ++w2) {
+                    size_t base_src_index = (b * num_channels_latents + c) * height * width + (h2 * 2 * width + w2 * 2);
+                    size_t base_dst_index = (b * h_half * w_half + h2 * w_half + w2) * num_channels_latents * 4 + c * 4;
+
+                    dst_data[base_dst_index] = src_data[base_src_index];
+                    dst_data[base_dst_index + 1] = src_data[base_src_index + 1];
+                    dst_data[base_dst_index + 2] = src_data[base_src_index + width];
+                    dst_data[base_dst_index + 3] = src_data[base_src_index + width + 1];
                 }
             }
         }
@@ -71,15 +70,14 @@ ov::Tensor unpack_latents(const ov::Tensor& latents, size_t height, size_t width
     for (size_t b = 0; b < batch_size; ++b) {
         for (size_t c4 = 0; c4 < c_quarter; ++c4) {
             for (size_t h2 = 0; h2 < h_half; ++h2) {
-                for (size_t h3 = 0; h3 < 2; ++h3) {
-                    for (size_t w2 = 0; w2 < w_half; ++w2) {
-                        for (size_t w3 = 0; w3 < 2; ++w3) {
-                            size_t reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4 + h3 * 2 + w3;
-                            size_t final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 + h3) * width + (w2 * 2 + w3);
-
-                            dst_data[final_index] = src_data[reshaped_index];
-                        }
-                    }
+                for (size_t w2 = 0; w2 < w_half; ++w2) {
+                    size_t base_reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4;
+                    size_t base_final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 * width + w2 * 2);
+
+                    dst_data[base_final_index] = src_data[base_reshaped_index];
+                    dst_data[base_final_index + 1] = src_data[base_reshaped_index + 1];
+                    dst_data[base_final_index + width] = src_data[base_reshaped_index + 2];
+                    dst_data[base_final_index + width + 1] = src_data[base_reshaped_index + 3];
                 }
             }
         }
@@ -111,7 +109,18 @@ namespace genai {
 
 class FluxPipeline : public DiffusionPipeline {
 public:
-    FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : DiffusionPipeline(pipeline_type) {
+    explicit FluxPipeline(PipelineType pipeline_type) : DiffusionPipeline(pipeline_type) {
+        // Only image-to-image and inpainting pipelines get the image pre-processing models.
+        const bool takes_input_image = m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING;
+        if (!takes_input_image)
+            return;
+        // TODO: support GPU as well
+        const std::string device = "CPU";
+        m_image_processor = std::make_shared<ImageProcessor>(device, /* do_normalize */ true, /* do_binarize */ false, /* gray_scale_source */ false);
+        m_image_resizer = std::make_shared<ImageResizer>(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW);
+    }
+
+    FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : FluxPipeline(pipeline_type) {
         const std::filesystem::path model_index_path = root_dir / "model_index.json";
         std::ifstream file(model_index_path);
         OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path);
@@ -163,7 +172,7 @@ class FluxPipeline : public DiffusionPipeline {
                  const std::filesystem::path& root_dir,
                  const std::string& device,
                  const ov::AnyMap& properties)
-        : DiffusionPipeline(pipeline_type) {
+        : FluxPipeline(pipeline_type) {
         const std::filesystem::path model_index_path = root_dir / "model_index.json";
         std::ifstream file(model_index_path);
         OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path);
@@ -216,11 +225,11 @@ class FluxPipeline : public DiffusionPipeline {
                  const T5EncoderModel& t5_text_model,
                  const FluxTransformer2DModel& transformer,
                  const AutoencoderKL& vae)
-        : DiffusionPipeline(pipeline_type),
-          m_clip_text_encoder(std::make_shared<CLIPTextModel>(clip_text_model)),
-          m_t5_text_encoder(std::make_shared<T5EncoderModel>(t5_text_model)),
-          m_vae(std::make_shared<AutoencoderKL>(vae)),
-          m_transformer(std::make_shared<FluxTransformer2DModel>(transformer)) {
+        : FluxPipeline(pipeline_type) {
+        m_clip_text_encoder = std::make_shared<CLIPTextModel>(clip_text_model);
+        m_t5_text_encoder = std::make_shared<T5EncoderModel>(t5_text_model);
+        m_vae = std::make_shared<AutoencoderKL>(vae);
+        m_transformer = std::make_shared<FluxTransformer2DModel>(transformer);
         initialize_generation_config("FluxPipeline");
     }
 
@@ -288,6 +297,20 @@ class FluxPipeline : public DiffusionPipeline {
         m_transformer->set_hidden_states("img_ids", latent_image_ids);
     }
 
+    std::vector<float> get_timesteps(size_t num_inference_steps, float strength) {
+        // Skip the leading part of the schedule; `strength` scales how many of the steps are kept.
+        const float total_steps = static_cast<float>(num_inference_steps);
+        const float init_timestep = std::min(total_steps * strength, total_steps);
+        const size_t t_start = static_cast<size_t>(std::max(total_steps - init_timestep, 0.0f));
+
+        const std::vector<float> schedule = m_scheduler->get_float_timesteps();
+        // Clamp the offset so a start beyond the schedule yields an empty result.
+        const size_t first = std::min(t_start, schedule.size());
+        std::vector<float> timesteps(schedule.begin() + first, schedule.end());
+        m_scheduler->set_begin_index(t_start);
+        return timesteps;
+    }
+
     std::tuple<ov::Tensor, ov::Tensor, ov::Tensor, ov::Tensor> prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
 
@@ -299,16 +322,22 @@ class FluxPipeline : public DiffusionPipeline {
                                num_channels_latents,
                                height,
                                width};
-        ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise;
+        ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latents, noise;
 
         if (initial_image) {
-            OPENVINO_THROW("StableDiffusion3 image to image is not implemented");
+            proccesed_image = m_image_resizer->execute(initial_image, generation_config.height, generation_config.width);
+            proccesed_image = m_image_processor->execute(proccesed_image);
+
+            image_latents = m_vae->encode(proccesed_image, generation_config.generator);
+            noise = generation_config.generator->randn_tensor(latent_shape);
+            m_scheduler->scale_noise(image_latents, m_latent_timestep, noise);
+            latent = pack_latents(image_latents, generation_config.num_images_per_prompt, num_channels_latents, height, width);
         } else {
             noise = generation_config.generator->randn_tensor(latent_shape);
             latent = pack_latents(noise, generation_config.num_images_per_prompt, num_channels_latents, height, width);
         }
 
-        return std::make_tuple(latent, proccesed_image, image_latent, noise);
+        return std::make_tuple(latent, proccesed_image, image_latents, noise);
     }
 
     void set_lora_adapters(std::optional<AdapterConfig> adapters) override {
@@ -341,24 +370,30 @@ class FluxPipeline : public DiffusionPipeline {
 
         compute_hidden_states(positive_prompt, m_custom_generation_config);
 
-        ov::Tensor latents, processed_image, image_latent, noise;
-        std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);
-
-        size_t image_seq_len = latents.get_shape()[1];
+        size_t image_seq_len = (m_custom_generation_config.height / vae_scale_factor / 2) *
+                               (m_custom_generation_config.width / vae_scale_factor / 2);
         float mu = m_scheduler->calculate_shift(image_seq_len);
-
         float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps;
         std::vector<float> sigmas = numpy_utils::linspace<float>(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true);
-
         m_scheduler->set_timesteps_with_sigma(sigmas, mu);
-        std::vector<float> timesteps = m_scheduler->get_float_timesteps();
+
+        std::vector<float> timesteps;
+        if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
+            timesteps = m_scheduler->get_float_timesteps();
+            m_latent_timestep = timesteps[0];
+        } else {
+            timesteps = get_timesteps(m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);
+        }
+
+        ov::Tensor latents, processed_image, image_latent, noise;
+        std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);
 
         // 6. Denoising loop
         ov::Tensor timestep(ov::element::f32, {1});
         float* timestep_data = timestep.data<float>();
 
         for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
-            timestep_data[0] = timesteps[inference_step] / 1000;
+            timestep_data[0] = timesteps[inference_step] / 1000.0f;
 
             ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep);
 
@@ -371,14 +406,15 @@ class FluxPipeline : public DiffusionPipeline {
         }
 
         latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor);
+    
         return m_vae->decode(latents);
     }
 
     ov::Tensor decode(const ov::Tensor latent) override {
         ov::Tensor unpacked_latent = unpack_latents(latent,
-                                                m_custom_generation_config.height,
-                                                m_custom_generation_config.width,
-                                                m_vae->get_vae_scale_factor());
+                                     m_custom_generation_config.height,
+                                     m_custom_generation_config.width,
+                                     m_vae->get_vae_scale_factor());
         return m_vae->decode(unpacked_latent);
     }
 
@@ -415,20 +451,17 @@ class FluxPipeline : public DiffusionPipeline {
 
         m_generation_config = ImageGenerationConfig();
 
-        if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
-            m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
-            m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
-        }
-
         if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) {
             if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
                 m_generation_config.guidance_scale = 3.5f;
                 m_generation_config.num_inference_steps = 28;
                 m_generation_config.strength = 1.0f;
             } else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) {
-                m_generation_config.guidance_scale = 3.5f;
+                m_generation_config.guidance_scale = 7.0f;
                 m_generation_config.num_inference_steps = 28;
-                m_generation_config.strength = 1.0f;
+                m_generation_config.strength = 0.6f;
+                m_generation_config.height = 1024;
+                m_generation_config.width = 1024;
             }
             m_generation_config.max_sequence_length = 512;
         } else {
@@ -438,7 +471,6 @@ class FluxPipeline : public DiffusionPipeline {
 
     void check_image_size(const int height, const int width) const override {
         assert(m_transformer != nullptr);
-        // const size_t vae_scale_factor = m_transformer->get_vae_scale_factor();
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && (width % vae_scale_factor == 0 || width < 0),
                         "Both 'width' and 'height' must be divisible by ",
@@ -456,14 +488,6 @@ class FluxPipeline : public DiffusionPipeline {
         OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by FluxPipeline");
 
         if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) {
-            ov::Shape initial_image_shape = initial_image.get_shape();
-            size_t height = initial_image_shape[1], width = initial_image_shape[2];
-
-            OPENVINO_ASSERT(generation_config.height == height,
-                "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
-            OPENVINO_ASSERT(generation_config.width == width,
-                "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-
             OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
                 "'Strength' generation parameter must be withion [0, 1] range");
         } else {
@@ -476,7 +500,11 @@ class FluxPipeline : public DiffusionPipeline {
     std::shared_ptr<CLIPTextModel> m_clip_text_encoder = nullptr;
     std::shared_ptr<T5EncoderModel> m_t5_text_encoder = nullptr;
     std::shared_ptr<AutoencoderKL> m_vae = nullptr;
+    std::shared_ptr<IImageProcessor> m_image_processor = nullptr, m_mask_processor_rgb = nullptr, m_mask_processor_gray = nullptr;
+    std::shared_ptr<ImageResizer> m_image_resizer = nullptr, m_mask_resizer = nullptr;
     ImageGenerationConfig m_custom_generation_config;
+
+    float m_latent_timestep = -1;
 };
 
 }  // namespace genai
diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp
index 8537e56ad5..7b00df678f 100644
--- a/src/cpp/src/image_generation/image2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/image2image_pipeline.cpp
@@ -9,6 +9,7 @@
 
 #include "image_generation/stable_diffusion_pipeline.hpp"
 #include "image_generation/stable_diffusion_xl_pipeline.hpp"
+#include "image_generation/flux_pipeline.hpp"
 
 #include "utils.hpp"
 
@@ -22,6 +23,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir)
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
+    } else if (class_name == "FluxPipeline") {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir);
     } else {
         OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
@@ -34,6 +37,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir,
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
     } else if (class_name == "StableDiffusionXLPipeline") {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
+    } else if (class_name == "FluxPipeline") {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
     } else {
         OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
     }
@@ -44,6 +49,8 @@ Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) {
         m_impl = std::make_shared<StableDiffusionXLPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl);
     } else if (auto stable_diffusion = std::dynamic_pointer_cast<StableDiffusionPipeline>(pipe.m_impl); stable_diffusion != nullptr) {
         m_impl = std::make_shared<StableDiffusionPipeline>(PipelineType::IMAGE_2_IMAGE, *stable_diffusion);
+    } else if (auto flux = std::dynamic_pointer_cast<FluxPipeline>(pipe.m_impl); flux != nullptr) {
+        m_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, *flux);
     } else {
         OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline");
     }
@@ -94,6 +101,20 @@ Image2ImagePipeline Image2ImagePipeline::stable_diffusion_xl(
     return Image2ImagePipeline(impl);
 }
 
+Image2ImagePipeline Image2ImagePipeline::flux(
+    const std::shared_ptr<Scheduler>& scheduler,
+    const CLIPTextModel& clip_text_model,
+    const T5EncoderModel t5_encoder_model,
+    const FluxTransformer2DModel& transformer,
+    const AutoencoderKL& vae){
+    // Assemble the image-to-image Flux implementation from the supplied models.
+    auto flux_impl = std::make_shared<FluxPipeline>(PipelineType::IMAGE_2_IMAGE, clip_text_model, t5_encoder_model, transformer, vae);
+    // The scheduler is attached separately; the assert flags a null one in debug builds.
+    assert(scheduler != nullptr);
+    flux_impl->set_scheduler(scheduler);
+    return Image2ImagePipeline(flux_impl);
+}
+
 ImageGenerationConfig Image2ImagePipeline::get_generation_config() const {
     return m_impl->get_generation_config();
 }
diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp
index 3dabf888ab..bd06c9b893 100644
--- a/src/cpp/src/image_generation/image_processor.cpp
+++ b/src/cpp/src/image_generation/image_processor.cpp
@@ -32,6 +32,7 @@ IImageProcessor::IImageProcessor(const std::string& device) :
 }
 
 ov::Tensor IImageProcessor::execute(ov::Tensor image) {
+    OPENVINO_ASSERT(m_request, "ImageProcessor model must be compiled first. Cannot infer non-compiled model");
     m_request.set_input_tensor(image);
     m_request.infer();
     return m_request.get_output_tensor();
@@ -124,6 +125,7 @@ ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov
 }
 
 ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) {
+    OPENVINO_ASSERT(m_request, "ImageResizer model must be compiled first. Cannot infer non-compiled model");
     ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2});
     target_spatial_tensor.data<int64_t>()[0] = dst_height;
     target_spatial_tensor.data<int64_t>()[1] = dst_width;
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
index 383fece163..5f711f29ac 100644
--- a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
@@ -208,7 +208,7 @@ std::map<std::string, ov::Tensor> EulerAncestralDiscreteScheduler::step(ov::Tens
     return {{"latent", prev_sample}, {"denoised", pred_original_sample}};
 }
 
-size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{
+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
     for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
         if (timestep == m_schedule_timesteps[i]) {
             return i;
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
index 265a561869..17e50ddc04 100644
--- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
+++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
@@ -146,6 +146,43 @@ void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tens
     OPENVINO_THROW("Not implemented");
 }
 
+size_t FlowMatchEulerDiscreteScheduler::_index_for_timestep(float timestep) {
+    // Seed the lookup table from the current timesteps when it is still empty.
+    if (m_schedule_timesteps.empty())
+        m_schedule_timesteps = m_timesteps;
+
+    // Exact-match linear scan; the first matching position wins.
+    size_t pos = 0;
+    while (pos < m_schedule_timesteps.size() && m_schedule_timesteps[pos] != timestep)
+        ++pos;
+    if (pos < m_schedule_timesteps.size())
+        return pos;
+    OPENVINO_THROW("Failed to find index for timestep ", timestep);
+}
+
+void FlowMatchEulerDiscreteScheduler::scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) {
+    // Blends `noise` into `sample` in place: sample = sigma * noise + (1 - sigma) * sample.
+    size_t index_for_timestep;
+    if (m_begin_index == -1) {
+        // Only this branch reads `timestep`, so this is where the -1 "not computed" sentinel must be rejected.
+        OPENVINO_ASSERT(timestep != -1, "Timestep is not computed yet");
+        index_for_timestep = _index_for_timestep(timestep);
+    } else if (m_step_index != -1) {
+        index_for_timestep = m_step_index;
+    } else {
+        index_for_timestep = m_begin_index;
+    }
+
+    OPENVINO_ASSERT(index_for_timestep < m_sigmas.size(), "Sigma index ", index_for_timestep, " is out of range");
+    const float sigma = m_sigmas[index_for_timestep];
+
+    float * sample_data = sample.data<float>();
+    const float * noise_data = noise.data<float>();
+    for (size_t i = 0; i < sample.get_size(); ++i) {
+        sample_data[i] = sigma * noise_data[i] + (1.0f - sigma) * sample_data[i];
+    }
+}
+
 void FlowMatchEulerDiscreteScheduler::set_timesteps_with_sigma(std::vector<float> sigma, float mu) {
     m_timesteps.clear();
     m_sigmas.clear();
@@ -184,5 +221,13 @@ float FlowMatchEulerDiscreteScheduler::calculate_shift(size_t image_seq_len) {
     return mu;
 }
 
+void FlowMatchEulerDiscreteScheduler::set_begin_index(size_t begin_index) {
+    m_begin_index = begin_index;  // consulted by scale_noise() when it picks the sigma index
+}
+
+size_t FlowMatchEulerDiscreteScheduler::get_begin_index() {
+    return m_begin_index;  // NOTE(review): presumably still the -1 sentinel when never set - confirm where it is initialised
+}
+
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
index 6399290ff3..b087a8cb86 100644
--- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
+++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
@@ -42,13 +42,21 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
 
     void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override;
 
+    void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) override;
+
     float calculate_shift(size_t image_seq_len) override;
 
+    void set_begin_index(size_t begin_index) override;
+
+    size_t get_begin_index() override;
+
+
+
 private:
     Config m_config;
 
     std::vector<float> m_sigmas;
-    std::vector<float> m_timesteps;
+    std::vector<float> m_timesteps, m_schedule_timesteps;
 
     float m_sigma_min, m_sigma_max;
     size_t m_step_index, m_begin_index;
@@ -56,6 +64,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
 
     void init_step_index();
     double sigma_to_t(double simga);
+    size_t _index_for_timestep(float timestep);
 };
 
 } // namespace genai
diff --git a/src/cpp/src/image_generation/schedulers/ischeduler.hpp b/src/cpp/src/image_generation/schedulers/ischeduler.hpp
index 2dadd59b1b..ff6807d2f8 100644
--- a/src/cpp/src/image_generation/schedulers/ischeduler.hpp
+++ b/src/cpp/src/image_generation/schedulers/ischeduler.hpp
@@ -43,6 +43,16 @@ class IScheduler : public Scheduler {
     virtual std::vector<float> get_float_timesteps() const {
         OPENVINO_THROW("Scheduler doesn't support float timesteps");
     }
+
+    virtual void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) {
+        OPENVINO_THROW("Scheduler doesn't support `scale_noise` method");
+    }
+    // No-op by default: schedulers that keep no begin index ignore the call.
+    virtual void set_begin_index(size_t begin_index) {}
+    // Throws by default: only schedulers that keep a begin index can report one.
+    virtual size_t get_begin_index() {
+        OPENVINO_THROW("Scheduler doesn't support `get_begin_index` method");
+    }
 };
 
 } // namespace genai

From b06c5678ebdeb3967682adb3b2bbab94041a3119 Mon Sep 17 00:00:00 2001
From: likholat <anna.likholat@intel.com>
Date: Thu, 23 Jan 2025 14:58:10 +0100
Subject: [PATCH 2/3] review fixes

---
 SUPPORTED_MODELS.md                                |  2 +-
 src/cpp/src/image_generation/flux_pipeline.hpp     | 14 +++-----------
 .../schedulers/flow_match_euler_discrete.hpp       |  2 --
 src/python/openvino_genai/py_openvino_genai.pyi    |  3 +++
 src/python/py_image_generation_pipelines.cpp       |  1 +
 tools/who_what_benchmark/tests/test_cli_image.py   |  7 +++++--
 6 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 3064fb58c1..c5c55b8d73 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -242,7 +242,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
       <tr>
       <td><code>Flux</code></td>
       <td>Supported</td>
-      <td>Not supported</td>
+      <td>Supported</td>
       <td>Not supported</td>
       <td>
         <ul>
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index 547efa4415..83acdeb2fb 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -429,15 +429,6 @@ class FluxPipeline : public DiffusionPipeline {
         const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
         const auto& transformer_config = m_transformer->get_config();
 
-        // in case of image to image generation_config_value is just ignored and computed based on initial image
-        if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
-            OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline");
-            ov::Shape shape = initial_image.get_shape();
-            int64_t dim_val = shape[dim_idx];
-
-            generation_config_value = dim_val - (dim_val % vae_scale_factor);
-        }
-
         if (generation_config_value < 0)
             generation_config_value = transformer_config.m_default_sample_size * vae_scale_factor;
     }
@@ -451,6 +442,9 @@ class FluxPipeline : public DiffusionPipeline {
 
         m_generation_config = ImageGenerationConfig();
 
+        m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+        m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
+
         if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) {
             if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
                 m_generation_config.guidance_scale = 3.5f;
@@ -460,8 +454,6 @@ class FluxPipeline : public DiffusionPipeline {
                 m_generation_config.guidance_scale = 7.0f;
                 m_generation_config.num_inference_steps = 28;
                 m_generation_config.strength = 0.6f;
-                m_generation_config.height = 1024;
-                m_generation_config.width = 1024;
             }
             m_generation_config.max_sequence_length = 512;
         } else {
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
index b087a8cb86..e4c9fb2d87 100644
--- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
+++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
@@ -50,8 +50,6 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
 
     size_t get_begin_index() override;
 
-
-
 private:
     Config m_config;
 
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index bba366401e..9408220a64 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -774,6 +774,9 @@ class Image2ImagePipeline:
     This class is used for generation with image-to-image models.
     """
     @staticmethod
+    def flux(scheduler: Scheduler, clip_text_model: CLIPTextModel, t5_encoder_model: T5EncoderModel, transformer: FluxTransformer2DModel, vae: AutoencoderKL) -> Image2ImagePipeline:
+        ...
+    @staticmethod
     def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline:
         ...
     @staticmethod
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index b011aee878..dcc50234ed 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -330,6 +330,7 @@ void init_image_generation_pipelines(py::module_& m) {
         .def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
         .def_static("latent_consistency_model", &ov::genai::Image2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
         .def_static("stable_diffusion_xl", &ov::genai::Image2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae"))
+        .def_static("flux", &ov::genai::Image2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae"))
         .def(
             "compile",
             [](ov::genai::Image2ImagePipeline& pipe,
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 1ad8236058..156d87e05c 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -103,8 +103,11 @@ def test_image_model_types(model_id, model_type, backend):
                             ])),
 )
 def test_image_model_genai(model_id, model_type):
-    if ("flux" in model_id or "stable-diffusion-3" in model_id) and model_type != "text-to-image":
-        pytest.skip(reason="FLUX or SD3 are supported as text to image only")
+    if ("stable-diffusion-3" in model_id) and model_type != "text-to-image":
+        pytest.skip(reason="SD3 is supported as text to image only")
+
+    if ("flux" in model_id) and model_type != "image-inpainting":
+        pytest.skip(reason="FLUX is not yet supported as image inpainting")
 
     with tempfile.TemporaryDirectory() as temp_dir:
         GT_FILE = os.path.join(temp_dir, "gt.csv")

From e8c95b7bcf9ede52e4affdcc02b5872b0bb1f84b Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov <ilya.lavrenov@intel.com>
Date: Fri, 24 Jan 2025 00:12:53 +0400
Subject: [PATCH 3/3] Update tools/who_what_benchmark/tests/test_cli_image.py

---
 tools/who_what_benchmark/tests/test_cli_image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 156d87e05c..3edcc70636 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -106,7 +106,7 @@ def test_image_model_genai(model_id, model_type):
     if ("stable-diffusion-3" in model_id) and model_type != "text-to-image":
         pytest.skip(reason="SD3 is supported as text to image only")
 
-    if ("flux" in model_id) and model_type != "image-inpainting":
+    if ("flux" in model_id) and model_type == "image-inpainting":
         pytest.skip(reason="FLUX is not yet supported as image inpainting")
 
     with tempfile.TemporaryDirectory() as temp_dir: