diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 3064fb58c1..c5c55b8d73 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -242,7 +242,7 @@ The pipeline can work with other similar topologies produced by `optimum-intel`
Flux |
Supported |
- Not supported |
+ Supported |
Not supported |
diff --git a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
index b086bd2aad..a945aa377d 100644
--- a/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
+++ b/src/cpp/include/openvino/genai/image_generation/image2image_pipeline.hpp
@@ -49,6 +49,14 @@ class OPENVINO_GENAI_EXPORTS Image2ImagePipeline {
const UNet2DConditionModel& unet,
const AutoencoderKL& vae);
+ // Creates a Flux image-to-image pipeline from already constructed building blocks (scheduler, CLIP and T5 text encoders, transformer, VAE).
+ static Image2ImagePipeline flux(
+ const std::shared_ptr& scheduler,
+ const CLIPTextModel& clip_text_model,
+ const T5EncoderModel t5_encoder_model, // NOTE(review): taken by value, unlike the other models which are const references - confirm this is intended; the definition in image2image_pipeline.cpp has the same signature and must change in lockstep
+ const FluxTransformer2DModel& transformer,
+ const AutoencoderKL& vae);
+
ImageGenerationConfig get_generation_config() const;
void set_generation_config(const ImageGenerationConfig& generation_config);
diff --git a/src/cpp/src/image_generation/flux_pipeline.hpp b/src/cpp/src/image_generation/flux_pipeline.hpp
index fb6888b191..83acdeb2fb 100644
--- a/src/cpp/src/image_generation/flux_pipeline.hpp
+++ b/src/cpp/src/image_generation/flux_pipeline.hpp
@@ -27,17 +27,16 @@ ov::Tensor pack_latents(const ov::Tensor latents, size_t batch_size, size_t num_
// Permute to (0, 2, 4, 1, 3, 5)
for (size_t b = 0; b < batch_size; ++b) {
- for (size_t h2 = 0; h2 < h_half; ++h2) {
- for (size_t w2 = 0; w2 < w_half; ++w2) {
- for (size_t c = 0; c < num_channels_latents; ++c) {
- for (size_t h3 = 0; h3 < 2; ++h3) {
- for (size_t w3 = 0; w3 < 2; ++w3) {
- size_t src_index = ((b * num_channels_latents + c) * h_half + h2) * 2 * w_half * 2 + (h3 * w_half + w2) * 2 + w3;
- size_t dst_index = ((b * h_half + h2) * w_half + w2) * num_channels_latents * 4 + (c * 4 + h3 * 2 + w3);
-
- dst_data[dst_index] = src_data[src_index];
- }
- }
+ for (size_t c = 0; c < num_channels_latents; ++c) {
+ for (size_t h2 = 0; h2 < h_half; ++h2) {
+ for (size_t w2 = 0; w2 < w_half; ++w2) {
+ size_t base_src_index = (b * num_channels_latents + c) * height * width + (h2 * 2 * width + w2 * 2);
+ size_t base_dst_index = (b * h_half * w_half + h2 * w_half + w2) * num_channels_latents * 4 + c * 4;
+
+ dst_data[base_dst_index] = src_data[base_src_index];
+ dst_data[base_dst_index + 1] = src_data[base_src_index + 1];
+ dst_data[base_dst_index + 2] = src_data[base_src_index + width];
+ dst_data[base_dst_index + 3] = src_data[base_src_index + width + 1];
}
}
}
@@ -71,15 +70,14 @@ ov::Tensor unpack_latents(const ov::Tensor& latents, size_t height, size_t width
for (size_t b = 0; b < batch_size; ++b) {
for (size_t c4 = 0; c4 < c_quarter; ++c4) {
for (size_t h2 = 0; h2 < h_half; ++h2) {
- for (size_t h3 = 0; h3 < 2; ++h3) {
- for (size_t w2 = 0; w2 < w_half; ++w2) {
- for (size_t w3 = 0; w3 < 2; ++w3) {
- size_t reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4 + h3 * 2 + w3;
- size_t final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 + h3) * width + (w2 * 2 + w3);
-
- dst_data[final_index] = src_data[reshaped_index];
- }
- }
+ for (size_t w2 = 0; w2 < w_half; ++w2) {
+ size_t base_reshaped_index = (((b * h_half + h2) * w_half + w2) * c_quarter + c4) * 4;
+ size_t base_final_index = (b * c_quarter * height * width) + (c4 * height * width) + (h2 * 2 * width + w2 * 2);
+
+ dst_data[base_final_index] = src_data[base_reshaped_index];
+ dst_data[base_final_index + 1] = src_data[base_reshaped_index + 1];
+ dst_data[base_final_index + width] = src_data[base_reshaped_index + 2];
+ dst_data[base_final_index + width + 1] = src_data[base_reshaped_index + 3];
}
}
}
@@ -111,7 +109,18 @@ namespace genai {
class FluxPipeline : public DiffusionPipeline {
public:
- FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : DiffusionPipeline(pipeline_type) {
+ explicit FluxPipeline(PipelineType pipeline_type) : DiffusionPipeline(pipeline_type) { // Common constructor the other FluxPipeline constructors delegate to; creates the input-image helpers for pipeline types that take an initial image.
+ // TODO: support GPU as well (the helpers below are always created for "CPU"; the constructor taking a `device` argument delegates here without forwarding it)
+ const std::string device = "CPU";
+
+ if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) { // for any other pipeline type both helpers keep their nullptr default
+ const bool do_normalize = true, do_binarize = false, gray_scale_source = false;
+ m_image_processor = std::make_shared(device, do_normalize, do_binarize, gray_scale_source); // applied to the resized initial image in prepare_latents() before VAE encoding
+ m_image_resizer = std::make_shared(device, ov::element::u8, "NHWC", ov::op::v11::Interpolate::InterpolateMode::BICUBIC_PILLOW); // used in prepare_latents() to resize the initial image to the configured height/width
+ }
+ }
+
+ FluxPipeline(PipelineType pipeline_type, const std::filesystem::path& root_dir) : FluxPipeline(pipeline_type) {
const std::filesystem::path model_index_path = root_dir / "model_index.json";
std::ifstream file(model_index_path);
OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path);
@@ -163,7 +172,7 @@ class FluxPipeline : public DiffusionPipeline {
const std::filesystem::path& root_dir,
const std::string& device,
const ov::AnyMap& properties)
- : DiffusionPipeline(pipeline_type) {
+ : FluxPipeline(pipeline_type) {
const std::filesystem::path model_index_path = root_dir / "model_index.json";
std::ifstream file(model_index_path);
OPENVINO_ASSERT(file.is_open(), "Failed to open ", model_index_path);
@@ -216,11 +225,11 @@ class FluxPipeline : public DiffusionPipeline {
const T5EncoderModel& t5_text_model,
const FluxTransformer2DModel& transformer,
const AutoencoderKL& vae)
- : DiffusionPipeline(pipeline_type),
- m_clip_text_encoder(std::make_shared(clip_text_model)),
- m_t5_text_encoder(std::make_shared(t5_text_model)),
- m_vae(std::make_shared(vae)),
- m_transformer(std::make_shared(transformer)) {
+ : FluxPipeline(pipeline_type) {
+ m_clip_text_encoder = std::make_shared(clip_text_model);
+ m_t5_text_encoder = std::make_shared(t5_text_model);
+ m_vae = std::make_shared(vae);
+ m_transformer = std::make_shared(transformer);
initialize_generation_config("FluxPipeline");
}
@@ -288,6 +297,20 @@ class FluxPipeline : public DiffusionPipeline {
m_transformer->set_hidden_states("img_ids", latent_image_ids);
}
+ std::vector get_timesteps(size_t num_inference_steps, float strength) { // Returns the scheduler timesteps from index t_start to the end (t_start is derived from `strength`) and records t_start in the scheduler as its begin index.
+ float init_timestep = std::min(static_cast(num_inference_steps) * strength, static_cast(num_inference_steps)); // num_inference_steps * strength, capped at num_inference_steps
+ size_t t_start = static_cast(std::max(static_cast(num_inference_steps) - init_timestep, 0.0f)); // index of the first timestep to keep; clamped so it is never negative
+
+ std::vector timesteps, m_scheduler_timesteps = m_scheduler->get_float_timesteps(); // NOTE(review): m_scheduler_timesteps is a local variable despite the member-style m_ prefix
+ for (size_t i = t_start; i < m_scheduler_timesteps.size(); ++i) { // yields an empty result when t_start is at or past the end
+ timesteps.push_back(m_scheduler_timesteps[i]);
+ }
+
+ m_scheduler->set_begin_index(t_start); // NOTE(review): m_latent_timestep is not assigned on this path, so in the code visible here it still holds its initial -1 when prepare_latents() passes it to scale_noise() - confirm this is intended
+
+ return timesteps;
+ }
+
std::tuple prepare_latents(ov::Tensor initial_image, const ImageGenerationConfig& generation_config) const override {
const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
@@ -299,16 +322,22 @@ class FluxPipeline : public DiffusionPipeline {
num_channels_latents,
height,
width};
- ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latent, noise;
+ ov::Tensor latent(ov::element::f32, {}), proccesed_image, image_latents, noise;
if (initial_image) {
- OPENVINO_THROW("StableDiffusion3 image to image is not implemented");
+ proccesed_image = m_image_resizer->execute(initial_image, generation_config.height, generation_config.width);
+ proccesed_image = m_image_processor->execute(proccesed_image);
+
+ image_latents = m_vae->encode(proccesed_image, generation_config.generator);
+ noise = generation_config.generator->randn_tensor(latent_shape);
+ m_scheduler->scale_noise(image_latents, m_latent_timestep, noise);
+ latent = pack_latents(image_latents, generation_config.num_images_per_prompt, num_channels_latents, height, width);
} else {
noise = generation_config.generator->randn_tensor(latent_shape);
latent = pack_latents(noise, generation_config.num_images_per_prompt, num_channels_latents, height, width);
}
- return std::make_tuple(latent, proccesed_image, image_latent, noise);
+ return std::make_tuple(latent, proccesed_image, image_latents, noise);
}
void set_lora_adapters(std::optional adapters) override {
@@ -341,24 +370,30 @@ class FluxPipeline : public DiffusionPipeline {
compute_hidden_states(positive_prompt, m_custom_generation_config);
- ov::Tensor latents, processed_image, image_latent, noise;
- std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);
-
- size_t image_seq_len = latents.get_shape()[1];
+ size_t image_seq_len = (m_custom_generation_config.height / vae_scale_factor / 2) *
+ (m_custom_generation_config.width / vae_scale_factor / 2);
float mu = m_scheduler->calculate_shift(image_seq_len);
-
float linspace_end = 1.0f / m_custom_generation_config.num_inference_steps;
std::vector sigmas = numpy_utils::linspace(1.0f, linspace_end, m_custom_generation_config.num_inference_steps, true);
-
m_scheduler->set_timesteps_with_sigma(sigmas, mu);
- std::vector timesteps = m_scheduler->get_float_timesteps();
+
+ std::vector timesteps;
+ if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
+ timesteps = m_scheduler->get_float_timesteps();
+ m_latent_timestep = timesteps[0];
+ } else {
+ timesteps = get_timesteps(m_custom_generation_config.num_inference_steps, m_custom_generation_config.strength);
+ }
+
+ ov::Tensor latents, processed_image, image_latent, noise;
+ std::tie(latents, processed_image, image_latent, noise) = prepare_latents(initial_image, m_custom_generation_config);
// 6. Denoising loop
ov::Tensor timestep(ov::element::f32, {1});
float* timestep_data = timestep.data();
for (size_t inference_step = 0; inference_step < timesteps.size(); ++inference_step) {
- timestep_data[0] = timesteps[inference_step] / 1000;
+ timestep_data[0] = timesteps[inference_step] / 1000.0f;
ov::Tensor noise_pred_tensor = m_transformer->infer(latents, timestep);
@@ -371,14 +406,15 @@ class FluxPipeline : public DiffusionPipeline {
}
latents = unpack_latents(latents, m_custom_generation_config.height, m_custom_generation_config.width, vae_scale_factor);
+
return m_vae->decode(latents);
}
ov::Tensor decode(const ov::Tensor latent) override {
ov::Tensor unpacked_latent = unpack_latents(latent,
- m_custom_generation_config.height,
- m_custom_generation_config.width,
- m_vae->get_vae_scale_factor());
+ m_custom_generation_config.height,
+ m_custom_generation_config.width,
+ m_vae->get_vae_scale_factor());
return m_vae->decode(unpacked_latent);
}
@@ -393,15 +429,6 @@ class FluxPipeline : public DiffusionPipeline {
const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
const auto& transformer_config = m_transformer->get_config();
- // in case of image to image generation_config_value is just ignored and computed based on initial image
- if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE) {
- OPENVINO_ASSERT(initial_image, "Initial image is empty for image to image pipeline");
- ov::Shape shape = initial_image.get_shape();
- int64_t dim_val = shape[dim_idx];
-
- generation_config_value = dim_val - (dim_val % vae_scale_factor);
- }
-
if (generation_config_value < 0)
generation_config_value = transformer_config.m_default_sample_size * vae_scale_factor;
}
@@ -415,10 +442,8 @@ class FluxPipeline : public DiffusionPipeline {
m_generation_config = ImageGenerationConfig();
- if (m_pipeline_type != PipelineType::IMAGE_2_IMAGE) {
- m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
- m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
- }
+ m_generation_config.height = transformer_config.m_default_sample_size * vae_scale_factor;
+ m_generation_config.width = transformer_config.m_default_sample_size * vae_scale_factor;
if (class_name == "FluxPipeline" || class_name == "FluxImg2ImgPipeline" || class_name == "FluxInpaintPipeline" ) {
if (m_pipeline_type == PipelineType::TEXT_2_IMAGE) {
@@ -426,9 +451,9 @@ class FluxPipeline : public DiffusionPipeline {
m_generation_config.num_inference_steps = 28;
m_generation_config.strength = 1.0f;
} else if (m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) {
- m_generation_config.guidance_scale = 3.5f;
+ m_generation_config.guidance_scale = 7.0f;
m_generation_config.num_inference_steps = 28;
- m_generation_config.strength = 1.0f;
+ m_generation_config.strength = 0.6f;
}
m_generation_config.max_sequence_length = 512;
} else {
@@ -438,7 +463,6 @@ class FluxPipeline : public DiffusionPipeline {
void check_image_size(const int height, const int width) const override {
assert(m_transformer != nullptr);
- // const size_t vae_scale_factor = m_transformer->get_vae_scale_factor();
const size_t vae_scale_factor = m_vae->get_vae_scale_factor();
OPENVINO_ASSERT((height % vae_scale_factor == 0 || height < 0) && (width % vae_scale_factor == 0 || width < 0),
"Both 'width' and 'height' must be divisible by ",
@@ -456,14 +480,6 @@ class FluxPipeline : public DiffusionPipeline {
OPENVINO_ASSERT(generation_config.prompt_3 == std::nullopt, "Prompt 3 is not used by FluxPipeline");
if ((m_pipeline_type == PipelineType::IMAGE_2_IMAGE || m_pipeline_type == PipelineType::INPAINTING) && initial_image) {
- ov::Shape initial_image_shape = initial_image.get_shape();
- size_t height = initial_image_shape[1], width = initial_image_shape[2];
-
- OPENVINO_ASSERT(generation_config.height == height,
- "Height for initial (", height, ") and generated (", generation_config.height,") images must be the same");
- OPENVINO_ASSERT(generation_config.width == width,
- "Width for initial (", width, ") and generated (", generation_config.width,") images must be the same");
-
OPENVINO_ASSERT(generation_config.strength >= 0.0f && generation_config.strength <= 1.0f,
"'Strength' generation parameter must be withion [0, 1] range");
} else {
@@ -476,7 +492,11 @@ class FluxPipeline : public DiffusionPipeline {
std::shared_ptr m_clip_text_encoder = nullptr;
std::shared_ptr m_t5_text_encoder = nullptr;
std::shared_ptr m_vae = nullptr;
+ std::shared_ptr m_image_processor = nullptr, m_mask_processor_rgb = nullptr, m_mask_processor_gray = nullptr;
+ std::shared_ptr m_image_resizer = nullptr, m_mask_resizer = nullptr;
ImageGenerationConfig m_custom_generation_config;
+
+ float m_latent_timestep = -1;
};
} // namespace genai
diff --git a/src/cpp/src/image_generation/image2image_pipeline.cpp b/src/cpp/src/image_generation/image2image_pipeline.cpp
index 8537e56ad5..7b00df678f 100644
--- a/src/cpp/src/image_generation/image2image_pipeline.cpp
+++ b/src/cpp/src/image_generation/image2image_pipeline.cpp
@@ -9,6 +9,7 @@
#include "image_generation/stable_diffusion_pipeline.hpp"
#include "image_generation/stable_diffusion_xl_pipeline.hpp"
+#include "image_generation/flux_pipeline.hpp"
#include "utils.hpp"
@@ -22,6 +23,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir)
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir);
} else if (class_name == "StableDiffusionXLPipeline") {
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir);
+ } else if (class_name == "FluxPipeline") {
+ m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir);
} else {
OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
}
@@ -34,6 +37,8 @@ Image2ImagePipeline::Image2ImagePipeline(const std::filesystem::path& root_dir,
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
} else if (class_name == "StableDiffusionXLPipeline") {
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
+ } else if (class_name == "FluxPipeline") {
+ m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, root_dir, device, properties);
} else {
OPENVINO_THROW("Unsupported image to image generation pipeline '", class_name, "'");
}
@@ -44,6 +49,8 @@ Image2ImagePipeline::Image2ImagePipeline(const InpaintingPipeline& pipe) {
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion_xl);
} else if (auto stable_diffusion = std::dynamic_pointer_cast(pipe.m_impl); stable_diffusion != nullptr) {
m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *stable_diffusion);
+ } else if (auto flux = std::dynamic_pointer_cast(pipe.m_impl); flux != nullptr) {
+ m_impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, *flux);
} else {
OPENVINO_ASSERT("Cannot convert specified InpaintingPipeline to Image2ImagePipeline");
}
@@ -94,6 +101,20 @@ Image2ImagePipeline Image2ImagePipeline::stable_diffusion_xl(
return Image2ImagePipeline(impl);
}
+Image2ImagePipeline Image2ImagePipeline::flux( // Builds a Flux pipeline of type IMAGE_2_IMAGE from pre-constructed models and attaches the given scheduler to it.
+ const std::shared_ptr& scheduler,
+ const CLIPTextModel& clip_text_model,
+ const T5EncoderModel t5_encoder_model,
+ const FluxTransformer2DModel& transformer,
+ const AutoencoderKL& vae){
+ auto impl = std::make_shared(PipelineType::IMAGE_2_IMAGE, clip_text_model, t5_encoder_model, transformer, vae);
+
+ assert(scheduler != nullptr); // NOTE(review): assert() is compiled out when NDEBUG is defined, so a null scheduler is not rejected in such builds
+ impl->set_scheduler(scheduler);
+
+ return Image2ImagePipeline(impl);
+}
+
ImageGenerationConfig Image2ImagePipeline::get_generation_config() const {
return m_impl->get_generation_config();
}
diff --git a/src/cpp/src/image_generation/image_processor.cpp b/src/cpp/src/image_generation/image_processor.cpp
index 3dabf888ab..bd06c9b893 100644
--- a/src/cpp/src/image_generation/image_processor.cpp
+++ b/src/cpp/src/image_generation/image_processor.cpp
@@ -32,6 +32,7 @@ IImageProcessor::IImageProcessor(const std::string& device) :
}
ov::Tensor IImageProcessor::execute(ov::Tensor image) {
+ OPENVINO_ASSERT(m_request, "ImageProcessor model must be compiled first. Cannot infer non-compiled model");
m_request.set_input_tensor(image);
m_request.infer();
return m_request.get_output_tensor();
@@ -124,6 +125,7 @@ ImageResizer::ImageResizer(const std::string& device, ov::element::Type type, ov
}
ov::Tensor ImageResizer::execute(ov::Tensor image, int64_t dst_height, int64_t dst_width) {
+ OPENVINO_ASSERT(m_request, "ImageResizer model must be compiled first. Cannot infer non-compiled model");
ov::Tensor target_spatial_tensor(ov::element::i64, ov::Shape{2});
target_spatial_tensor.data()[0] = dst_height;
target_spatial_tensor.data()[1] = dst_width;
diff --git a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
index 383fece163..5f711f29ac 100644
--- a/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
+++ b/src/cpp/src/image_generation/schedulers/euler_ancestral_discrete.cpp
@@ -208,7 +208,7 @@ std::map EulerAncestralDiscreteScheduler::step(ov::Tens
return {{"latent", prev_sample}, {"denoised", pred_original_sample}};
}
-size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const{
+size_t EulerAncestralDiscreteScheduler::_index_for_timestep(int64_t timestep) const {
for (size_t i = 0; i < m_schedule_timesteps.size(); ++i) {
if (timestep == m_schedule_timesteps[i]) {
return i;
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
index 265a561869..17e50ddc04 100644
--- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
+++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.cpp
@@ -146,6 +146,43 @@ void FlowMatchEulerDiscreteScheduler::add_noise(ov::Tensor init_latent, ov::Tens
OPENVINO_THROW("Not implemented");
}
+// Finds the position of `timestep` within the schedule timesteps.
+// The schedule list (m_schedule_timesteps) is copied from m_timesteps the first time it is found empty.
+// NOTE(review): the copy is not refreshed here if m_timesteps changes afterwards - confirm it is reset
+// wherever the timesteps are recomputed.
+// @param timestep value to look for; entries are compared for exact equality
+// @return index of the first matching entry
+// Throws (via OPENVINO_THROW) when no entry matches.
+size_t FlowMatchEulerDiscreteScheduler::_index_for_timestep(float timestep) {
+    if (m_schedule_timesteps.empty())
+        m_schedule_timesteps = m_timesteps;
+
+    const size_t count = m_schedule_timesteps.size();
+    size_t position = 0;
+    while (position < count && !(timestep == m_schedule_timesteps[position]))
+        ++position;
+
+    if (position == count) {
+        OPENVINO_THROW("Failed to find index for timestep ", timestep);
+    }
+
+    return position;
+}
+
+// Blends `noise` into `sample` in place: sample[i] = sigma * noise[i] + (1 - sigma) * sample[i].
+// sigma is taken from m_sigmas at an index chosen as follows:
+//   - no begin index set (m_begin_index == -1): the index of `timestep` in the schedule;
+//   - otherwise the current step index if one is set, else the begin index.
+// @param sample   tensor that is overwritten with the blended values (the caller observes the change)
+// @param timestep timestep used to locate sigma; only consulted when no begin index is set,
+//                 in which case it must already be computed (-1 means "not computed")
+// @param noise    noise values; must provide at least as many elements as `sample`
+// Throws if the timestep is required but not computed or not part of the schedule, if the selected
+// index is outside m_sigmas, or if `noise` is shorter than `sample`.
+void FlowMatchEulerDiscreteScheduler::scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) {
+    size_t index_for_timestep;
+    if (m_begin_index == -1) {
+        // The timestep is only needed on this branch, so this is the only place where it has to be valid.
+        // (Previously the function asserted `timestep == -1` up front, which rejected every computed timestep.)
+        OPENVINO_ASSERT(timestep != -1, "Timestep is not computed yet");
+        index_for_timestep = _index_for_timestep(timestep);
+    } else if (m_step_index != -1) {
+        index_for_timestep = m_step_index;
+    } else {
+        index_for_timestep = m_begin_index;
+    }
+
+    // Guard the raw buffer accesses below against out-of-range reads.
+    OPENVINO_ASSERT(index_for_timestep < m_sigmas.size(),
+                    "Sigma index ", index_for_timestep, " is out of range, number of sigmas is ", m_sigmas.size());
+    OPENVINO_ASSERT(noise.get_size() >= sample.get_size(),
+                    "Noise tensor has ", noise.get_size(), " elements, but sample requires ", sample.get_size());
+
+    const float sigma = m_sigmas[index_for_timestep];
+
+    float* sample_data = sample.data<float>();
+    const float* noise_data = noise.data<float>();
+
+    for (size_t i = 0; i < sample.get_size(); ++i) {
+        sample_data[i] = sigma * noise_data[i] + (1.0f - sigma) * sample_data[i];
+    }
+}
+
void FlowMatchEulerDiscreteScheduler::set_timesteps_with_sigma(std::vector sigma, float mu) {
m_timesteps.clear();
m_sigmas.clear();
@@ -184,5 +221,13 @@ float FlowMatchEulerDiscreteScheduler::calculate_shift(size_t image_seq_len) {
return mu;
}
+void FlowMatchEulerDiscreteScheduler::set_begin_index(size_t begin_index) { // Stores the begin index; scale_noise() uses it to pick a sigma while no step index is set.
+ m_begin_index = begin_index;
+}
+
+size_t FlowMatchEulerDiscreteScheduler::get_begin_index() { // Returns the stored begin index. NOTE(review): scale_noise() compares m_begin_index against -1 as an "unset" marker, which for size_t is its maximum value - presumably that is what is returned before set_begin_index() is called; confirm the initial value.
+ return m_begin_index;
+}
+
} // namespace genai
} // namespace ov
diff --git a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
index 6399290ff3..e4c9fb2d87 100644
--- a/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
+++ b/src/cpp/src/image_generation/schedulers/flow_match_euler_discrete.hpp
@@ -42,13 +42,19 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
void add_noise(ov::Tensor init_latent, ov::Tensor noise, int64_t latent_timestep) const override;
+ void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) override;
+
float calculate_shift(size_t image_seq_len) override;
+ void set_begin_index(size_t begin_index) override;
+
+ size_t get_begin_index() override;
+
private:
Config m_config;
std::vector m_sigmas;
- std::vector m_timesteps;
+ std::vector m_timesteps, m_schedule_timesteps;
float m_sigma_min, m_sigma_max;
size_t m_step_index, m_begin_index;
@@ -56,6 +62,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
void init_step_index();
double sigma_to_t(double simga);
+ size_t _index_for_timestep(float timestep);
};
} // namespace genai
diff --git a/src/cpp/src/image_generation/schedulers/ischeduler.hpp b/src/cpp/src/image_generation/schedulers/ischeduler.hpp
index 2dadd59b1b..ff6807d2f8 100644
--- a/src/cpp/src/image_generation/schedulers/ischeduler.hpp
+++ b/src/cpp/src/image_generation/schedulers/ischeduler.hpp
@@ -43,6 +43,16 @@ class IScheduler : public Scheduler {
virtual std::vector get_float_timesteps() const {
OPENVINO_THROW("Scheduler doesn't support float timesteps");
}
+
+ virtual void scale_noise(ov::Tensor sample, float timestep, ov::Tensor noise) { // Default implementation always throws; schedulers that support noise scaling (FlowMatchEulerDiscreteScheduler) override it.
+ OPENVINO_THROW("Scheduler doesn't support `scale_noise` method");
+ }
+
+    // Default implementation is a no-op: a scheduler that does not override it ignores the begin index
+    // (unlike the defaults of scale_noise / get_begin_index, which throw).
+    virtual void set_begin_index(size_t begin_index) {}
+
+ virtual size_t get_begin_index() { // Default implementation always throws; FlowMatchEulerDiscreteScheduler overrides it to return its stored begin index.
+ OPENVINO_THROW("Scheduler doesn't support `get_begin_index` method");
+ }
};
} // namespace genai
diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index bba366401e..9408220a64 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -774,6 +774,9 @@ class Image2ImagePipeline:
This class is used for generation with image-to-image models.
"""
@staticmethod
+ def flux(scheduler: Scheduler, clip_text_model: CLIPTextModel, t5_encoder_model: T5EncoderModel, transformer: FluxTransformer2DModel, vae: AutoencoderKL) -> Image2ImagePipeline:
+ ...
+ @staticmethod
def latent_consistency_model(scheduler: Scheduler, clip_text_model: CLIPTextModel, unet: UNet2DConditionModel, vae: AutoencoderKL) -> Image2ImagePipeline:
...
@staticmethod
diff --git a/src/python/py_image_generation_pipelines.cpp b/src/python/py_image_generation_pipelines.cpp
index b011aee878..dcc50234ed 100644
--- a/src/python/py_image_generation_pipelines.cpp
+++ b/src/python/py_image_generation_pipelines.cpp
@@ -330,6 +330,7 @@ void init_image_generation_pipelines(py::module_& m) {
.def_static("stable_diffusion", &ov::genai::Image2ImagePipeline::stable_diffusion, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
.def_static("latent_consistency_model", &ov::genai::Image2ImagePipeline::latent_consistency_model, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("unet"), py::arg("vae"))
.def_static("stable_diffusion_xl", &ov::genai::Image2ImagePipeline::stable_diffusion_xl, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("clip_text_model_with_projection"), py::arg("unet"), py::arg("vae"))
+ .def_static("flux", &ov::genai::Image2ImagePipeline::flux, py::arg("scheduler"), py::arg("clip_text_model"), py::arg("t5_encoder_model"), py::arg("transformer"), py::arg("vae"))
.def(
"compile",
[](ov::genai::Image2ImagePipeline& pipe,
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 1ad8236058..3edcc70636 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -103,8 +103,11 @@ def test_image_model_types(model_id, model_type, backend):
])),
)
def test_image_model_genai(model_id, model_type):
- if ("flux" in model_id or "stable-diffusion-3" in model_id) and model_type != "text-to-image":
- pytest.skip(reason="FLUX or SD3 are supported as text to image only")
+ if ("stable-diffusion-3" in model_id) and model_type != "text-to-image":
+ pytest.skip(reason="SD3 is supported as text to image only")
+
+ if ("flux" in model_id) and model_type == "image-inpainting":
+ pytest.skip(reason="FLUX is not yet supported as image inpainting")
with tempfile.TemporaryDirectory() as temp_dir:
GT_FILE = os.path.join(temp_dir, "gt.csv")
|