Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Concurrency in stable-diffusion image generation #1475

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions samples/cpp/image_generation/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,24 @@ install(TARGETS text2image
COMPONENT samples_bin
EXCLUDE_FROM_ALL)

# create text2image sample executable with concurrency
# The target is built from text2image_concurrency.cpp together with imwrite.cpp.

add_executable(text2image_concurrency text2image_concurrency.cpp imwrite.cpp)

# Headers are searched in the top-level build directory and next to the sample sources.
target_include_directories(text2image_concurrency PRIVATE ${CMAKE_BINARY_DIR} "${CMAKE_CURRENT_SOURCE_DIR}")
target_link_libraries(text2image_concurrency PRIVATE openvino::genai)

set_target_properties(text2image_concurrency PROPERTIES
COMPILE_PDB_NAME text2image_concurrency
# Ensure out of box LC_RPATH on macOS with SIP
INSTALL_RPATH_USE_LINK_PATH ON)

# The binary belongs to the samples_bin component and, because of EXCLUDE_FROM_ALL,
# is left out of a default (full) install.
install(TARGETS text2image_concurrency
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)


# create LoRA sample executable

add_executable(lora_text2image lora_text2image.cpp imwrite.cpp)
Expand Down
88 changes: 88 additions & 0 deletions samples/cpp/image_generation/text2image_concurrency.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "openvino/genai/image_generation/text2image_pipeline.hpp"

#include "imwrite.hpp"

#include <chrono>
#include <exception>
#include <future>
#include <thread>


void runPipeline(std::string prompt, std::filesystem::path root_dir, ov::genai::CLIPTextModel & text_encoder, ov::genai::UNet2DConditionModel & unet, ov::genai::AutoencoderKL & vae, std::promise<ov::Tensor> & Tensor_prm){
std::cout << "create pipeline" << prompt << std::endl;
auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json");
auto pipe2 = ov::genai::Text2ImagePipeline::stable_diffusion(scheduler, text_encoder, unet, vae);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with this approach is that it will be hard to apply LoRA adapters here in the generic case.
E.g. SD 1.5 has a simple LoRA configuration, while FLUX and other more complex models require code like this: https://github.com/openvinotoolkit/openvino.genai/pull/1602/files

Alternative approach is to have API like

Text2ImagePipeline pipeline( .. ); // similar to compile model
Text2ImagePipeline::GenerationRequest request = pipeline.create_generation_request(); // holds inference request
request.update_generation_config(guidance_scale(5.0));
Tensor image = request.generate("cat", callback(my_callback));

Text2ImagePipeline::GenerationRequest request2 = pipeline.create_generation_request(); // holds inference request
request2.update_generation_config(guidance_scale(6.0));
Tensor image2 = request2.generate("cat", width(200), height(200));

In this case all complexity with LoRA is hidden inside and even clients can use the same API (e.g. generate different images with different LoRAs / alphas in parallel)

std::cout << "start generate " << prompt << std::endl;
try{
ov::Tensor image = pipe2.generate(prompt,
ov::genai::width(512),
ov::genai::height(512),
ov::genai::guidance_scale(0.75f),
ov::genai::num_inference_steps(10));
Tensor_prm.set_value(image);
std::cout << "finished generate" << std::endl;
}
catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
} catch (const std::ios_base::failure&) {}
} catch (...) {
try {
std::cerr << "Non-exception object thrown\n";
} catch (const std::ios_base::failure&) {}
}

}

int32_t main(int32_t argc, char* argv[]) try {
OPENVINO_ASSERT(argc == 2, "Usage: ", argv[0], " <MODEL_DIR>");

const std::string models_path = argv[1];
std::filesystem::path root_dir = models_path;
const std::string device = "CPU"; // GPU can be used as well
auto scheduler = ov::genai::Scheduler::from_config(root_dir / "scheduler/scheduler_config.json");
auto text_encoder = ov::genai::CLIPTextModel(root_dir / "text_encoder");
text_encoder.compile("CPU");
auto unet = ov::genai::UNet2DConditionModel(root_dir / "unet");
if (device == "NPU") {
// The max_position_embeddings config from text encoder will be used as a parameter to unet reshape.
int max_position_embeddings = text_encoder.get_config().max_position_embeddings;
unet.reshape(1, 512, 512, max_position_embeddings);
}
unet.compile("CPU");

auto vae = ov::genai::AutoencoderKL(root_dir / "vae_decoder");
vae.compile("CPU");
std::cout << "models loaded" << std::endl;

std::promise<ov::Tensor> Tensor1_prm;
std::promise<ov::Tensor> Tensor2_prm;

std::thread t1(&runPipeline, std::string("a bucket of red roses"), root_dir, std::ref(text_encoder), std::ref(unet), std::ref(vae), std::ref(Tensor1_prm));
std::thread t2(&runPipeline, std::string("a glass of water on a wooden table"), root_dir, std::ref(text_encoder), std::ref(unet), std::ref(vae), std::ref(Tensor2_prm));


std::cout << "threads started" << std::endl;
std::future<ov::Tensor> T1_ftr = Tensor1_prm.get_future();
std::future<ov::Tensor> T2_ftr = Tensor2_prm.get_future();

ov::Tensor image1 = T1_ftr.get();
ov::Tensor image2 = T2_ftr.get();
t1.join();
t2.join();
// writes `num_images_per_prompt` images by pattern name
imwrite("image1_%d.bmp", image1, true);
imwrite("image2_%d.bmp", image2, true);

return EXIT_SUCCESS;
} catch (const std::exception& error) {
try {
std::cerr << error.what() << '\n';
} catch (const std::ios_base::failure&) {}
return EXIT_FAILURE;
} catch (...) {
try {
std::cerr << "Non-exception object thrown\n";
} catch (const std::ios_base::failure&) {}
return EXIT_FAILURE;
}
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
std::vector<size_t> block_out_channels = { 64 };

explicit Config(const std::filesystem::path& config_path);
Config() = default;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is it required?

I think you can initialize m_config in copy constructor via constructor initializer list

};

explicit AutoencoderKL(const std::filesystem::path& vae_decoder_path);
Expand Down Expand Up @@ -140,6 +141,7 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {

Config m_config;
ov::InferRequest m_encoder_request, m_decoder_request;
std::shared_ptr<ov::CompiledModel> encoder_compiled_model, decoder_compiled_model;
std::shared_ptr<ov::Model> m_encoder_model = nullptr, m_decoder_model = nullptr;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
size_t num_hidden_layers = 12;

explicit Config(const std::filesystem::path& config_path);
Config() = default;
};

explicit CLIPTextModel(const std::filesystem::path& root_dir);
Expand Down Expand Up @@ -92,6 +93,7 @@ class OPENVINO_GENAI_EXPORTS CLIPTextModel {
Config m_config;
AdapterController m_adapter_controller;
ov::InferRequest m_request;
std::shared_ptr<ov::CompiledModel> compiled_model;
std::shared_ptr<ov::Model> m_model;

Tokenizer m_clip_tokenizer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
int time_cond_proj_dim = -1;

explicit Config(const std::filesystem::path& config_path);
Config() = default;
};

explicit UNet2DConditionModel(const std::filesystem::path& root_dir);
Expand Down Expand Up @@ -95,6 +96,7 @@ class OPENVINO_GENAI_EXPORTS UNet2DConditionModel {
return guidance_scale > 1.0f && m_config.time_cond_proj_dim < 0;
}


private:
class UNetInference;
std::shared_ptr<UNetInference> m_impl;
Expand Down
24 changes: 17 additions & 7 deletions src/cpp/src/image_generation/models/autoencoder_kl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,17 @@ AutoencoderKL::AutoencoderKL(const std::string& vae_encoder_model,
}
}

AutoencoderKL::AutoencoderKL(const AutoencoderKL&) = default;
// Copy constructor: the copy shares the compiled models with the source.
AutoencoderKL::AutoencoderKL(const AutoencoderKL& original_model){
encoder_compiled_model = original_model.encoder_compiled_model;
decoder_compiled_model = original_model.decoder_compiled_model;
// The source may not be compiled yet, in which case there is no compiled
// decoder to create a request from and m_decoder_request stays empty.
if (decoder_compiled_model) {
    m_decoder_request = decoder_compiled_model->create_infer_request();
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what if model is not compiled yet?

// The encoder is optional: create its request only when a compiled encoder was
// copied above. The check must use the compiled model, not m_encoder_model,
// which is still null at this point of the constructor.
if (encoder_compiled_model) {
    m_encoder_request = encoder_compiled_model->create_infer_request();
}
m_encoder_model = original_model.m_encoder_model;
m_decoder_model = original_model.m_decoder_model;
m_config = original_model.m_config;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not look safe for the copy constructor to create infer requests. We have code like:

    StableDiffusionPipeline(
        PipelineType pipeline_type,
        const CLIPTextModel& clip_text_model,
        const UNet2DConditionModel& unet,
        const AutoencoderKL& vae)
        : StableDiffusionPipeline(pipeline_type) {
        m_clip_text_encoder = std::make_shared<CLIPTextModel>(clip_text_model); // LEADS TO RE_CREATION OF REQUEST
        m_unet = std::make_shared<UNet2DConditionModel>(unet); // LEADS TO RE_CREATION OF REQUEST
        m_vae = std::make_shared<AutoencoderKL>(vae); // LEADS TO RE_CREATION OF REQUEST

        const bool is_lcm = m_unet->get_config().time_cond_proj_dim > 0;
        const char * const pipeline_name = is_lcm ? "LatentConsistencyModelPipeline" : "StableDiffusionPipeline";
        initialize_generation_config(pipeline_name);
    }

which means the inference request will be re-created, although that is not our goal.

}

AutoencoderKL& AutoencoderKL::reshape(int batch_size, int height, int width) {
OPENVINO_ASSERT(m_decoder_model, "Model has been already compiled. Cannot reshape already compiled model");
Expand Down Expand Up @@ -207,16 +217,16 @@ AutoencoderKL& AutoencoderKL::compile(const std::string& device, const ov::AnyMa
ov::Core core = utils::singleton_core();

if (m_encoder_model) {
ov::CompiledModel encoder_compiled_model = core.compile_model(m_encoder_model, device, properties);
ov::genai::utils::print_compiled_model_properties(encoder_compiled_model, "Auto encoder KL encoder model");
m_encoder_request = encoder_compiled_model.create_infer_request();
encoder_compiled_model = std::make_shared<ov::CompiledModel>(core.compile_model(m_encoder_model, device, properties));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ov::CompiledModel is already a shared object by its implementation: it is a wrapper around p_impl.

ov::genai::utils::print_compiled_model_properties(*encoder_compiled_model, "Auto encoder KL encoder model");
m_encoder_request = encoder_compiled_model->create_infer_request();
// release the original model
m_encoder_model.reset();
}

ov::CompiledModel decoder_compiled_model = core.compile_model(m_decoder_model, device, properties);
ov::genai::utils::print_compiled_model_properties(decoder_compiled_model, "Auto encoder KL decoder model");
m_decoder_request = decoder_compiled_model.create_infer_request();
decoder_compiled_model = std::make_shared<ov::CompiledModel>(core.compile_model(m_decoder_model, device, properties));
ov::genai::utils::print_compiled_model_properties(*decoder_compiled_model, "Auto encoder KL decoder model");
m_decoder_request = decoder_compiled_model->create_infer_request();
// release the original model
m_decoder_model.reset();

Expand Down
19 changes: 13 additions & 6 deletions src/cpp/src/image_generation/models/clip_text_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,15 @@ CLIPTextModel::CLIPTextModel(const std::string& model,
compile(device, properties);
}

CLIPTextModel::CLIPTextModel(const CLIPTextModel&) = default;
// Copy constructor: the copy shares the compiled model with `origin_model` but
// owns a separate infer request. Members are initialized in declaration order;
// m_request is left default-constructed here and filled in the body.
CLIPTextModel::CLIPTextModel(const CLIPTextModel& origin_model)
    : m_config(origin_model.m_config),
      m_adapter_controller(origin_model.m_adapter_controller),
      compiled_model(origin_model.compiled_model),
      m_model(origin_model.m_model),
      m_clip_tokenizer(origin_model.m_clip_tokenizer) {
    // The source may not be compiled yet; then there is nothing to create a
    // request from and m_request stays empty until compile() is called.
    if (compiled_model) {
        m_request = compiled_model->create_infer_request();
    }
}


const CLIPTextModel::Config& CLIPTextModel::get_config() const {
return m_config;
Expand All @@ -86,17 +94,16 @@ CLIPTextModel& CLIPTextModel::reshape(int batch_size) {
CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMap& properties) {
OPENVINO_ASSERT(m_model, "Model has been already compiled. Cannot re-compile already compiled model");
ov::Core core = utils::singleton_core();
ov::CompiledModel compiled_model;
std::optional<AdapterConfig> adapters;
if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te"));
m_adapter_controller = AdapterController(m_model, *adapters, device);
compiled_model = core.compile_model(m_model, device, *filtered_properties);
compiled_model = std::make_shared<ov::CompiledModel>(core.compile_model(m_model, device, *filtered_properties));
} else {
compiled_model = core.compile_model(m_model, device, properties);
compiled_model = std::make_shared<ov::CompiledModel>(core.compile_model(m_model, device, properties));
}
ov::genai::utils::print_compiled_model_properties(compiled_model, "Clip Text model");
m_request = compiled_model.create_infer_request();
ov::genai::utils::print_compiled_model_properties(*compiled_model, "Clip Text model");
m_request = compiled_model->create_infer_request();
// release the original model
m_model.reset();

Expand Down
13 changes: 12 additions & 1 deletion src/cpp/src/image_generation/models/unet2d_condition_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,18 @@ UNet2DConditionModel::UNet2DConditionModel(const std::string& model,
compile(device, properties);
}

UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel&) = default;
// Copy constructor: copies the configuration and creates a new inference
// implementation of the same kind as the source, built on the source's compiled model.
UNet2DConditionModel::UNet2DConditionModel(const UNet2DConditionModel& original_model) {
    m_config = original_model.m_config;
    m_adapter_controller = original_model.m_adapter_controller;
    m_model = original_model.m_model;
    m_vae_scale_factor = original_model.m_vae_scale_factor;

    // Nothing to clone when the source has no inference implementation yet.
    if (!original_model.m_impl) {
        return;
    }

    const std::shared_ptr<ov::CompiledModel> source_compiled_model = original_model.m_impl->get_compiled_model();
    if (!source_compiled_model) {
        // NOTE(review): an implementation without a compiled model is treated like
        // "not compiled yet" and m_impl is left empty - confirm this is the desired behaviour.
        return;
    }

    // Dispatch on the dynamic type of the source implementation. typeid(m_impl)
    // would describe the shared_ptr itself and never match a UNetInference subclass.
    const bool source_is_static_bs1 =
        dynamic_cast<const UNet2DConditionModel::UNetInferenceStaticBS1*>(original_model.m_impl.get()) != nullptr;
    if (source_is_static_bs1) {
        m_impl = std::make_shared<UNet2DConditionModel::UNetInferenceStaticBS1>(source_compiled_model);
    } else {
        m_impl = std::make_shared<UNet2DConditionModel::UNetInferenceDynamic>(source_compiled_model);
    }
}

const UNet2DConditionModel::Config& UNet2DConditionModel::get_config() const {
return m_config;
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/src/image_generation/models/unet_inference.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class UNet2DConditionModel::UNetInference {
virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) = 0;
virtual void set_adapters(AdapterController& adapter_controller, const AdapterConfig& adapters) = 0;
virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) = 0;
virtual std::shared_ptr<ov::CompiledModel> get_compiled_model() = 0;

// utility function to resize model given optional dimensions.
static void reshape(std::shared_ptr<ov::Model> model,
Expand Down Expand Up @@ -62,6 +63,8 @@ class UNet2DConditionModel::UNetInference {

model->reshape(name_to_shape);
}
UNetInference(const UNetInference & );
UNetInference() = default;
};

} // namespace genai
Expand Down
29 changes: 22 additions & 7 deletions src/cpp/src/image_generation/models/unet_inference_dynamic.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,14 @@ namespace genai {

class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::UNetInference {
public:
virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override {
ov::CompiledModel compiled_model = utils::singleton_core().compile_model(model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition dynamic model");
m_request = compiled_model.create_infer_request();

virtual void compile(std::shared_ptr<ov::Model> model, const std::string& device, const ov::AnyMap& properties) override
{
ov::Core core = utils::singleton_core();

compiled_model = std::make_shared<ov::CompiledModel>(utils::singleton_core().compile_model(model, device, properties));
ov::genai::utils::print_compiled_model_properties(*compiled_model, "UNet 2D Condition dynamic model");
m_request = compiled_model->create_infer_request();
}

virtual void set_hidden_states(const std::string& tensor_name, ov::Tensor encoder_hidden_states) override {
Expand All @@ -30,17 +34,28 @@ class UNet2DConditionModel::UNetInferenceDynamic : public UNet2DConditionModel::

// Runs one UNet inference: binds `sample` and `timestep` to the inputs of the
// same names, executes the request and returns its output tensor.
// Asserts when the model has not been compiled (the request is empty).
virtual ov::Tensor infer(ov::Tensor sample, ov::Tensor timestep) override {
    OPENVINO_ASSERT(m_request, "UNet model must be compiled first. Cannot infer non-compiled model");

    m_request.set_tensor("sample", sample);
    m_request.set_tensor("timestep", timestep);

    // No property dump here: this method is called for every inference, so
    // compiled-model properties are printed once at compile time instead.
    m_request.infer();

    return m_request.get_output_tensor();
}

// Builds an inference object on top of an already compiled model: the compiled
// model is shared with the caller and a new infer request is created from it.
// Asserts when `origin_compiled_model` is null, as the static batch-1 variant does.
UNetInferenceDynamic(const std::shared_ptr<ov::CompiledModel>& origin_compiled_model) {
    OPENVINO_ASSERT(origin_compiled_model, "Source model must be compiled first");
    compiled_model = origin_compiled_model;
    m_request = compiled_model->create_infer_request();
}

UNetInferenceDynamic() = default;

std::shared_ptr<ov::CompiledModel> get_compiled_model(){
return compiled_model;
}

private:
ov::InferRequest m_request;
std::shared_ptr<ov::CompiledModel> compiled_model;
};

} // namespace genai
Expand Down
25 changes: 21 additions & 4 deletions src/cpp/src/image_generation/models/unet_inference_static_bs1.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ namespace genai {
// Static Batch-Size 1 variant of UNetInference
class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel::UNetInference {
public:
UNetInferenceStaticBS1() = default;

// Builds an inference object on top of an already compiled model: the compiled
// model is shared with the caller and one infer request is created per batch element.
// Asserts when `origin_compiled_model` is null.
UNetInferenceStaticBS1(const std::shared_ptr<ov::CompiledModel> & origin_compiled_model){
    OPENVINO_ASSERT(origin_compiled_model, "Source model must be compiled first");
    compiled_model = origin_compiled_model;
    // NOTE(review): compile() reshapes the model with UNetInference::reshape(model, 1)
    // before compiling, so the batch dimension read from the compiled model is
    // presumably 1 rather than the original native batch size - confirm, and pass
    // the native batch size in explicitly if it has to be preserved.
    m_native_batch_size = compiled_model->input("sample").get_shape()[0];
    // m_requests starts empty here, so requests are appended; writing
    // m_requests[i] would be out of bounds.
    m_requests.reserve(m_native_batch_size);
    for (size_t request_idx = 0; request_idx < m_native_batch_size; ++request_idx) {
        m_requests.push_back(compiled_model->create_infer_request());
    }
}

virtual void compile(std::shared_ptr<ov::Model> model,
const std::string& device,
const ov::AnyMap& properties) override {
Expand All @@ -39,11 +50,12 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel
UNetInference::reshape(model, 1);

ov::Core core = utils::singleton_core();
ov::CompiledModel compiled_model = core.compile_model(model, device, properties);
ov::genai::utils::print_compiled_model_properties(compiled_model, "UNet 2D Condition batch-1 model");
compiled_model = std::make_shared<ov::CompiledModel>(core.compile_model(model, device, properties));
ov::genai::utils::print_compiled_model_properties(*compiled_model, "UNet 2D Condition batch-1 model");

for (int i = 0; i < m_native_batch_size; i++) {
m_requests[i] = compiled_model.create_infer_request();
for (int i = 0; i < m_native_batch_size; i++)
{
m_requests[i] = compiled_model->create_infer_request();
}
}

Expand Down Expand Up @@ -135,10 +147,15 @@ class UNet2DConditionModel::UNetInferenceStaticBS1 : public UNet2DConditionModel

return out_sample;
}
std::shared_ptr<ov::CompiledModel> get_compiled_model(){
return compiled_model;
}

private:
std::shared_ptr<ov::CompiledModel> compiled_model;
std::vector<ov::InferRequest> m_requests;
size_t m_native_batch_size = 0;

};

} // namespace genai
Expand Down
6 changes: 6 additions & 0 deletions src/cpp/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,12 @@ void print_compiled_model_properties(ov::CompiledModel& compiled_Model, const ch
std::cout << " " << cfg << ": " << prop.as<std::string>() << std::endl;
}
}
for (const auto& input : compiled_Model.inputs()) {
std::cout << "Input name: " << input.get_any_name() << ", shape: " << input.get_partial_shape().to_string() << std::endl;
}
for (const auto& out : compiled_Model.outputs()) {
std::cout << "Output name: " << out.get_any_name() << ", shape: " << out.get_partial_shape().to_string() << std::endl;
}

ov::Core core;
std::vector<std::string> exeTargets;
Expand Down
Loading