From 88d42d8fc30fd2ea50f12c9f345b1e06fe248cfa Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 1 Nov 2024 17:50:56 +0400 Subject: [PATCH 01/28] Phi-3.5-vision-instruc --- src/cpp/src/visual_language/clip.cpp | 2 +- src/cpp/src/visual_language/clip.hpp | 1 + .../src/visual_language/inputs_embedder.cpp | 121 ++++++++++ .../src/visual_language/inputs_embedder.hpp | 1 + .../src/visual_language/processor_config.cpp | 4 + .../src/visual_language/processor_config.hpp | 8 +- .../src/visual_language/vision_encoder.cpp | 211 ++++++++++++++++++ .../src/visual_language/vision_encoder.hpp | 4 + .../src/visual_language/vlm_model_type.hpp | 4 +- src/docs/SUPPORTED_MODELS.md | 10 + 10 files changed, 363 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp index d7b3c6fb05..c02201ab80 100644 --- a/src/cpp/src/visual_language/clip.cpp +++ b/src/cpp/src/visual_language/clip.cpp @@ -28,7 +28,7 @@ inline float clip_lerp(float s, float e, float t) { return s + (e - s) * t; } // Bilinear resize function -static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height) { dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); diff --git a/src/cpp/src/visual_language/clip.hpp b/src/cpp/src/visual_language/clip.hpp index 9494a48fd9..6c7acc24ab 100644 --- a/src/cpp/src/visual_language/clip.hpp +++ b/src/cpp/src/visual_language/clip.hpp @@ -33,6 +33,7 @@ struct clip_image_f32 { bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); bool bicubic_resize(const clip_image_u8& img, clip_image_u8& dst, int target_width, int target_height); +void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int target_width, int target_height); /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */ clip_image_f32 clip_image_preprocess(struct clip_ctx& ctx, const clip_image_u8& img); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b01f45917b..50fa57aa7b 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1007,6 +1007,125 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } }; +class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { +public: + InputsEmbedderPhi3V( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ) : IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0} {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { + std::string images_prompt; + std::vector embeds; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + } + ov::Tensor inputs_embeds; + // if (m_vlm_config.use_image_id) { + // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; + // ++m_image_id; + // } + // std::string unk64; + // for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + // unk64 += m_vlm_config.unk; + // } + // images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + 
// if (encoded_image.slices) { + // ov::Shape slices_shape = encoded_image.slices.get_shape(); + // for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + // for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + // images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + // } + // images_prompt += '\n'; + // } + // } + // if ('\n' != *(images_prompt.end() - 1)) { + // // Image wasn't sliced, add \n to the end of image anyway. + // // Strangely, \n isn't placed between . + // images_prompt += '\n'; + // } + // embeds.push_back(std::move(encoded_image)); + // } + // images_prompt += prompt; + + // ov::Tensor encoded_input = get_encoded_input_ids(images_prompt); + + // ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); + // OPENVINO_ASSERT( + // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + // "Unexpected embedding size" + // ); + // ov::Tensor special_tokens = m_tokenizer.encode( + // m_vlm_config.im_start + // + m_vlm_config.im_end + // + m_vlm_config.slice_start + // + m_vlm_config.slice_end + // ).input_ids; + // OPENVINO_ASSERT( + // 4 == special_tokens.get_shape().at(1), + // "Every special token must be represented with a single int." + // ); + // int64_t im_start_id = special_tokens.data()[0]; + // int64_t im_end_id = special_tokens.data()[1]; + // int64_t slice_start_id = special_tokens.data()[2]; + // int64_t slice_end_id = special_tokens.data()[3]; + // int64_t im_start_pos = 0, slice_start_pos = 0; + // int64_t* begin = encoded_input.data(); + // int64_t* ids = begin; + // size_t encoded_input_size = encoded_input.get_size(); + // int64_t* end = ids + encoded_input_size; + // float* inputs_embeds_data = inputs_embeds.data(); + // for (const EncodedImage& encoded_image : embeds) { + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + // float* emb = resampled_source.data(); + // ids = std::find(ids, end, im_start_id); + // OPENVINO_ASSERT(end != ids); + // ++ids; + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // if (encoded_image.slices) { + // size_t token_idx = 0; + // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); + // for (size_t i = 0; i < slices_shape.at(0); ++i) { + // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + // size_t d2 = slices_shape.at(2); + // size_t d3 = slices_shape.at(3); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); + // ids = std::find(ids, end, slice_start_id); + // OPENVINO_ASSERT(end != ids); + // ++ids; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // } + // } + // } + // } + + if (!m_is_chat_conversation) { + m_image_id = 0; + } + + return inputs_embeds; + } + + virtual void start_chat(const std::string& system_message) override { + IInputsEmbedder::start_chat(system_message); + m_image_id = 0; + } + + virtual void finish_chat() override { + IInputsEmbedder::finish_chat(); + m_image_id = 0; + } + +private: + // Used to insert <|image_i|>\n per image (not a slice). 
+ size_t m_image_id; +}; + InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::filesystem::path& model_dir, const std::string& device, @@ -1019,6 +1138,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); + } else if (vlm_config.model_type == VLMModelType::PHI3_V) { + m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 15df273ee8..8e4442c407 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -45,6 +45,7 @@ class InputsEmbedder { friend class InputsEmbedderLLaVA; friend class InputsEmbedderLLaVANext; friend class InputsEmbedderInternVLChat; + friend class InputsEmbedderPhi3V; }; } // namespace ov::genai diff --git a/src/cpp/src/visual_language/processor_config.cpp b/src/cpp/src/visual_language/processor_config.cpp index 7b953e5bed..e0d29a02c4 100644 --- a/src/cpp/src/visual_language/processor_config.cpp +++ b/src/cpp/src/visual_language/processor_config.cpp @@ -41,4 +41,8 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("image_grid_pinpoints")) { image_grid_pinpoints = parsed.at("image_grid_pinpoints").get>>(); } + read_json_param(parsed, "num_crops", phi3_v.num_crops); + if (parsed.contains("img_processor")) { + phi3_v.num_img_tokens = parsed.at("img_processor").at("num_img_tokens"); + } } diff --git a/src/cpp/src/visual_language/processor_config.hpp b/src/cpp/src/visual_language/processor_config.hpp index 83cf9870a3..c7eac68204 100644 --- a/src/cpp/src/visual_language/processor_config.hpp +++ b/src/cpp/src/visual_language/processor_config.hpp @@ -35,9 +35,10 @@ class ProcessorConfig { /// llava calls it image_std. std::array norm_std{1.0f, 1.0f, 1.0f}; - // llava specific config params + // A renamed version of norm_mean. std::array image_mean{0.0f, 0.0f, 0.0f}; std::array image_std{1.0f, 1.0f, 1.0f}; + // llava specific config params size_t crop_size_height = 336; size_t crop_size_width = 336; size_t size_shortest_edge = 336; @@ -45,6 +46,11 @@ class ProcessorConfig { // llava-next specific config params std::vector> image_grid_pinpoints{{336, 672}, {672, 336}, {672, 672}, {1008, 336}, {336, 1008}}; + struct { + size_t num_crops = 4; + size_t num_img_tokens = 144; + } phi3_v; + /// @brief Default constructor ProcessorConfig() = default; /// @brief Construct ProcessorConfig from values in json_path. 
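For orientation, the two phi3_v fields added above are what later sizes the per-image feature sequence. A minimal sketch (the free function and its name are illustrative and not part of this patch; it only restates the arithmetic that appears in get_pixel_values_phi3_v further down), assuming height and width are the HD-transformed dimensions already padded to multiples of 336:

    #include <cmath>
    #include <cstddef>

    // 144 and 336 correspond to the num_img_tokens default above and the
    // INPUT_IMAGE_SIZE constant introduced in vision_encoder.cpp below.
    size_t phi3_v_prompt_tokens_per_image(size_t height, size_t width, size_t num_img_tokens = 144) {
        const size_t tile = 336;
        size_t rows = height / tile, cols = width / tile;
        size_t side = static_cast<size_t>(std::sqrt(num_img_tokens)); // 12 when num_img_tokens == 144
        return rows * cols * num_img_tokens + 1 + (rows + 1) * side;
    }
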
diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 0b6b169f18..98705f63d0 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -613,8 +613,209 @@ ov::Tensor get_pixel_values_internvl(const ov::Tensor& image, const ProcessorCon } return output_tensor; } + +namespace phi3_v { +constexpr size_t INPUT_IMAGE_SIZE = 336; + +ov::Tensor padding_336(const ov::Tensor& unpadded) { + ov::Shape _1ss3 = unpadded.get_shape(); + size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2); + // TODO: test horizontal and vertical images + if (s1 < s2) { + size_t tar = size_t(std::ceil(float(s1) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); + size_t top_padding = (tar - s1) / 2; + ov::Tensor padded{ov::element::u8, {1, tar, s2, 3}}; + uint8_t* padded_data = padded.data(); + std::fill_n(padded_data, padded.get_size(), 255); + std::copy_n(unpadded.data(), unpadded.get_size(), padded_data + top_padding * s2 * 3); + return padded; + } + size_t tar = size_t(std::ceil(float(s2) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); + size_t left_padding = (tar - s2) / 2; + ov::Tensor padded{ov::element::u8, {1, s1, tar, 3}}; + uint8_t* padded_data = padded.data(); + std::fill_n(padded_data, padded.get_size(), 255); + uint8_t* unpadded_data = unpadded.data(); + for (size_t row = 0; row < s1; ++row) { + std::copy_n(unpadded_data + row * s2 * 3, s2 * 3, padded_data + row * tar * 3 + left_padding * 3); + } + return padded; +} + +ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { + ov::Shape _1hwc = uint8.get_shape(); + size_t height = _1hwc.at(1), width = _1hwc.at(2); + bool trans = false; + if (width < height) { + std::swap(height, width); + trans = true; + } + float ratio = float(width) / height; + unsigned scale = 1; + while (scale * std::ceil(scale / ratio) <= num_crops) { + ++scale; + } + --scale; + size_t new_w = scale * INPUT_IMAGE_SIZE; + size_t new_h = new_w / ratio; + clip_image_u8 src{}, dst{}; + uint8_t* uint8_data = uint8.data(); + if (trans) { + src = clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; + bilinear_resize(src, dst, new_h, new_w); + // std::cout << new_h << ' ' << new_w << '\n'; + return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); + } + src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; + bilinear_resize(src, dst, new_w, new_h); + // std::cout << new_w << ' ' << new_h << '\n'; + // 672, 448 + return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); +} + +ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) { + uint8_t* uint_8_data = uint8.data(); + ov::Tensor float_normalized{ov::element::f32, uint8.get_shape()}; + float* float_data = float_normalized.data(); + OPENVINO_ASSERT(0 == uint8.get_size() % 3, "RGB"); + for (size_t idx = 0; idx < uint8.get_size(); idx += 3) { + float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.norm_mean[0]) / config.norm_std[0]; + float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.norm_mean[1]) / config.norm_std[1]; + float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.norm_mean[2]) / config.norm_std[2]; + } + return float_normalized; } +ov::Tensor channels_first(const ov::Tensor& _1hw3) { + ov::Shape shape = _1hw3.get_shape(); + ov::Tensor _13hw = ov::Tensor{ov::element::f32, {1, 3, shape.at(1), shape.at(2)}}; + float* _1hw3_data = _1hw3.data(); + float* _13hw_data = 
_13hw.data(); + for (size_t plane = 0; plane < 3; ++plane) { + for (size_t row = 0; row < shape.at(1); ++row) { + for (size_t col = 0; col < shape.at(2); ++col) { + _13hw_data[plane * shape.at(1) * shape.at(2) + row * shape.at(2) + col] = _1hw3_data[row * shape.at(2) * 3 + col * 3 + plane]; + } + } + } + return _13hw; +} + +// Reimplementation of Python im.reshape(1, 3, h//336, 336, w//336, 336).permute(0,2,4,1,3,5).reshape(-1, 3, 336, 336) +ov::Tensor slice_image(const ov::Tensor& image) { + ov::Shape shape = image.get_shape(); + size_t N = shape[0]; + size_t C = shape[1]; + size_t H = shape[2]; + size_t W = shape[3]; + + size_t num_h_slices = H / INPUT_IMAGE_SIZE; + size_t num_w_slices = W / INPUT_IMAGE_SIZE; + + // Step 1: Define and populate the reshaped tensor in the correct shape order + ov::Tensor reshaped{ov::element::f32, {N, num_h_slices, num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}}; + float* reshaped_data = reshaped.data(); + float* image_data = image.data(); + + // Populate the reshaped tensor + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < num_h_slices; ++h) { + for (size_t w = 0; w < num_w_slices; ++w) { + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) { + for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) { + size_t src_idx = n * C * H * W + c * H * W + (h * INPUT_IMAGE_SIZE + i) * W + (w * INPUT_IMAGE_SIZE + j); + size_t dst_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + reshaped_data[dst_idx] = image_data[src_idx]; + } + } + } + } + } + } + + // Step 2: Define the permuted tensor in the final shape + ov::Tensor permuted{ov::element::f32, {N * num_h_slices * num_w_slices, C, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE}}; + float* permuted_data = permuted.data(); + + // Perform permutation by flattening N, num_h_slices, and num_w_slices + for (size_t n = 0; n < N; ++n) { + for (size_t h = 0; h < num_h_slices; ++h) { + for (size_t w = 0; w < num_w_slices; ++w) { + for (size_t c = 0; c < C; ++c) { + for (size_t i = 0; i < INPUT_IMAGE_SIZE; ++i) { + for (size_t j = 0; j < INPUT_IMAGE_SIZE; ++j) { + size_t src_idx = n * num_h_slices * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + h * num_w_slices * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + w * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + size_t dst_idx = (n * num_h_slices * num_w_slices + h * num_w_slices + w) * C * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + c * INPUT_IMAGE_SIZE * INPUT_IMAGE_SIZE + + i * INPUT_IMAGE_SIZE + j; + permuted_data[dst_idx] = reshaped_data[src_idx]; + } + } + } + } + } + } + + return permuted; +} + +ov::Tensor concatenate_batch(const ov::Tensor& float_first, const ov::Tensor& float_second) { + ov::Shape shape_first = float_first.get_shape(); + ov::Shape shape_second = float_second.get_shape(); + OPENVINO_ASSERT(shape_first.at(1) == shape_second.at(1), "Channels must be the same"); + OPENVINO_ASSERT(shape_first.at(2) == shape_second.at(2), "Height must be the same"); + OPENVINO_ASSERT(shape_first.at(3) == shape_second.at(3), "Width must be the same"); + ov::Tensor concatenated{ov::element::f32, {shape_first.at(0) + shape_second.at(0), shape_first.at(1), shape_first.at(2), shape_first.at(3)}}; + float* concatenated_data = concatenated.data(); + float* first_data = 
float_first.data(); + float* second_data = float_second.data(); + std::copy(first_data, first_data + float_first.get_size(), concatenated_data); + std::copy(second_data, second_data + float_second.get_size(), concatenated_data + float_first.get_size()); + return concatenated; +} + +ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) { + ov::Shape shape = nchw.get_shape(); + size_t num_crops = shape[0]; + if (num_crops >= max_crops) { + return nchw; + } + ov::Tensor padded{ov::element::f32, {max_crops, shape[1], shape[2], shape[3]}}; + float* padded_data = padded.data(); + float* nchw_data = nchw.data(); + std::copy_n(nchw_data, nchw.get_size(), padded_data); + return padded; +} + +std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { + ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. + ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; + clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; + clip_image_u8 dst; + bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE); + ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()}; + global_image = mean_scale(global_image, config); + hd_image = mean_scale(hd_image, config); + global_image = channels_first(global_image); + hd_image = channels_first(hd_image); + ov::Tensor slices = slice_image(hd_image); + ov::Tensor concatenated = concatenate_batch(global_image, slices); + ov::Tensor pixel_values = pad_to_max_num_crops_tensor(concatenated, config.phi3_v.num_crops); + size_t num_img_tokens = (image_size.height / INPUT_IMAGE_SIZE) * (image_size.width / INPUT_IMAGE_SIZE) * config.phi3_v.num_img_tokens + 1 + (image_size.height / INPUT_IMAGE_SIZE + 1) * size_t(std::sqrt(config.phi3_v.num_img_tokens)); + return {std::move(pixel_values), image_size, num_img_tokens}; +} +} // namespace phi3_v +} // anonymous namespace + VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); @@ -632,6 +833,8 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfi return encode_llava_next(image, config); } else if (model_type == VLMModelType::INTERNVL_CHAT) { return encode_internvl(image, config); + } else if (model_type == VLMModelType::PHI3_V) { + return encode_phi3_v(image, config); } else { OPENVINO_THROW("Unsupported type of VisionEncoder"); } @@ -705,3 +908,11 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce return {std::move(image_features), resized_source_size}; } + +EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { + auto [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + // m_vision_encoder.set_input_tensor(); + m_vision_encoder.infer(); + + return {}; +} diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index a95abb838c..ffb5e82d44 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -142,5 +142,9 @@ class 
VisionEncoder { EncodedImage encode_internvl( const ov::Tensor& image, const ProcessorConfig& config ); + + EncodedImage encode_phi3_v( + const ov::Tensor& image, const ProcessorConfig& config + ); }; } diff --git a/src/cpp/src/visual_language/vlm_model_type.hpp b/src/cpp/src/visual_language/vlm_model_type.hpp index e4b5e823b6..86b23f50f8 100644 --- a/src/cpp/src/visual_language/vlm_model_type.hpp +++ b/src/cpp/src/visual_language/vlm_model_type.hpp @@ -16,6 +16,7 @@ enum class VLMModelType { LLAVA, LLAVA_NEXT, INTERNVL_CHAT, + PHI3_V, }; inline VLMModelType to_vlm_model_type(const std::string& value) { @@ -23,7 +24,8 @@ inline VLMModelType to_vlm_model_type(const std::string& value) { {"minicpmv", VLMModelType::MINICPM}, {"llava", VLMModelType::LLAVA}, {"llava_next", VLMModelType::LLAVA_NEXT}, - {"internvl_chat", VLMModelType::INTERNVL_CHAT} + {"internvl_chat", VLMModelType::INTERNVL_CHAT}, + {"phi3_v", VLMModelType::PHI3_V} }; auto it = model_types_map.find(value); diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index af3f8c064a..e36f15324d 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -257,6 +257,16 @@ The pipeline can work with other similar topologies produced by `optimum-intel` + + Phi3VForCausalLM + phi3_v + + + + From 9d7c7a0ad523ca1026e09cca8ad76462ed93886e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 7 Nov 2024 16:05:17 +0400 Subject: [PATCH 02/28] encode --- .../visual_language_chat.cpp | 22 +++++++++---------- .../src/visual_language/vision_encoder.cpp | 8 +++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 3a655374e9..7e334f7502 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) try { +int main(int argc, char* argv[]) { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) try { "question:\n"; } pipe.finish_chat(); -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; +// } catch (const std::exception& error) { +// try { +// std::cerr << error.what() << '\n'; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; +// } catch (...) 
{ +// try { +// std::cerr << "Non-exception object thrown\n"; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 610d9dee67..90d16f743e 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -905,9 +905,9 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce } EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - auto [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); - // m_vision_encoder.set_input_tensor(); + // TODO: drop num_img_tokens + const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - - return {}; + return {m_vision_encoder.get_output_tensor(), image_size}; } From 21dc4984ff08db835bead81c7519025d15d100d9 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 27 Nov 2024 13:34:02 +0400 Subject: [PATCH 03/28] Add hd_feature_transformer --- .../src/visual_language/inputs_embedder.cpp | 196 +++++++++++++++++- .../src/visual_language/vision_encoder.cpp | 2 + 2 files changed, 194 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index c6c9f68ed7..e6fefe2d44 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -6,6 +6,7 @@ #include "visual_language/clip.hpp" #include "visual_language/vision_encoder.hpp" #include "visual_language/embedding_model.hpp" +#include "openvino/opsets/opset13.hpp" #include "utils.hpp" @@ -1006,20 +1007,207 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { } }; +namespace { +namespace phi3_v { +ov::InferRequest create_hd_feature_transformer() { + using namespace ov; + using namespace element; + using namespace opset13; + using namespace std; + auto t0 = make_shared(f32, PartialShape{-1, 576, 1024}); + auto t1 = make_shared(i32, PartialShape{}); + auto t2 = make_shared(i32, PartialShape{}); + auto t3 = make_shared(t0); + auto t4 = make_shared(i64, Shape{}, vector{0}); + auto t5 = make_shared(i64, Shape{}, vector{0}); + auto t6 = make_shared(t3, t4, t5); + auto t7 = make_shared(i64, Shape{1}, vector{1}); + auto t8 = make_shared(t6, t7, false); + auto t9 = make_shared(i64, Shape{}, vector{1}); + auto t10 = make_shared(i64, Shape{}, vector{0}); + auto t11 = make_shared(t3, t9, t10); + auto t12 = make_shared(t11, element::f32); + auto t13 = make_shared(f32, Shape{}, vector{0.5}); + auto t14 = make_shared(t12, t13, "numpy"); + auto t15 = make_shared(t14, element::i32); + auto t16 = make_shared(t15, element::i64); + auto t17 = make_shared(i32, Shape{}, vector{0}); + auto t18 = make_shared(t16, t17); + auto t19 = make_shared(i64, Shape{1}, vector{2}); + auto t20 = make_shared(i64, Shape{}, vector{0}); + auto t21 = make_shared(t3, t19, t20); + auto t22 = make_shared(NodeVector{t8, t18, t18, t21}, 0); + auto t23 = make_shared(t0, t22, false); + auto t24 = make_shared(i64, Shape{}, vector{2}); + auto t25 = make_shared(t16, t24, "numpy"); + auto t26 = make_shared(t25); + auto t27 = make_shared(i32, Shape{}, vector{0}); + auto t28 = make_shared(t26, t27); + auto t29 = make_shared(i64, Shape{1}, vector{2}); + auto t30 = make_shared(i64, Shape{1}, 
vector{2}); + auto t31 = make_shared(NodeVector{t8, t28, t29, t28, t30, t21}, 0); + auto t32 = make_shared(t23, t31, false); + auto t33 = make_shared(i64, Shape{6}, vector{0, 1, 3, 2, 4, 5}); + auto t34 = make_shared(t32, t33); + auto t35 = make_shared(i64, Shape{1}, vector{-1}); + auto t36 = make_shared(i64, Shape{1}, vector{4}); + auto t37 = make_shared(t21, t36, "numpy"); + auto t38 = make_shared(NodeVector{t8, t35, t37}, 0); + auto t39 = make_shared(t34, t38, false); + auto t40 = make_shared(t1, t2, "numpy"); + auto t41 = make_shared(t40, element::i64); + auto t42 = make_shared(t6, t41, "numpy"); + auto t43 = make_shared(t42); + auto t44 = make_shared(i64, Shape{}, vector{0}); + auto t45 = make_shared(t43, t44); + auto t46 = make_shared(t1, element::i64); + auto t47 = make_shared(t46, t44); + auto t48 = make_shared(t2, element::i64); + auto t49 = make_shared(t48, t44); + auto t50 = make_shared(i64, Shape{1}, vector{-1}); + auto t51 = make_shared(NodeVector{t45, t47, t49, t28, t28, t50}, 0); + auto t52 = make_shared(t39, t51, false); + auto t53 = make_shared(i64, Shape{6}, vector{0, 1, 3, 2, 4, 5}); + auto t54 = make_shared(t52, t53); + auto t55 = make_shared(t1, t15, "numpy"); + auto t56 = make_shared(t55, element::i64); + auto t57 = make_shared(i64, Shape{}, vector{2}); + auto t58 = make_shared(t56, t57, "numpy"); + auto t59 = make_shared(t58); + auto t60 = make_shared(i32, Shape{}, vector{0}); + auto t61 = make_shared(t59, t60); + auto t62 = make_shared(t2, t15, "numpy"); + auto t63 = make_shared(t62, element::i64); + auto t64 = make_shared(i64, Shape{}, vector{2}); + auto t65 = make_shared(t63, t64, "numpy"); + auto t66 = make_shared(t65); + auto t67 = make_shared(t66, t60); + auto t68 = make_shared(NodeVector{t45, t61, t67, t37}, 0); + auto t69 = make_shared(t54, t68, false); + + // t0 = opset.Parameter({'shape': [-1, 576, 1024], 'element_type': 'f32'}, # -> f32[?,576,1024] + // t1 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] + // t2 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] + // t3 = opset.ShapeOf([t0], {'output_type': 'i64'}, # f32[?,576,1024] -> i64[3] + // t4 = opset.Constant(model, 4, # -> i64[](0) + // t5 = opset.Constant(model, 5, # -> i64[](0) + // t6 = opset.Gather([t3, t4, t5], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] + // t7 = opset.Constant(model, 7, # -> i64[1]([1]) + // t8 = opset.Reshape([t6, t7], {'special_zero': False}, # i64[], i64[1] -> i64[1] + // t9 = opset.Constant(model, 9, # -> i64[](1) + // t10 = opset.Constant(model, 10, # -> i64[](0) + // t11 = opset.Gather([t3, t9, t10], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] + // t12 = opset.Convert([t11], {'destination_type': 'f32'}, # i64[] -> f32[] + // t13 = opset.Constant(model, 13, # -> f32[](0.5) + // t14 = opset.Power([t12, t13], {'auto_broadcast': 'numpy'}, # f32[], f32[] -> f32[] + // t15 = opset.Convert([t14], {'destination_type': 'i32'}, # f32[] -> i32[] + // t16 = opset.Convert([t15], {'destination_type': 'i64'}, # i32[] -> i64[] + // t17 = opset.Constant(model, 17, # -> i32[](0) + // t18 = opset.Unsqueeze([t16, t17], {}, # i64[], i32[] -> i64[1] + // t19 = opset.Constant(model, 19, # -> i64[1]([2]) + // t20 = opset.Constant(model, 20, # -> i64[](0) + // t21 = opset.Gather([t3, t19, t20], {'batch_dims': 0}, # i64[3], i64[1], i64[] -> i64[1] + // t22 = opset.Concat([t8, t18, t18, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] + // t23 = opset.Reshape([t0, t22], {'special_zero': False}, # f32[?,576,1024], i64[4] -> 
f32[?,24,24,1024] + // t24 = opset.Constant(model, 24, # -> i64[](2) + // t25 = opset.Divide([t16, t24], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t26 = opset.Floor([t25], {}, # i64[] -> i64[] + // t27 = opset.Constant(model, 27, # -> i32[](0) + // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] + // t29 = opset.Constant(model, 29, # -> i64[1]([2]) + // t30 = opset.Constant(model, 30, # -> i64[1]([2]) + // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] + // t33 = opset.Constant(model, 33, + // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] + // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) + // t36 = opset.Constant(model, 36, # -> i64[1]([4]) + // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] + // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] + // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] + // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] + // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t43 = opset.Floor([t42], {}, # i64[] -> i64[] + // t44 = opset.Constant(model, 44, # -> i32[](0) + // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] + // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] + // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] + // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] + // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] + // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) + // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] + // t53 = opset.Constant(model, 53, + // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] + // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] + // t57 = opset.Constant(model, 57, # -> i64[](2) + // t58 = opset.Divide([t56, t57], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t59 = opset.Floor([t58], {}, # i64[] -> i64[] + // t60 = opset.Constant(model, 60, # -> i32[](0) + // t61 = opset.Unsqueeze([t59, t60], {}, # i64[], i32[] -> i64[1] + // t62 = opset.Multiply([t2, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t63 = opset.Convert([t62], {'destination_type': 'i64'}, # i32[] -> i64[] + // t64 = opset.Constant(model, 64, # -> i64[](2) + // t65 = opset.Divide([t63, t64], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t66 = opset.Floor([t65], {}, # i64[] -> i64[] + // t67 = opset.Unsqueeze([t66, t60], {}, # i64[], i32[] -> i64[1] + // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] + // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
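    // Shape walk-through of the graph above (illustrative, for the case
    // h_crop = w_crop = 2, i.e. four sub-crops, so t0 = [4, 576, 1024]):
    //   t23: [4, 24, 24, 1024]
    //   t32: [4, 12, 2, 12, 2, 1024]
    //   t34: [4, 12, 12, 2, 2, 1024]   (permute 0,1,3,2,4,5)
    //   t39: [4, 144, 4096]            (2x2 patch merge -> 4 * 1024 channels)
    //   t52: [1, 2, 2, 12, 12, 4096]
    //   t54: [1, 2, 12, 2, 12, 4096]   (permute 0,1,3,2,4,5)
    //   t69: [1, 24, 24, 4096]         (num_images, h_crop*12, w_crop*12, 4096)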
+ shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); + ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + model, "CPU" + ).create_infer_request(); + // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {4, 576, 1024}}); + // ov::Tensor h_crop = ov::Tensor{i32, {}}; + // h_crop.data()[0] = 2; + // hd_feature_transformer.set_input_tensor(1, h_crop); + // ov::Tensor w_crop = ov::Tensor{i32, {}}; + // w_crop.data()[0] = 2; + // hd_feature_transformer.set_input_tensor(2, w_crop); + // hd_feature_transformer.infer(); + return hd_feature_transformer; +} + +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop) { + ov::Shape shape = image_features.get_shape(); + OPENVINO_ASSERT(3 == shape.size()); + OPENVINO_ASSERT(1 == shape.at(0)); + OPENVINO_ASSERT(24 * 24 == shape.at(1)); + OPENVINO_ASSERT(1024 == shape.at(2)); + return {}; +} + +// image_features.resized_source: (num_crops+1, 24*24, 1024) +ov::Tensor hd_feature_transform(const EncodedImage& image_features) { + ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; + // global feature can be viewed as a special HD case with num_crops 1x1 + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1); + return {}; +} +} +} + class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: + ov::InferRequest m_hd_feature_transformer; + InputsEmbedderPhi3V( const VLMConfig& vlm_config, const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config - ) : IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0} {} + ): + IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()} {} virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images) override { std::string images_prompt; std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { @@ -1055,17 +1243,17 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // OPENVINO_ASSERT( // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), // "Unexpected embedding size" - // ); + //; // ov::Tensor special_tokens = m_tokenizer.encode( // m_vlm_config.im_start // + m_vlm_config.im_end // + m_vlm_config.slice_start // + m_vlm_config.slice_end - // ).input_ids; + //.input_ids; // OPENVINO_ASSERT( // 4 == special_tokens.get_shape().at(1), // "Every special token must be represented with a single int." 
- // ); + //; // int64_t im_start_id = special_tokens.data()[0]; // int64_t im_end_id = special_tokens.data()[1]; // int64_t slice_start_id = special_tokens.data()[2]; diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 07f4935f0c..6601ad3763 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -909,5 +909,7 @@ EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const Process const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); + // 2, 5, 3, 336, 336 2, 5, 576, 1024 + std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; return {m_vision_encoder.get_output_tensor(), image_size}; } From b34b14ef80b04196ab8b8bf44d36725e21f8d2b3 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 29 Nov 2024 14:12:33 +0400 Subject: [PATCH 04/28] actual data infer --- .../src/visual_language/inputs_embedder.cpp | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index e6fefe2d44..302b3980c8 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1009,6 +1009,21 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { namespace { namespace phi3_v { +// Reimplementation of python +// N, L, C = image_features.shape +// assert L == 24 * 24 and C == 1024 and N % (h_crop * w_crop) == 0 +// num_images = N // (h_crop * w_crop) +// H = int(L**0.5) +// print(L, H) +// image_features_hd = ( +// image_features.reshape(N, H, H, C) # N, 24, 24, 1024 +// .reshape(N, H // 2, 2, H // 2, 2, C) # N, 12, 2, 12, 2, 1024 +// .permute(0, 1, 3, 2, 4, 5) # N, 12, 12, 2, 2, 1024 +// .reshape(N, -1, 4 * C) # N, 144, 4096 +// .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1) # n_img, h_crop, w_crop, 12, 12, 4096 +// .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 +// .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 +// ) ov::InferRequest create_hd_feature_transformer() { using namespace ov; using namespace element; @@ -1159,31 +1174,38 @@ ov::InferRequest create_hd_feature_transformer() { ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( model, "CPU" ).create_infer_request(); - // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {4, 576, 1024}}); + // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; - // h_crop.data()[0] = 2; + // h_crop.data()[0] = 1; // hd_feature_transformer.set_input_tensor(1, h_crop); // ov::Tensor w_crop = ov::Tensor{i32, {}}; - // w_crop.data()[0] = 2; + // w_crop.data()[0] = 1; // hd_feature_transformer.set_input_tensor(2, w_crop); // hd_feature_transformer.infer(); + // std::cout << hd_feature_transformer.get_output_tensor().get_shape() << '\n'; // [1,24,24,4096] return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); 
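    // image_features is expected to be (N, 24*24, 1024), where N is 1 for the global
    // image or h_crop * w_crop for the sub-crops; the transformer produces
    // (num_images, h_crop * 12, w_crop * 12, 4096), matching the python reference above.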
OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(1 == shape.at(0)); OPENVINO_ASSERT(24 * 24 == shape.at(1)); OPENVINO_ASSERT(1024 == shape.at(2)); - return {}; + hd_feature_transformer.set_input_tensor(0, image_features); + ov::Tensor height{ov::element::i32, {}, &h_crop}; + hd_feature_transformer.set_input_tensor(1, height); + ov::Tensor width{ov::element::i32, {}, &w_crop}; + hd_feature_transformer.set_input_tensor(2, width); + hd_feature_transformer.infer(); + return hd_feature_transformer.get_output_tensor(); } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer) { ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1); + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); return {}; } } @@ -1207,7 +1229,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { From 2da865838efb0a54bbdf93f386b935147a2278d4 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 10 Dec 2024 14:48:47 +0400 Subject: [PATCH 05/28] align tokenizers --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 306dcd8dae..904046825b 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 306dcd8daec36bbc680c50c68de1e954f42b0ab8 +Subproject commit 904046825b6378bae74f16f302b40599aa88d5b3 From 27d913dbc7ff34746c6287083d8c089181cb7c5c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 11 Dec 2024 15:55:28 +0400 Subject: [PATCH 06/28] skip resize --- .../src/visual_language/inputs_embedder.cpp | 66 ++++++++++++++++++- .../src/visual_language/vision_encoder.cpp | 42 +++++++++--- src/cpp/src/visual_language/vlm_config.cpp | 4 ++ src/cpp/src/visual_language/vlm_config.hpp | 2 + 4 files changed, 102 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 305233aeb4..d6272f4185 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1289,11 +1289,71 @@ ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t return hd_feature_transformer.get_output_tensor(); } +// image_features_hd: (num_images, h_crop*12, w_crop*12, 4096) +// output: (num_images, (h_crop*12) * (w_crop*12+1), 4096) +ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vector& sub_GN) { + const ov::Shape& nhwc = image_features_hd.get_shape(); // [N, 12*h_crop, 12*w_crop, 4096] + const float* in = image_features_hd.data(); + ov::Tensor image_features_hd_new_line{ov::element::f32, {nhwc.at(0), nhwc.at(1) * 
(nhwc.at(2) + 1), nhwc.at(3)}}; + float* out = image_features_hd_new_line.data(); + for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { + for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { + for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { + std::copy_n( + in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) + ); + } + std::copy( + sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) + ); + } + } + // std::cout << "AAAAAAAAAAAAAAAAAAAAAa\n"; + // std::cout << out[12*4096-1]<<'\n'; + // std::cout << out[12*4096+1]<<'\n'; + // std::cout << out[12*4096+4095]<<'\n'; + // std::cout << out[12*4096+4096]<<'\n'; + // std::cout << out[13*2*4096]<<'\n'; + // std::cout << out[(13*2+12)*4096]<<'\n'; + // std::cout << "BBBBBBBBBBBBBBBBB\n"; + return image_features_hd_new_line; +} + // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer) { - ov::Tensor global_image_features{ov::element::f32, {1, 24*24, 1024}, image_features.resized_source.data()}; +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN) { + // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 1025] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 4090] << '\n'; + // std::cout << image_features.resized_source.data()[576*1024 + 80000] << '\n'; +// [5,3,336,336] [5,576,1024] +// 0.134461 +// -0.867309 +// -0.274503 +// 1.73786 +// 0.13117 +// [5,3,336,336] [5,576,1024] +// -1.01567 +// -0.291421 +// -0.260488 +// 0.743025 +// 1.4099 + const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); + ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); + constexpr size_t INPUT_IMAGE_SIZE = 336; + size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; + size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; + size_t num_crops = h_crop * w_crop; + + // NOTE: real num_crops is padded + // (num_crops, 24*24, 1024) return {}; } } @@ -1317,7 +1377,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer); + ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN); } ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { diff --git 
a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 492b4eca95..584490f632 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -658,13 +658,11 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { if (trans) { src = clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_h, new_w); - // std::cout << new_h << ' ' << new_w << '\n'; return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; - bilinear_resize(src, dst, new_w, new_h); - // std::cout << new_w << ' ' << new_h << '\n'; - // 672, 448 + // bilinear_resize(src, dst, new_w, new_h); + dst = src; // TODO: put resize back return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } @@ -674,9 +672,9 @@ ov::Tensor mean_scale(const ov::Tensor& uint8, const ProcessorConfig& config) { float* float_data = float_normalized.data(); OPENVINO_ASSERT(0 == uint8.get_size() % 3, "RGB"); for (size_t idx = 0; idx < uint8.get_size(); idx += 3) { - float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.norm_mean[0]) / config.norm_std[0]; - float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.norm_mean[1]) / config.norm_std[1]; - float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.norm_mean[2]) / config.norm_std[2]; + float_data[idx] = (float(uint_8_data[idx]) / 255.0f - config.image_mean[0]) / config.image_std[0]; + float_data[idx + 1] = (float(uint_8_data[idx + 1]) / 255.0f - config.image_mean[1]) / config.image_std[1]; + float_data[idx + 2] = (float(uint_8_data[idx + 2]) / 255.0f - config.image_mean[2]) / config.image_std[2]; } return float_normalized; } @@ -922,9 +920,35 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { // TODO: drop num_img_tokens const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); + // std::cout << pixel_values.data()[3*336*336+0] << '\n'; + // std::cout << pixel_values.data()[3*336*336+1] << '\n'; + // std::cout << pixel_values.data()[3*336*336+100] << '\n'; +// -1.79226 +// -1.74847 +// -1.14993 +// 0.645675 +// 0.660273 +// 1.09823 m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - // 2, 5, 3, 336, 336 2, 5, 576, 1024 - std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; + // std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; + // ov::Tensor out = m_vision_encoder.get_output_tensor(); + // std::cout << out.data()[576*1024 + 0] << '\n'; + // std::cout << out.data()[576*1024 + 1] << '\n'; + // std::cout << out.data()[576*1024 + 1025] << '\n'; + // std::cout << out.data()[576*1024 + 4090] << '\n'; + // std::cout << out.data()[576*1024 + 80000] << '\n'; +// [5,3,336,336] [5,576,1024] +// 0.134461 +// -0.867309 +// -0.274503 +// 1.73786 +// 0.13117 +// [5,3,336,336] [5,576,1024] +// -1.01567 +// -0.291421 +// -0.260488 +// 0.743025 +// 1.4099 return {m_vision_encoder.get_output_tensor(), image_size}; } diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp index c4022ab80e..da825b6fce 100644 --- 
a/src/cpp/src/visual_language/vlm_config.cpp +++ b/src/cpp/src/visual_language/vlm_config.cpp @@ -19,4 +19,8 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { // Setting llava_next specific config params read_json_param(parsed, "image_newline", image_newline); + // phi3_v + if (parsed.contains("sub_GN")) { + sub_GN = parsed.at("sub_GN").get>>>>().at(0).at(0).at(0); + } } diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp index c126d5495e..5e59f3605f 100644 --- a/src/cpp/src/visual_language/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -54,6 +54,8 @@ class VLMConfig { std::string image_context_token = ""; /// @brief A string token denoting end of image embeddings for InternVL2 model. std::string image_end_token = ""; + /// @brief phi3_v new line token embedding to separate images. + std::vector sub_GN = std::vector(4096, 0.0f); /// @brief Default constructor. VLMConfig() = default; From 66f75d55a4861811aede8233b805f90203e6b920 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 12 Dec 2024 14:52:11 +0400 Subject: [PATCH 07/28] vision --- .../src/visual_language/inputs_embedder.cpp | 84 +++++++++++++++++-- src/cpp/src/visual_language/vlm_config.cpp | 5 ++ src/cpp/src/visual_language/vlm_config.hpp | 1 + 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d6272f4185..ed6ca87dc1 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1277,7 +1277,6 @@ ov::InferRequest create_hd_feature_transformer() { ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); - OPENVINO_ASSERT(1 == shape.at(0)); OPENVINO_ASSERT(24 * 24 == shape.at(1)); OPENVINO_ASSERT(1024 == shape.at(2)); hd_feature_transformer.set_input_tensor(0, image_features); @@ -1323,8 +1322,22 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec return image_features_hd_new_line; } +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { + size_t first_l = first_1lf.get_shape().at(1); + constexpr size_t second_l = 1; + size_t third_l = third_1lf.get_shape().at(1); + size_t features = first_1lf.get_shape().at(2); + OPENVINO_ASSERT(second_f.size() == features); + ov::Tensor out_1lf{ov::element::f32, {1, first_l + second_l + third_l, features}}; + float* out = out_1lf.data(); + std::copy_n(first_1lf.data(), first_l * features, out); + std::copy(second_f.begin(), second_f.end(), out + first_l * features); + std::copy_n(third_1lf.data(), third_l * features, out + (first_l + second_l) * features); + return out_1lf; +} + // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; // std::cout << 
image_features.resized_source.data()[576*1024 + 1025] << '\n'; @@ -1346,7 +1359,7 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1354,7 +1367,46 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - return {}; + ov::Tensor sub_image_features{ov::element::f32, { + num_crops, + image_features_shape.at(1), + image_features_shape.at(2) + }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + // std::cout<()[0]<<'\n'; + // std::cout<()[1]<<'\n'; + // std::cout<()[4096]<<'\n'; + // std::cout<()[12*13*4096]<<'\n'; + // std::cout<()[12*13*4096+1]<<'\n'; +// 0.134461 +// -0.867309 +// 0.342726 +// -0.0916849 +// -2.65548 +// -1.01567 +// -0.291421 +// -0.993172 +// -1.0575 +// -0.299 + ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + // std::cout << sub_image_features_hd_newline.get_shape()<<'\n'; + // std::cout<()[0]<<'\n'; +// std::cout<()[1]<<'\n'; +// std::cout<()[4096]<<'\n'; +// std::cout<()[12*13*4096]<<'\n'; +// std::cout<()[12*13*4096+1]<<'\n'; +// 0.134461 +// -0.867309 +// 0.342726 +// 0.0147288 +// -1.87735 +// -1.01567 +// -0.291421 +// -0.993172 +// -1.03232 +// -0.183072 + return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } } } @@ -1362,6 +1414,7 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: ov::InferRequest m_hd_feature_transformer; + ov::InferRequest m_vision_projection; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1370,14 +1423,33 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config ): IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()} {} + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = 
phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN);
+            ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection);
+            std::cout << image_features_proj.data<float>()[0]<<'\n';
+            std::cout << image_features_proj.data<float>()[4096]<<'\n';
+            std::cout << image_features_proj.data<float>()[4097]<<'\n';
+            std::cout << image_features_proj.data<float>()[700*4096]<<'\n';
+            std::cout << image_features_proj.data<float>()[700*4097]<<'\n';
+            std::cout << image_features_proj.data<float>()[757*4096-1]<<'\n';
+            // 0.134461
+// 0.342726
+// 0.0631084
+// 0.434334
+// 0.650556
+// 0
+// -1.01567
+// -0.993172
+// -0.226981
+// -1.89643
+// -0.907323
+// 0
         }
         ov::Tensor inputs_embeds;
         // if (m_vlm_config.use_image_id) {
         //     images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end;
diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp
index da825b6fce..5af1b0d9b6 100644
--- a/src/cpp/src/visual_language/vlm_config.cpp
+++ b/src/cpp/src/visual_language/vlm_config.cpp
@@ -23,4 +23,9 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) {
     if (parsed.contains("sub_GN")) {
         sub_GN = parsed.at("sub_GN").get<std::vector<std::vector<std::vector<std::vector<float>>>>>().at(0).at(0).at(0);
     }
+    OPENVINO_ASSERT(sub_GN.size() == 4096);
+    if (parsed.contains("glb_GN")) {
+        glb_GN = parsed.at("glb_GN").get<std::vector<std::vector<std::vector<float>>>>().at(0).at(0);
+    }
+    OPENVINO_ASSERT(glb_GN.size() == 4096);
 }
diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp
index 5e59f3605f..08de40321e 100644
--- a/src/cpp/src/visual_language/vlm_config.hpp
+++ b/src/cpp/src/visual_language/vlm_config.hpp
@@ -56,6 +56,7 @@ class VLMConfig {
     std::string image_end_token = "";
     /// @brief phi3_v new line token embedding to separate images.
     std::vector<float> sub_GN = std::vector<float>(4096, 0.0f);
+    std::vector<float> glb_GN = std::vector<float>(4096, 0.0f);
     /// @brief Default constructor.
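For reference, a minimal standalone sketch of the JSON layout this parsing assumes (the [1][1][1][N] nesting for sub_GN and [1][1][N] for glb_GN is inferred from the .at(0) chains; the library choice and the values are illustrative, not taken from a real exported config):

    // Sketch only: assumes nlohmann::json, which the .get<...>() style above suggests.
    #include <nlohmann/json.hpp>
    #include <vector>

    int main() {
        nlohmann::json parsed = nlohmann::json::parse(R"({
            "sub_GN": [[[[0.1, 0.2, 0.3]]]],
            "glb_GN": [[[0.4, 0.5, 0.6]]]
        })");
        // Strip the leading singleton dimensions, as done in VLMConfig::VLMConfig above.
        auto sub_GN = parsed.at("sub_GN")
            .get<std::vector<std::vector<std::vector<std::vector<float>>>>>()
            .at(0).at(0).at(0);   // -> {0.1f, 0.2f, 0.3f}
        auto glb_GN = parsed.at("glb_GN")
            .get<std::vector<std::vector<std::vector<float>>>>()
            .at(0).at(0);         // -> {0.4f, 0.5f, 0.6f}
        return sub_GN.size() == 3 && glb_GN.size() == 3 ? 0 : 1;
    }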
VLMConfig() = default; From c7fc21c6e5566ae4064cddd703891ae595481642 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 12 Dec 2024 18:57:25 +0400 Subject: [PATCH 08/28] regex --- .../src/visual_language/inputs_embedder.cpp | 167 +++++++++--------- 1 file changed, 79 insertions(+), 88 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index ed6ca87dc1..c2591757ab 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -10,12 +10,7 @@ #include "openvino/opsets/opset13.hpp" #include "utils.hpp" - -namespace { - -constexpr size_t BATCH_SIZE = 1; - -} // namespace +#include namespace ov::genai { @@ -618,6 +613,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -1311,14 +1307,6 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec ); } } - // std::cout << "AAAAAAAAAAAAAAAAAAAAAa\n"; - // std::cout << out[12*4096-1]<<'\n'; - // std::cout << out[12*4096+1]<<'\n'; - // std::cout << out[12*4096+4095]<<'\n'; - // std::cout << out[12*4096+4096]<<'\n'; - // std::cout << out[13*2*4096]<<'\n'; - // std::cout << out[(13*2+12)*4096]<<'\n'; - // std::cout << "BBBBBBBBBBBBBBBBB\n"; return image_features_hd_new_line; } @@ -1338,23 +1326,6 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& // image_features.resized_source: (num_crops+1, 24*24, 1024) ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { - // std::cout << image_features.resized_source.data()[576*1024 + 0] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 1] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 1025] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 4090] << '\n'; - // std::cout << image_features.resized_source.data()[576*1024 + 80000] << '\n'; -// [5,3,336,336] [5,576,1024] -// 0.134461 -// -0.867309 -// -0.274503 -// 1.73786 -// 0.13117 -// [5,3,336,336] [5,576,1024] -// -1.01567 -// -0.291421 -// -0.260488 -// 0.743025 -// 1.4099 const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be viewed as a special HD case with num_crops 1x1 @@ -1373,41 +1344,74 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest image_features_shape.at(2) }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - // std::cout<()[0]<<'\n'; - // std::cout<()[1]<<'\n'; - // std::cout<()[4096]<<'\n'; - // std::cout<()[12*13*4096]<<'\n'; - // std::cout<()[12*13*4096+1]<<'\n'; -// 0.134461 -// -0.867309 -// 0.342726 -// -0.0916849 -// -2.65548 -// -1.01567 -// -0.291421 -// -0.993172 -// -1.0575 -// -0.299 ov::Tensor 
sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096]
-    // std::cout << sub_image_features_hd_newline.get_shape()<<'\n';
-    // std::cout<<sub_image_features_hd_newline.data<float>()[0]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[1]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[4096]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[12*13*4096]<<'\n';
-// std::cout<<sub_image_features_hd_newline.data<float>()[12*13*4096+1]<<'\n';
-// 0.134461
-// -0.867309
-// 0.342726
-// 0.0147288
-// -1.87735
-// -1.01567
-// -0.291421
-// -0.993172
-// -1.03232
-// -0.183072
     return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096]
 }
+
+std::vector<ov::Tensor> split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) {
+    constexpr int make_suffix_iterator = -1;
+    std::regex rgx{R"(<\|image_\d+\|>)"};
+    std::sregex_token_iterator iter{
+        text.begin(),
+        text.end(),
+        rgx,
+        make_suffix_iterator
+    };
+    std::vector<ov::Tensor> tokenized;
+    for ( ; iter != std::sregex_token_iterator{}; ++iter) {
+        if (iter->str().empty()) {
+            continue;
+        }
+        tokenized.push_back(tokenizer.encode(*iter).input_ids);
+    }
+    return tokenized;
+}
+
+// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") {
+//     ov::Tensor encoded_input_ids;
+//     if (is_chat_conversation) {
+//         // KV cache in model already contains prompts and answers from previous iterations.
+//         // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns
+//         // token_ids = {, ...}. So if tokenizer applies only to the new prompt,
+//         // will be inserted on every iteration.
+//         // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt
+//         // and takes only the difference between them.
+//         // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but
+//         // KV cache contains it. So we have to add it manually or get it by tokenization all chat history.
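For clarity, a small standalone sketch of what the regex split in split_tokenize does, shown on plain std::string chunks instead of tokenized tensors (the prompt text is made up for illustration):

    #include <iostream>
    #include <regex>
    #include <string>
    #include <vector>

    int main() {
        std::string text = "<|image_1|>\n<|image_2|>\nDescribe both images.";
        std::regex rgx{R"(<\|image_\d+\|>)"};
        std::vector<std::string> chunks;
        // Submatch index -1 selects the parts of the input that did not match,
        // i.e. the text between (and after) the <|image_N|> tags.
        for (std::sregex_token_iterator iter{text.begin(), text.end(), rgx, -1}, end; iter != end; ++iter) {
            if (!iter->str().empty()) {
                chunks.push_back(iter->str());
            }
        }
        std::cout << chunks.size() << '\n';  // 2: "\n" and "\nDescribe both images."
    }

split_tokenize applies the same split and then encodes each non-empty chunk separately, so the image tags themselves never reach the tokenizer.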
+// m_history.push_back({{"role", "user"}, {"content", prompt}}); +// constexpr bool add_generation_prompt = true; +// std::string new_templated_chat_history; +// try { +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); +// } catch (const std::exception& error) { +// // Use fallback chat template if it was not found in tokenizer_config.json +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); +// } +// auto start_tokenizer_time = std::chrono::steady_clock::now(); +// ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; +// if (m_is_cache_empty) { +// encoded_input_ids = new_chat_tokens; +// // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty +// m_is_cache_empty = false; +// } else { +// TokenizedInputs prev_chat_tokens = m_tokenizer.encode( +// m_templated_chat_history +// ); +// encoded_input_ids = utils::subtract_chat_tokenized_inputs( +// {new_chat_tokens}, prev_chat_tokens +// ).input_ids; +// } +// auto end_tokenizer_time = std::chrono::steady_clock::now(); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// m_templated_chat_history = std::move(new_templated_chat_history); +// } else { +// auto start_tokenizer_time = std::chrono::steady_clock::now(); +// encoded_input_ids = m_tokenizer.encode(prompt).input_ids; +// auto end_tokenizer_time = std::chrono::steady_clock::now(); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// } +// return encoded_input_ids; +// } } } @@ -1415,6 +1419,8 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: ov::InferRequest m_hd_feature_transformer; ov::InferRequest m_vision_projection; + // Used to insert <|image_i|>\n per image (not a slice). 
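+    // The tags are 1-based: the first image in the prompt is inserted as <|image_1|>.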
+ size_t m_image_id = 1; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1427,30 +1433,19 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { - std::string images_prompt; - std::vector embeds; + // TODO: perfmetrics + std::cout << prompt<<'\n'; + std::stringstream images_prompt; + std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - ov::Tensor image_features_proj = phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection); - std::cout << image_features_proj.data()[0]<<'\n'; - std::cout << image_features_proj.data()[4096]<<'\n'; - std::cout << image_features_proj.data()[4097]<<'\n'; - std::cout << image_features_proj.data()[700*4096]<<'\n'; - std::cout << image_features_proj.data()[700*4097]<<'\n'; - std::cout << image_features_proj.data()[757*4096-1]<<'\n'; - // 0.134461 -// 0.342726 -// 0.0631084 -// 0.434334 -// 0.650556 -// 0 -// -1.01567 -// -0.993172 -// -0.226981 -// -1.89643 -// -0.907323 -// 0 + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; } + images_prompt << prompt; + phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); + ov::Tensor inputs_embeds; // if (m_vlm_config.use_image_id) { // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; @@ -1549,10 +1544,6 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { IInputsEmbedder::finish_chat(); m_image_id = 0; } - -private: - // Used to insert <|image_i|>\n per image (not a slice). - size_t m_image_id; }; InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, From 83834a24a027f4243de3a670bdf8fd79c165fa08 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 8 Jan 2025 14:29:50 +0400 Subject: [PATCH 09/28] code style --- .../src/visual_language/inputs_embedder.cpp | 931 +++++++++--------- 1 file changed, 491 insertions(+), 440 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b1027c533b..4404ddfe27 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1,16 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include "openvino/genai/visual_language/perf_metrics.hpp" #include "visual_language/inputs_embedder.hpp" -#include "visual_language/clip.hpp" -#include "visual_language/vision_encoder.hpp" -#include "visual_language/embedding_model.hpp" -#include "openvino/opsets/opset13.hpp" +#include +#include "openvino/genai/visual_language/perf_metrics.hpp" +#include "openvino/opsets/opset13.hpp" #include "utils.hpp" -#include +#include "visual_language/clip.hpp" +#include "visual_language/embedding_model.hpp" +#include "visual_language/vision_encoder.hpp" namespace ov::genai { @@ -40,12 +40,15 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
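    // It is either re-prepended to the next input ids or accounted for when trimming the KV cache (see get_encoded_input_ids).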
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history - // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add + // best answer to history so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to + // keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) = 0; EmbeddingsModel get_embedding_model() const { return m_embedding; @@ -63,7 +66,10 @@ class InputsEmbedder::IInputsEmbedder { return m_kv_history_manager.num_tokens_to_remove_from_kv_cache; } - void update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { + void update_tokenized_history(const std::vector& encoded_result, + std::optional last_disappeared_token, + bool is_beam_search, + size_t last_answer_len) { if (is_beam_search) { m_kv_history_manager.trusted_history_length = m_tokenized_history.size(); m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len; @@ -72,7 +78,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -109,52 +115,48 @@ class InputsEmbedder::IInputsEmbedder { } protected: - IInputsEmbedder( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - m_vlm_config{vlm_config}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), - m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), - m_tokenizer{model_dir, device_config} { } - - IInputsEmbedder( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - m_vlm_config{vlm_config}, - m_vision_encoder( - get_model_weights_pair(models_map, "vision_embeddings").first, - get_model_weights_pair(models_map, "vision_embeddings").second, - config_dir_path, - m_vlm_config.model_type, - device, - device_config - ), - m_embedding( - get_model_weights_pair(models_map, "text_embeddings").first, - get_model_weights_pair(models_map, "text_embeddings").second, - m_vlm_config.scale_emb, - device, - device_config - ), - m_tokenizer(tokenizer) { } - - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + IInputsEmbedder(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : m_vlm_config{vlm_config}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), + m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), 
+ m_tokenizer{model_dir, device_config} {} + + IInputsEmbedder(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : m_vlm_config{vlm_config}, + m_vision_encoder(get_model_weights_pair(models_map, "vision_embeddings").first, + get_model_weights_pair(models_map, "vision_embeddings").second, + config_dir_path, + m_vlm_config.model_type, + device, + device_config), + m_embedding(get_model_weights_pair(models_map, "text_embeddings").first, + get_model_weights_pair(models_map, "text_embeddings").second, + m_vlm_config.scale_emb, + device, + device_config), + m_tokenizer(tokenizer) {} + + ov::Tensor get_encoded_input_ids(const std::string& prompt, + ov::genai::VLMPerfMetrics& metrics, + const std::string& chat_template_fallback = "") { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new + // prompt and takes only the difference between them. The chat history cannot be saved as already encoded + // tokens because generate call doesn't return token, but KV cache contains it. So we have to add it + // manually or get it by tokenization all chat history. 
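+            // Example: if the previous templated history tokenizes to [bos, a, b, c] and the new one
+            // (including the latest user turn) tokenizes to [bos, a, b, c, d, e], only the suffix [d, e]
+            // is sent to the model, because the KV cache already covers [bos, a, b, c].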
m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -162,19 +164,24 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json - new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); + new_templated_chat_history = + m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = + m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + TokenizedInputs prev_chat_tokens = + m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); // some symbols combinations can be encoded by the tokenizer in different ways - // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history - // so let's check it out, find the trusted part and use it in on the next step + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from + // the old history so let's check it out, find the trusted part and use it in on the next step size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, + m_tokenized_history, + stop_tokens); } if (m_tokenized_history.empty()) { @@ -182,81 +189,94 @@ class InputsEmbedder::IInputsEmbedder { } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) { // does_kv_cache_need_to_update will be true here if beam search is activated - // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly - // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager + // in beam search mode we want to remove all history about last model answer from kv cache and add the + // best answer directly if we have difference in model answer and decoded answer it anyway will be less + // then entire history, so let's use data from m_kv_history_manager if (m_kv_history_manager.does_kv_cache_need_to_update()) { trusted_history_length = m_kv_history_manager.trusted_history_length; } else { - m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length; - // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it - m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 
1 : 0; + m_kv_history_manager.num_tokens_to_remove_from_kv_cache = + m_tokenized_history.size() - trusted_history_length; + // if prev generation was finished because of max len was reached, kv cache is missed one last + // token, let's keep it + m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= + m_last_disappeared_token.has_value() ? 1 : 0; } ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}, new_chat_tokens.data() + trusted_history_length); encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(), - {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); + {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); new_tensor.copy_to(encoded_input_ids); } else { - encoded_input_ids = utils::subtract_chat_tokenized_inputs( - {new_chat_tokens}, prev_chat_tokens - ).input_ids; + encoded_input_ids = + utils::subtract_chat_tokenized_inputs({new_chat_tokens}, prev_chat_tokens).input_ids; if (m_last_disappeared_token.has_value()) - encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); + encoded_input_ids = + ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back( + PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); - std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), + new_chat_tokens.get_size(), + std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back( + PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(encoded_input_ids.data(), + encoded_input_ids.get_size(), + std::back_inserter(m_tokenized_history)); } return encoded_input_ids; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. + * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ /** - * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). - * - * @param images A vector of tensors representing the images. 
Each tensor can have a shape of either [NHWC] or [HWC]. - * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. - */ + * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). + * + * @param images A vector of tensors representing the images. Each tensor can have a shape of either [NHWC] or + * [HWC]. + * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. + */ std::vector to_single_image_tensors(const std::vector& images) { std::vector single_image_tensors; for (const auto& image : images) { ov::Tensor reshaped_image = image; ov::Shape image_shape = image.get_shape(); switch (image_shape.size()) { - case 3: - reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); - break; - case 4: break; - default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + case 3: + reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); + break; + case 4: + break; + default: + OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } ov::Shape reshaped_image_shape = reshaped_image.get_shape(); for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { ov::Tensor single_image{ reshaped_image.get_element_type(), {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, - reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) - }; + reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * + reshaped_image_shape.at(2) * reshaped_image_shape.at(3)}; single_image_tensors.push_back(std::move(single_image)); } } @@ -277,12 +297,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t m_image_id = 0; public: - InputsEmbedderMiniCPM( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { + InputsEmbedderMiniCPM(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) { auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config); ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model"); @@ -291,25 +310,26 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - InputsEmbedderMiniCPM( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { - m_resampler = utils::singleton_core().compile_model( - get_model_weights_pair(models_map, "resampler").first, - get_model_weights_pair(models_map, "resampler").second, - device, - device_config - ).create_infer_request(); - - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } + InputsEmbedderMiniCPM(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const 
std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { + m_resampler = utils::singleton_core() + .compile_model(get_model_weights_pair(models_map, "resampler").first, + get_model_weights_pair(models_map, "resampler").second, + device, + device_config) + .create_infer_request(); - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; @@ -347,24 +367,18 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - "Unexpected embedding size" - ); + OPENVINO_ASSERT(m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), "Unexpected embedding size"); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; + ov::Tensor special_tokens = + m_tokenizer + .encode(m_vlm_config.im_start + m_vlm_config.im_end + m_vlm_config.slice_start + m_vlm_config.slice_end) + .input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." 
- ); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + OPENVINO_ASSERT(4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int."); int64_t im_start_id = special_tokens.data()[0]; int64_t im_end_id = special_tokens.data()[1]; int64_t slice_start_id = special_tokens.data()[2]; @@ -376,12 +390,15 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { int64_t* end = ids + encoded_input_size; float* inputs_embeds_data = inputs_embeds.data(); for (const EncodedImage& encoded_image : embeds) { - const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + const ov::Tensor& resampled_source = + resample(encoded_image.resized_source, {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, end, im_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(emb, + resampled_source.get_size(), + inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { size_t token_idx = 0; @@ -390,12 +407,17 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + ov::Tensor encoded_view{ + ov::element::f32, + {1, d2, d3}, + encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, slice_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(vision_embed_tensor_i_j.data(), + vision_embed_tensor_i_j.get_size(), + inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; } } @@ -425,11 +447,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { return height_width.height * height_width.width; }); - adjust_pos_cache( - target_sizes, - m_vlm_config.hidden_size, - m_pos_embed_cache - ); + adjust_pos_cache(target_sizes, m_vlm_config.hidden_size, m_pos_embed_cache); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); float* mask_data = key_padding_mask.data(); @@ -444,11 +462,9 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t target_w = target_sizes.at(i).width; for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { - std::copy_n( - cache_data + (h_idx * _d1 + w_idx) * embed_len, - embed_len, - pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len - ); + std::copy_n(cache_data + (h_idx * _d1 + w_idx) * embed_len, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * 
embed_len); } } for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { @@ -457,8 +473,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f); std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - patch_len[i], 1.0f); } - m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] - m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] + m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] m_resampler.infer(); return m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] @@ -478,12 +494,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t j = 0; j < res_d_1; ++j) { size_t k = 0; for (; k < first.get_shape().at(2); ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = + first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; } for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] - = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = + second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; } } } @@ -529,16 +545,14 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Shape grid_shape = grid.get_shape(); float* grid_data = grid.data(); ov::Shape plane_shape{grid_shape.at(1), grid_shape.at(2)}; - ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ - ov::element::f32, - plane_shape, - grid_data - }); // (H, W, D/2) - ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ - ov::element::f32, - plane_shape, - grid_data + plane_shape.at(0) * plane_shape.at(1) - }); // (H, W, D/2) + ov::Tensor emb_h = + get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, + ov::Tensor{ov::element::f32, plane_shape, grid_data}); // (H, W, D/2) + ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new( + embed_dim / 2, + ov::Tensor{ov::element::f32, + plane_shape, + grid_data + plane_shape.at(0) * plane_shape.at(1)}); // (H, W, D/2) return concatenate_last_dim(emb_h, emb_w); } @@ -560,17 +574,19 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); } - void adjust_pos_cache( - const std::vector& target_sizes, - size_t hidden_size, - ov::Tensor& pos_embed_cache - ) { - size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.height < right.height; - })->height; - size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { - return left.width < right.width; - })->width; + void adjust_pos_cache(const std::vector& target_sizes, size_t hidden_size, ov::Tensor& pos_embed_cache) { + size_t max_h = std::max_element(target_sizes.begin(), + target_sizes.end(), + [](const ImageSize& left, const ImageSize& right) { + return left.height < right.height; + }) + ->height; + 
size_t max_w = std::max_element(target_sizes.begin(), + target_sizes.end(), + [](const ImageSize& left, const ImageSize& right) { + return left.width < right.width; + }) + ->width; size_t allocated_height, allocated_width; if (pos_embed_cache) { const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); @@ -582,36 +598,37 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { if (max_h > allocated_height || max_w > allocated_width) { allocated_height = std::max(max_h, allocated_height); allocated_width = std::max(max_w, allocated_width); - pos_embed_cache = get_2d_sincos_pos_embed( - hidden_size, {allocated_height, allocated_width} - ); + pos_embed_cache = get_2d_sincos_pos_embed(hidden_size, {allocated_height, allocated_width}); } } }; class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderLLaVA( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderLLaVA( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVA(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderLLaVA(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::string chat_template_fallback = + "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " + "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " + "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -632,21 +649,21 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = + m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); 
OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_llava( - const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_token_id - ) { + ov::Tensor merge_text_and_image_embeddings_llava(const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_token_id) { auto text_embeds_shape = text_embeds.get_shape(); size_t text_embeds_seq_length = text_embeds_shape[1]; size_t hidden_size = text_embeds_shape[2]; @@ -661,22 +678,18 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } } auto num_images = image_embeds.size(); - OPENVINO_ASSERT( - num_image_tokens == num_images, - "Number of image tokens in input_ids different from num_images." - ); + OPENVINO_ASSERT(num_image_tokens == num_images, + "Number of image tokens in input_ids different from num_images."); size_t total_image_seq_length = 0; for (const auto& single_image_embeds : image_embeds) { - OPENVINO_ASSERT( - text_embeds_shape[2] == single_image_embeds.get_shape().at(2), - "Incompatible shapes between text_embeds and image_embeds" - ); + OPENVINO_ASSERT(text_embeds_shape[2] == single_image_embeds.get_shape().at(2), + "Incompatible shapes between text_embeds and image_embeds"); total_image_seq_length += single_image_embeds.get_shape().at(1); } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -687,15 +700,11 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const float* image_embeds_data = image_embeds[image_idx].data(); size_t image_seq_length = image_embeds[image_idx].get_shape()[1]; - std::copy_n(image_embeds_data, - image_seq_length * hidden_size, - merged_data + merged_idx * hidden_size); + std::copy_n(image_embeds_data, image_seq_length * hidden_size, merged_data + merged_idx * hidden_size); merged_idx += image_seq_length; image_idx++; } else { - std::copy_n(text_embeds_data + s * hidden_size, - hidden_size, - merged_data + merged_idx * hidden_size); + std::copy_n(text_embeds_data + s * hidden_size, hidden_size, merged_data + merged_idx * hidden_size); merged_idx++; } } @@ -705,33 +714,36 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { public: - InputsEmbedderLLaVANext( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderLLaVANext( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& 
tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVANext(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderLLaVANext(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::string chat_template_fallback = + "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " + "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " + "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -744,9 +756,10 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); } - ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] + ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] - ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); + ov::Tensor packed_features = + pack_image_features_llava_next(encoded_image, original_image_size, image_newline); image_embeds.push_back(std::move(packed_features)); formatted_prompt += image_token + "\n"; @@ -760,29 +773,29 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = + m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] 
+= + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } private: /** - * @brief Processes base and patches image features extracted from encoded image. - * Used in getting inputs embeds for llava_next models. - * - * @param encoded_image An encoded image retrieved from vision encoder - * @param original_image_size A size of the original image - * @param image_newline An image newline tensor with a shape (embed_dim) - * @return A tensor with a shape (1, new_seq_len, embed_dim) - */ - ov::Tensor pack_image_features_llava_next( - const EncodedImage& encoded_image, - const ImageSize& original_image_size, - const ov::Tensor& image_newline - ) { + * @brief Processes base and patches image features extracted from encoded image. + * Used in getting inputs embeds for llava_next models. + * + * @param encoded_image An encoded image retrieved from vision encoder + * @param original_image_size A size of the original image + * @param image_newline An image newline tensor with a shape (embed_dim) + * @return A tensor with a shape (1, new_seq_len, embed_dim) + */ + ov::Tensor pack_image_features_llava_next(const EncodedImage& encoded_image, + const ImageSize& original_image_size, + const ov::Tensor& image_newline) { auto image_feature = encoded_image.resized_source; auto image_feature_shape = image_feature.get_shape(); size_t num_patches = image_feature_shape[0]; @@ -800,11 +813,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(src_data, src_data + patch_seq_len * embed_dim, dst_data); // Extract other grid patches - ov::Tensor patches_image_feature(image_feature.get_element_type(), {num_patches - 1, patch_seq_len, embed_dim}); + ov::Tensor patches_image_feature(image_feature.get_element_type(), + {num_patches - 1, patch_seq_len, embed_dim}); dst_data = patches_image_feature.data(); std::copy(src_data + patch_seq_len * embed_dim, - src_data + num_patches * patch_seq_len * embed_dim, - dst_data); + src_data + num_patches * patch_seq_len * embed_dim, + dst_data); // Process grid patches image feature size_t height = encoded_image.resized_source_size.height; @@ -812,7 +826,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t num_patch_height = encoded_image.patches_grid.first; size_t num_patch_width = encoded_image.patches_grid.second; - ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width); + ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, + num_patch_height, + num_patch_width, + height, + width); ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size); @@ -820,7 +838,8 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor processed_image_feature = flatten_and_transpose(image_feature_with_newline); - // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, embed_dim]) + // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, + // embed_dim]) auto base_shape = base_image_feature.get_shape(); auto processed_shape = processed_image_feature.get_shape(); @@ -832,32 +851,30 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { 
std::copy(base_data, base_data + base_shape[1] * embed_dim, result.data()); // Copy processed image feature data std::copy(processed_data, - processed_data + processed_shape[0] * embed_dim, - result.data() + base_shape[1] * embed_dim); + processed_data + processed_shape[0] * embed_dim, + result.data() + base_shape[1] * embed_dim); return result; } else { // If there is only one patch, return the original (base) image feature concatenated with image_newline ov::Tensor result(image_feature.get_element_type(), {1, patch_seq_len + 1, embed_dim}); // Copy base image feature data std::copy(image_feature_data + embed_dim, - image_feature_data + patch_seq_len * embed_dim, - result.data()); + image_feature_data + patch_seq_len * embed_dim, + result.data()); // Append image_newline data - std::copy(newline_data, - newline_data + embed_dim, - result.data() + patch_seq_len * embed_dim); + std::copy(newline_data, newline_data + embed_dim, result.data() + patch_seq_len * embed_dim); return result; } } /** - * @brief Adds image newline tensor to patches image feature tensor. - * Used for packing image features of llava_next models. - * - * @param image_feature A tensor with a shape (embed_dim, height, width) - * @param image_newline A tensor with a shape (embed_dim) - * @return A tensor with a shape (embed_dim, height, width + 1) - */ + * @brief Adds image newline tensor to patches image feature tensor. + * Used for packing image features of llava_next models. + * + * @param image_feature A tensor with a shape (embed_dim, height, width) + * @param image_newline A tensor with a shape (embed_dim) + * @return A tensor with a shape (embed_dim, height, width + 1) + */ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor& image_newline) { auto shape = image_feature.get_shape(); @@ -867,7 +884,8 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t height = shape[1]; size_t width = shape[2]; - OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, "image_newline dimension must match embed_dim of image_feature"); + OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, + "image_newline dimension must match embed_dim of image_feature"); const float* image_feature_data = image_feature.data(); const float* newline_data = image_newline.data(); @@ -878,11 +896,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (size_t h = 0; h < height; ++h) { // Copy original image feature data - std::copy( - image_feature_data + (e * height * width + h * width), - image_feature_data + (e * height * width + (h + 1) * width), - feature_with_newline_data + (e * height * (width + 1) + h * (width + 1)) - ); + std::copy(image_feature_data + (e * height * width + h * width), + image_feature_data + (e * height * width + (h + 1) * width), + feature_with_newline_data + (e * height * (width + 1) + h * (width + 1))); // Add image newline feature_with_newline_data[e * height * (width + 1) + h * (width + 1) + width] = newline_data[e]; } @@ -892,12 +908,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } /** - * @brief Flattens and transposes tensor. - * Used for packing image features of llava_next models. - * - * @param tensor A tensor with a shape (embed_dim, height, width) - * @return A tensor with a shape (height * width, embed_dim) - */ + * @brief Flattens and transposes tensor. + * Used for packing image features of llava_next models. 
+ * + * @param tensor A tensor with a shape (embed_dim, height, width) + * @return A tensor with a shape (height * width, embed_dim) + */ ov::Tensor flatten_and_transpose(const ov::Tensor& tensor) { auto shape = tensor.get_shape(); OPENVINO_ASSERT(shape.size() == 3, "Flattening tensor must have 3 dimensions"); @@ -921,7 +937,6 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return flatten_feature; } - ov::Tensor reshape_and_rearrange_image_feature(const ov::Tensor& image_feature, int num_patch_height, int num_patch_width, @@ -934,15 +949,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t patch_seq_len = shape[1]; size_t embed_dim = shape[2]; - OPENVINO_ASSERT( - num_patches == num_patch_height * num_patch_width, - "Number of patches does not match the specified grid size" - ); + OPENVINO_ASSERT(num_patches == num_patch_height * num_patch_width, + "Number of patches does not match the specified grid size"); - OPENVINO_ASSERT( - patch_seq_len == height * width, - "Patch sequence length does not match the specified height and width" - ); + OPENVINO_ASSERT(patch_seq_len == height * width, + "Patch sequence length does not match the specified height and width"); // Reshape tensor data and permute dimensions // [num_patches, patch_seq_len, embed_dim] -> [embed_dim, num_patch_height, height, num_patch_width, width] @@ -965,20 +976,19 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor result(image_feature.get_element_type(), {static_cast(embed_dim), static_cast(num_patch_height * height), - static_cast(num_patch_width * width)} - ); + static_cast(num_patch_width * width)}); std::copy(reshaped_data.begin(), reshaped_data.end(), result.data()); return result; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. 
+ * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ ov::Tensor unpad_image(const ov::Tensor& tensor, const ImageSize& original_size) { size_t original_height = original_size.height; size_t original_width = original_size.width; @@ -1003,9 +1013,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (int h = 0; h < unpadded_height_dim; ++h) { std::copy( tensor.data() + (e * current_height * current_width + (padding + h) * current_width), - tensor.data() + (e * current_height * current_width + (padding + h) * current_width + current_width), - unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width) - ); + tensor.data() + + (e * current_height * current_width + (padding + h) * current_width + current_width), + unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width)); } } } else { @@ -1017,11 +1027,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (int h = 0; h < current_height; ++h) { - std::copy( - tensor.data() + (e * current_height * current_width + h * current_width + padding), - tensor.data() + (e * current_height * current_width + h * current_width + padding + unpadded_width_dim), - unpadded_tensor.data() + (e * current_height * unpadded_width_dim + h * unpadded_width_dim) - ); + std::copy(tensor.data() + (e * current_height * current_width + h * current_width + padding), + tensor.data() + (e * current_height * current_width + h * current_width + padding + + unpadded_width_dim), + unpadded_tensor.data() + + (e * current_height * unpadded_width_dim + h * unpadded_width_dim)); } } } @@ -1032,40 +1042,40 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderInternVLChat( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, model_dir, device, device_config) { } - - InputsEmbedderInternVLChat( - const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) : - IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderInternVLChat(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} + + InputsEmbedderInternVLChat(const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string 
image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1083,21 +1093,22 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_context_token = m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_context_token = + m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - int64_t image_context_token_id = encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += + ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + int64_t image_context_token_id = + encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; return merge_text_and_image_embeddings_internvl(input_ids, text_embeds, image_embeds, image_context_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_internvl( - const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_context_token_id - ) { + ov::Tensor merge_text_and_image_embeddings_internvl(const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_context_token_id) { auto text_embeds_shape = text_embeds.get_shape(); size_t batch_size = text_embeds_shape.at(0); size_t seq_len = text_embeds_shape.at(1); @@ -1131,12 +1142,14 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { if (image_context_tokens_mask[flat_idx]) { const ov::Tensor& single_image_embeds = image_embeds[image_idx]; - const size_t num_all_image_tokens = single_image_embeds.get_shape().at(0) * single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens + const size_t num_all_image_tokens = + single_image_embeds.get_shape().at(0) * + single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens const float* image_embeds_data = single_image_embeds.data(); std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { @@ -1277,30 +1290,23 @@ ov::InferRequest create_hd_feature_transformer() { // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] // t29 = opset.Constant(model, 29, # -> i64[1]([2]) // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = 
opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] - // t33 = opset.Constant(model, 33, - // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] - // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) - // t36 = opset.Constant(model, 36, # -> i64[1]([4]) - // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] - // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] - // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] - // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] - // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t43 = opset.Floor([t42], {}, # i64[] -> i64[] - // t44 = opset.Constant(model, 44, # -> i32[](0) - // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] - // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] - // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] - // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] - // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] - // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) - // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] - // t53 = opset.Constant(model, 53, - // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] 
+ // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] + // -> i64[6] t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> + // f32[?,12,2,12,2,1024] t33 = opset.Constant(model, 33, t34 = opset.Transpose([t32, t33], {}, # + // f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] t35 = opset.Constant(model, 35, # -> i64[1]([-1]) t36 = + // opset.Constant(model, 36, # -> i64[1]([4]) t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # + // i64[1], i64[1] -> i64[1] t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] t39 + // = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] t40 = + // opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] t41 = opset.Convert([t40], + // {'destination_type': 'i64'}, # i32[] -> i64[] t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', + // 'm_pythondiv': True}, # i64[], i64[] -> i64[] t43 = opset.Floor([t42], {}, # i64[] -> i64[] t44 = + // opset.Constant(model, 44, # -> i32[](0) t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] t46 = + // opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] t47 = opset.Unsqueeze([t46, t44], {}, # + // i64[], i32[] -> i64[1] t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] t49 = + // opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] t50 = opset.Constant(model, 50, # -> i64[1]([-1]) t51 + // = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> + // i64[6] t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] t53 + // = opset.Constant(model, 53, t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] // t57 = opset.Constant(model, 57, # -> i64[](2) @@ -1317,9 +1323,8 @@ ov::InferRequest create_hd_feature_transformer() { // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
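    // Illustrative shape walk-through of the graph traced above (a reader-facing summary,
    // not part of the patch; num_crops = h_crop * w_crop per image):
    //   image_features            f32[N, 24*24, 1024]
    //   reshape                   f32[N, 12, 2, 12, 2, 1024]
    //   transpose                 f32[N, 12, 12, 2, 2, 1024]
    //   flatten last three dims   f32[N, 12*12, 4096]    (each 2x2 block of patches -> one vector)
    //   regroup crops             f32[num_images, h_crop*12, w_crop*12, 4096]
    // This is the transformation that reshape_hd_patches_2x2merge() below runs through
    // this compiled model.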
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( - model, "CPU" - ).create_infer_request(); + ov::InferRequest hd_feature_transformer = + utils::singleton_core().compile_model(model, "CPU").create_infer_request(); // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; // h_crop.data()[0] = 1; @@ -1332,7 +1337,10 @@ ov::InferRequest create_hd_feature_transformer() { return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, + size_t h_crop, + size_t w_crop, + InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(24 * 24 == shape.at(1)); @@ -1356,23 +1364,24 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { - std::copy_n( - in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), - nhwc.at(3), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) - ); + std::copy_n(in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3)); } - std::copy( - sub_GN.begin(), - sub_GN.end(), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) - ); + std::copy(sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3)); } } return image_features_hd_new_line; } -ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, + const std::vector& second_f, + const ov::Tensor& third_1lf) { size_t first_l = first_1lf.get_shape().at(1); constexpr size_t second_l = 1; size_t third_l = third_1lf.get_shape().at(1); @@ -1387,12 +1396,20 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, + InferRequest& hd_feature_transformer, + const std::vector& sub_GN, + const std::vector& glb_GN, + ov::InferRequest& vision_projection) { const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); - ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; + ov::Tensor global_image_features{ov::element::f32, + {1, image_features_shape.at(1), image_features_shape.at(2)}, + image_features.resized_source.data()}; // 
global feature can be viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] + ov::Tensor global_image_features_hd = + reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = + add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1400,27 +1417,23 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - ov::Tensor sub_image_features{ov::element::f32, { - num_crops, - image_features_shape.at(1), - image_features_shape.at(2) - }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; - ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + ov::Tensor sub_image_features{ + ov::element::f32, + {num_crops, image_features_shape.at(1), image_features_shape.at(2)}, + image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = + reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + ov::Tensor sub_image_features_hd_newline = + add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { constexpr int make_suffix_iterator = -1; std::regex rgx{R"(<\|image_\d+\|>)"}; - std::sregex_token_iterator iter{ - text.begin(), - text.end(), - rgx, - make_suffix_iterator - }; + std::sregex_token_iterator iter{text.begin(), text.end(), rgx, make_suffix_iterator}; std::vector tokenized; - for ( ; iter != std::sregex_token_iterator{}; ++iter) { + for (; iter != std::sregex_token_iterator{}; ++iter) { if (iter->str().empty()) { continue; } @@ -1429,16 +1442,19 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token return tokenized; } -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { +// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, +// ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { // ov::Tensor encoded_input_ids; // if (is_chat_conversation) { // // KV cache in model already contains prompts and answers from previous iterations. // // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // // will be inserted on every iteration. 
-// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt +// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new +// prompt // // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but +// // The chat history cannot be saved as already encoded tokens because generate call doesn't return +// token, but // // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. // m_history.push_back({{"role", "user"}, {"content", prompt}}); // constexpr bool add_generation_prompt = true; @@ -1447,7 +1463,8 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // } catch (const std::exception& error) { // // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, +// chat_template_fallback); // } // auto start_tokenizer_time = std::chrono::steady_clock::now(); // ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; @@ -1464,18 +1481,19 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // ).input_ids; // } // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// m_templated_chat_history = std::move(new_templated_chat_history); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - +// start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); // } else { // auto start_tokenizer_time = std::chrono::steady_clock::now(); // encoded_input_ids = m_tokenizer.encode(prompt).input_ids; // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - +// start_tokenizer_time)); // } // return encoded_input_ids; // } -} -} +} // namespace phi3_v +} // namespace class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: @@ -1484,24 +1502,31 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // Used to insert <|image_i|>\n per image (not a slice). 
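    // For example (illustrative): with two input images the text prompt is prefixed with
    // "<|image_<id>|>\n<|image_<id+1>|>\n"; the user prompt itself must not contain
    // <|image_i|> tags, which is enforced by the assert in get_inputs_embeds() below.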
size_t m_image_id = 1; - InputsEmbedderPhi3V( - const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config - ): - IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} - - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderPhi3V(const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) + : IInputsEmbedder(vlm_config, model_dir, device, device_config), + m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core() + .compile_model(model_dir / "openvino_vision_projection_model.xml", device) + .create_infer_request()} {} + + ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) override { // TODO: perfmetrics - std::cout << prompt<<'\n'; + OPENVINO_ASSERT(!std::regex_search(prompt, std::regex{R"(<\|image_\d+\|>)"}), "<|image_i|> can't be used in the prompt because it's reserved for images"); std::stringstream images_prompt; std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, + m_hd_feature_transformer, + m_vlm_config.sub_GN, + m_vlm_config.glb_GN, + m_vision_projection)); images_prompt << "<|image_" << m_image_id << "|>\n"; ++m_image_id; } @@ -1564,27 +1589,26 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // int64_t* end = ids + encoded_input_size; // float* inputs_embeds_data = inputs_embeds.data(); // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); - // float* emb = resampled_source.data(); - // ids = std::find(ids, end, im_start_id); - // OPENVINO_ASSERT(end != ids); + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, + // {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, + // end, im_start_id); OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // if (encoded_image.slices) { + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * + // m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { // size_t token_idx = 0; // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); // for (size_t i = 0; i < slices_shape.at(0); ++i) { // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { // size_t d2 = slices_shape.at(2); // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - 
// const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); - // ids = std::find(ids, end, slice_start_id); - // OPENVINO_ASSERT(end != ids); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + + // (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = + // resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, + // slice_start_id); OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), + // inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += + // m_vlm_config.query_num; // } // } // } @@ -1623,7 +1647,8 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } else if (vlm_config.model_type == VLMModelType::PHI3_V) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { - OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW( + "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } } @@ -1634,19 +1659,42 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::string& device, const ov::AnyMap device_config) { if (vlm_config.model_type == VLMModelType::MINICPM) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA_NEXT) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { - m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); + m_impl = std::make_shared(vlm_config, + models_map, + tokenizer, + config_dir_path, + device, + device_config); } else { - OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW( + "Unsupported model type in VLM InputsEmbedder class. 
Please, create feature request on new model support"); } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, + const std::vector& images, + ov::genai::VLMPerfMetrics& metrics) { return m_impl->get_inputs_embeds(prompt, images, metrics); } @@ -1658,7 +1706,10 @@ std::vector InputsEmbedder::get_tokenized_history() const { return m_impl->get_tokenized_history(); } -void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { +void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, + std::optional last_disappeared_token, + bool is_beam_search, + size_t last_answer_len) { return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len); } @@ -1682,4 +1733,4 @@ void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } -} // namespace ov::genai +} // namespace ov::genai From 524982f4715f99f4a882761945a047937726d26f Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 8 Jan 2025 14:56:30 +0400 Subject: [PATCH 10/28] Revert "code style" This reverts commit 83834a24a027f4243de3a670bdf8fd79c165fa08. --- .../src/visual_language/inputs_embedder.cpp | 931 +++++++++--------- 1 file changed, 440 insertions(+), 491 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 4404ddfe27..b1027c533b 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1,16 +1,16 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 +#include "openvino/genai/visual_language/perf_metrics.hpp" #include "visual_language/inputs_embedder.hpp" -#include - -#include "openvino/genai/visual_language/perf_metrics.hpp" -#include "openvino/opsets/opset13.hpp" -#include "utils.hpp" #include "visual_language/clip.hpp" -#include "visual_language/embedding_model.hpp" #include "visual_language/vision_encoder.hpp" +#include "visual_language/embedding_model.hpp" +#include "openvino/opsets/opset13.hpp" + +#include "utils.hpp" +#include namespace ov::genai { @@ -40,15 +40,12 @@ class InputsEmbedder::IInputsEmbedder { // Tail of previous output for LM in chat mode is missing in KV cache. 
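    // Illustrative example: if the previous generate() call stopped because the length
    // limit was reached, the last sampled token was returned to the caller but never fed
    // back to the model, so it is remembered here and pushed to the front of the next
    // request's input ids.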
std::optional m_last_disappeared_token = std::nullopt; // If sequence contains some symbols, which could be ambiguous encoded by tokenizer, we need to trim kv cache - // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add - // best answer to history so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to - // keep in history + // If we use beam search sampling with chat mode we need to remove last answer of the model from kv cache and add best answer to history + // so, let's keep info about amount of tokens to trim from kv cache and amount of tokens to keep in history ov::genai::utils::HistoryRemoveManager m_kv_history_manager = {0, 0}; public: - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) = 0; EmbeddingsModel get_embedding_model() const { return m_embedding; @@ -66,10 +63,7 @@ class InputsEmbedder::IInputsEmbedder { return m_kv_history_manager.num_tokens_to_remove_from_kv_cache; } - void update_tokenized_history(const std::vector& encoded_result, - std::optional last_disappeared_token, - bool is_beam_search, - size_t last_answer_len) { + void update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { if (is_beam_search) { m_kv_history_manager.trusted_history_length = m_tokenized_history.size(); m_kv_history_manager.num_tokens_to_remove_from_kv_cache = last_answer_len; @@ -78,7 +72,7 @@ class InputsEmbedder::IInputsEmbedder { } m_last_disappeared_token = last_disappeared_token; - + std::copy(encoded_result.begin(), encoded_result.end(), std::back_inserter(m_tokenized_history)); } @@ -115,48 +109,52 @@ class InputsEmbedder::IInputsEmbedder { } protected: - IInputsEmbedder(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : m_vlm_config{vlm_config}, - m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), - m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), - m_tokenizer{model_dir, device_config} {} - - IInputsEmbedder(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : m_vlm_config{vlm_config}, - m_vision_encoder(get_model_weights_pair(models_map, "vision_embeddings").first, - get_model_weights_pair(models_map, "vision_embeddings").second, - config_dir_path, - m_vlm_config.model_type, - device, - device_config), - m_embedding(get_model_weights_pair(models_map, "text_embeddings").first, - get_model_weights_pair(models_map, "text_embeddings").second, - m_vlm_config.scale_emb, - device, - device_config), - m_tokenizer(tokenizer) {} - - ov::Tensor get_encoded_input_ids(const std::string& prompt, - ov::genai::VLMPerfMetrics& metrics, - const std::string& chat_template_fallback = "") { + IInputsEmbedder( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + m_vlm_config{vlm_config}, + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config), + m_embedding(model_dir, m_vlm_config.scale_emb, device, device_config), + 
m_tokenizer{model_dir, device_config} { } + + IInputsEmbedder( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + m_vlm_config{vlm_config}, + m_vision_encoder( + get_model_weights_pair(models_map, "vision_embeddings").first, + get_model_weights_pair(models_map, "vision_embeddings").second, + config_dir_path, + m_vlm_config.model_type, + device, + device_config + ), + m_embedding( + get_model_weights_pair(models_map, "text_embeddings").first, + get_model_weights_pair(models_map, "text_embeddings").second, + m_vlm_config.scale_emb, + device, + device_config + ), + m_tokenizer(tokenizer) { } + + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new - // prompt and takes only the difference between them. The chat history cannot be saved as already encoded - // tokens because generate call doesn't return token, but KV cache contains it. So we have to add it - // manually or get it by tokenization all chat history. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
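            // Worked example with made-up token ids: if the previous templated history encodes
            // to {<bos>, 5, 7, 9} and the history with the new user prompt appended encodes to
            // {<bos>, 5, 7, 9, 11, 13}, only the suffix {11, 13} needs to be embedded and sent
            // to the model, because the KV cache already holds the common prefix.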
m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -164,24 +162,19 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); } catch (const std::exception& error) { // Use fallback chat template if it was not found in tokenizer_config.json - new_templated_chat_history = - m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = - m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = - m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); // some symbols combinations can be encoded by the tokenizer in different ways - // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from - // the old history so let's check it out, find the trusted part and use it in on the next step + // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history + // so let's check it out, find the trusted part and use it in on the next step size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, - m_tokenized_history, - stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); } if (m_tokenized_history.empty()) { @@ -189,94 +182,81 @@ class InputsEmbedder::IInputsEmbedder { } else if (trusted_history_length != SIZE_MAX || m_kv_history_manager.does_kv_cache_need_to_update()) { // does_kv_cache_need_to_update will be true here if beam search is activated - // in beam search mode we want to remove all history about last model answer from kv cache and add the - // best answer directly if we have difference in model answer and decoded answer it anyway will be less - // then entire history, so let's use data from m_kv_history_manager + // in beam search mode we want to remove all history about last model answer from kv cache and add the best answer directly + // if we have difference in model answer and decoded answer it anyway will be less then entire history, so let's use data from m_kv_history_manager if (m_kv_history_manager.does_kv_cache_need_to_update()) { trusted_history_length = m_kv_history_manager.trusted_history_length; } else { - m_kv_history_manager.num_tokens_to_remove_from_kv_cache = - m_tokenized_history.size() - trusted_history_length; - // if prev generation was finished because of max len was reached, kv cache is missed one last - // token, let's keep it - m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= - m_last_disappeared_token.has_value() ? 
1 : 0; + m_kv_history_manager.num_tokens_to_remove_from_kv_cache = m_tokenized_history.size() - trusted_history_length; + // if prev generation was finished because of max len was reached, kv cache is missed one last token, let's keep it + m_kv_history_manager.num_tokens_to_remove_from_kv_cache -= m_last_disappeared_token.has_value() ? 1 : 0; } ov::Tensor new_tensor = ov::Tensor(new_chat_tokens.get_element_type(), {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}, new_chat_tokens.data() + trusted_history_length); encoded_input_ids = ov::Tensor(new_chat_tokens.get_element_type(), - {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); + {1, new_chat_tokens.get_shape().at(1) - trusted_history_length}); new_tensor.copy_to(encoded_input_ids); } else { - encoded_input_ids = - utils::subtract_chat_tokenized_inputs({new_chat_tokens}, prev_chat_tokens).input_ids; + encoded_input_ids = utils::subtract_chat_tokenized_inputs( + {new_chat_tokens}, prev_chat_tokens + ).input_ids; if (m_last_disappeared_token.has_value()) - encoded_input_ids = - ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); + encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back( - PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); - std::copy_n(new_chat_tokens.data(), - new_chat_tokens.get_size(), - std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); } else { auto start_tokenizer_time = std::chrono::steady_clock::now(); encoded_input_ids = m_tokenizer.encode(prompt).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back( - PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), - encoded_input_ids.get_size(), - std::back_inserter(m_tokenized_history)); + std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); } return encoded_input_ids; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. + * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ /** - * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). - * - * @param images A vector of tensors representing the images. 
Each tensor can have a shape of either [NHWC] or - * [HWC]. - * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. - */ + * @brief Converts a vector of batched images ([NHWC]) into a vector of individual image tensors ([1HWC]). + * + * @param images A vector of tensors representing the images. Each tensor can have a shape of either [NHWC] or [HWC]. + * @return A vector of tensors where each tensor represents a single image with a shape of [1, H, W, C]. + */ std::vector to_single_image_tensors(const std::vector& images) { std::vector single_image_tensors; for (const auto& image : images) { ov::Tensor reshaped_image = image; ov::Shape image_shape = image.get_shape(); switch (image_shape.size()) { - case 3: - reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); - break; - case 4: - break; - default: - OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); + case 3: + reshaped_image.set_shape({1, image_shape.at(0), image_shape.at(1), image_shape.at(2)}); + break; + case 4: break; + default: OPENVINO_THROW("Input image must have [NHWC] or [HWC] layout"); } ov::Shape reshaped_image_shape = reshaped_image.get_shape(); for (size_t batch_idx = 0; batch_idx < reshaped_image_shape.at(0); ++batch_idx) { ov::Tensor single_image{ reshaped_image.get_element_type(), {1, reshaped_image_shape.at(1), reshaped_image_shape.at(2), reshaped_image_shape.at(3)}, - reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * - reshaped_image_shape.at(2) * reshaped_image_shape.at(3)}; + reshaped_image.data() + batch_idx * reshaped_image_shape.at(1) * reshaped_image_shape.at(2) * reshaped_image_shape.at(3) + }; single_image_tensors.push_back(std::move(single_image)); } } @@ -297,11 +277,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t m_image_id = 0; public: - InputsEmbedderMiniCPM(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) { + InputsEmbedderMiniCPM( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config); ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model"); @@ -310,26 +291,25 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); } - InputsEmbedderMiniCPM(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { - m_resampler = utils::singleton_core() - .compile_model(get_model_weights_pair(models_map, "resampler").first, - get_model_weights_pair(models_map, "resampler").second, - device, - device_config) - .create_infer_request(); - - m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } + InputsEmbedderMiniCPM( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const 
std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { + m_resampler = utils::singleton_core().compile_model( + get_model_weights_pair(models_map, "resampler").first, + get_model_weights_pair(models_map, "resampler").second, + device, + device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string images_prompt; std::vector embeds; @@ -367,18 +347,24 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor encoded_input = get_encoded_input_ids(images_prompt, metrics); ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - OPENVINO_ASSERT(m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), "Unexpected embedding size"); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor special_tokens = - m_tokenizer - .encode(m_vlm_config.im_start + m_vlm_config.im_end + m_vlm_config.slice_start + m_vlm_config.slice_end) - .input_ids; + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - OPENVINO_ASSERT(4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int."); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." 
+ ); int64_t im_start_id = special_tokens.data()[0]; int64_t im_end_id = special_tokens.data()[1]; int64_t slice_start_id = special_tokens.data()[2]; @@ -390,15 +376,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { int64_t* end = ids + encoded_input_size; float* inputs_embeds_data = inputs_embeds.data(); for (const EncodedImage& encoded_image : embeds) { - const ov::Tensor& resampled_source = - resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, end, im_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(emb, - resampled_source.get_size(), - inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { size_t token_idx = 0; @@ -407,17 +390,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { size_t d2 = slices_shape.at(2); size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ - ov::element::f32, - {1, d2, d3}, - encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, slice_start_id); OPENVINO_ASSERT(end != ids); ++ids; - std::copy_n(vision_embed_tensor_i_j.data(), - vision_embed_tensor_i_j.get_size(), - inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += m_vlm_config.query_num; } } @@ -447,7 +425,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) { return height_width.height * height_width.width; }); - adjust_pos_cache(target_sizes, m_vlm_config.hidden_size, m_pos_embed_cache); + adjust_pos_cache( + target_sizes, + m_vlm_config.hidden_size, + m_pos_embed_cache + ); size_t max_patch_len = *std::max_element(patch_len.begin(), patch_len.end()); ov::Tensor key_padding_mask(ov::element::f32, {bs, max_patch_len}); float* mask_data = key_padding_mask.data(); @@ -462,9 +444,11 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { size_t target_w = target_sizes.at(i).width; for (size_t h_idx = 0; h_idx < target_h; ++h_idx) { for (size_t w_idx = 0; w_idx < target_w; ++w_idx) { - std::copy_n(cache_data + (h_idx * _d1 + w_idx) * embed_len, - embed_len, - pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len); + std::copy_n( + cache_data + (h_idx * _d1 + w_idx) * embed_len, + embed_len, + pos_embed_data + (h_idx * target_w + w_idx) * bs * embed_len + i * embed_len + ); } } for (size_t flat = target_h * target_w; flat < max_patch_len; ++flat) { @@ -473,8 +457,8 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { std::fill_n(mask_data + i * max_patch_len, patch_len[i], 0.0f); std::fill_n(mask_data + i * max_patch_len + patch_len[i], max_patch_len - 
patch_len[i], 1.0f); } - m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] - m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] + m_resampler.set_tensor("image_feature", encoded_image); // [N, H*W, old_hidden_size] + m_resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size] m_resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W] m_resampler.infer(); return m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] @@ -494,12 +478,12 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { for (size_t j = 0; j < res_d_1; ++j) { size_t k = 0; for (; k < first.get_shape().at(2); ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = - first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = first_data[i * res_d_1 * first.get_shape().at(2) + j * first.get_shape().at(2) + k]; } for (size_t l = 0; l < second.get_shape().at(2); ++l, ++k) { - res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] = - second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; + res_data[i * res_d_1 * res_d_2 + j * res_d_2 + k] + = second_data[i * res_d_1 * second.get_shape().at(2) + j * second.get_shape().at(2) + l]; } } } @@ -545,14 +529,16 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Shape grid_shape = grid.get_shape(); float* grid_data = grid.data(); ov::Shape plane_shape{grid_shape.at(1), grid_shape.at(2)}; - ov::Tensor emb_h = - get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, - ov::Tensor{ov::element::f32, plane_shape, grid_data}); // (H, W, D/2) - ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new( - embed_dim / 2, - ov::Tensor{ov::element::f32, - plane_shape, - grid_data + plane_shape.at(0) * plane_shape.at(1)}); // (H, W, D/2) + ov::Tensor emb_h = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + }); // (H, W, D/2) + ov::Tensor emb_w = get_1d_sincos_pos_embed_from_grid_new(embed_dim / 2, ov::Tensor{ + ov::element::f32, + plane_shape, + grid_data + plane_shape.at(0) * plane_shape.at(1) + }); // (H, W, D/2) return concatenate_last_dim(emb_h, emb_w); } @@ -574,19 +560,17 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { return get_2d_sincos_pos_embed_from_grid(embed_dim, grid); } - void adjust_pos_cache(const std::vector& target_sizes, size_t hidden_size, ov::Tensor& pos_embed_cache) { - size_t max_h = std::max_element(target_sizes.begin(), - target_sizes.end(), - [](const ImageSize& left, const ImageSize& right) { - return left.height < right.height; - }) - ->height; - size_t max_w = std::max_element(target_sizes.begin(), - target_sizes.end(), - [](const ImageSize& left, const ImageSize& right) { - return left.width < right.width; - }) - ->width; + void adjust_pos_cache( + const std::vector& target_sizes, + size_t hidden_size, + ov::Tensor& pos_embed_cache + ) { + size_t max_h = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.height < right.height; + })->height; + size_t max_w = std::max_element(target_sizes.begin(), target_sizes.end(), [](const ImageSize& left, const ImageSize& right) { + return left.width < right.width; + })->width; size_t allocated_height, allocated_width; if (pos_embed_cache) { const ov::Shape& allocated_shape = pos_embed_cache.get_shape(); @@ 
-598,37 +582,36 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { if (max_h > allocated_height || max_w > allocated_width) { allocated_height = std::max(max_h, allocated_height); allocated_width = std::max(max_w, allocated_width); - pos_embed_cache = get_2d_sincos_pos_embed(hidden_size, {allocated_height, allocated_width}); + pos_embed_cache = get_2d_sincos_pos_embed( + hidden_size, {allocated_height, allocated_width} + ); } } }; class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderLLaVA(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderLLaVA(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVA( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderLLaVA( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " - "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " - "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; - + std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; @@ -649,21 +632,21 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = - m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + 
metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_llava(const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_token_id) { + ov::Tensor merge_text_and_image_embeddings_llava( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_token_id + ) { auto text_embeds_shape = text_embeds.get_shape(); size_t text_embeds_seq_length = text_embeds_shape[1]; size_t hidden_size = text_embeds_shape[2]; @@ -678,18 +661,22 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } } auto num_images = image_embeds.size(); - OPENVINO_ASSERT(num_image_tokens == num_images, - "Number of image tokens in input_ids different from num_images."); + OPENVINO_ASSERT( + num_image_tokens == num_images, + "Number of image tokens in input_ids different from num_images." + ); size_t total_image_seq_length = 0; for (const auto& single_image_embeds : image_embeds) { - OPENVINO_ASSERT(text_embeds_shape[2] == single_image_embeds.get_shape().at(2), - "Incompatible shapes between text_embeds and image_embeds"); + OPENVINO_ASSERT( + text_embeds_shape[2] == single_image_embeds.get_shape().at(2), + "Incompatible shapes between text_embeds and image_embeds" + ); total_image_seq_length += single_image_embeds.get_shape().at(1); } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -700,11 +687,15 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { const float* image_embeds_data = image_embeds[image_idx].data(); size_t image_seq_length = image_embeds[image_idx].get_shape()[1]; - std::copy_n(image_embeds_data, image_seq_length * hidden_size, merged_data + merged_idx * hidden_size); + std::copy_n(image_embeds_data, + image_seq_length * hidden_size, + merged_data + merged_idx * hidden_size); merged_idx += image_seq_length; image_idx++; } else { - std::copy_n(text_embeds_data + s * hidden_size, hidden_size, merged_data + merged_idx * hidden_size); + std::copy_n(text_embeds_data + s * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); merged_idx++; } } @@ -714,36 +705,33 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { public: - InputsEmbedderLLaVANext(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderLLaVANext(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& 
prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderLLaVANext( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + InputsEmbedderLLaVA(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderLLaVANext( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + InputsEmbedderLLaVA(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_token = m_vlm_config.im_start; // Adapted from llava-1.5-7b-hf chat_template.json - std::string chat_template_fallback = - "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' " - "}}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if " - "add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; + std::string chat_template_fallback = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'USER: ' + message['content'] + ' ' }}{% else %}{{ 'ASSISTANT: ' + message['content'] + ' ' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}"; std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + ov::Tensor image_newline; for (const auto& image : single_images) { @@ -756,10 +744,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(m_vlm_config.image_newline.begin(), m_vlm_config.image_newline.end(), image_newline_data); } - ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] + ImageSize original_image_size{image.get_shape().at(1), image.get_shape().at(2)}; // [height, width] - ov::Tensor packed_features = - pack_image_features_llava_next(encoded_image, original_image_size, image_newline); + ov::Tensor packed_features = pack_image_features_llava_next(encoded_image, original_image_size, image_newline); image_embeds.push_back(std::move(packed_features)); formatted_prompt += image_token + "\n"; @@ -773,29 +760,29 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_token = - m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_token = m_tokenizer.encode(m_vlm_config.im_start, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t image_token_id = encoded_image_token.data()[encoded_image_token.get_size() - 1]; return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, 
image_token_id); } private: /** - * @brief Processes base and patches image features extracted from encoded image. - * Used in getting inputs embeds for llava_next models. - * - * @param encoded_image An encoded image retrieved from vision encoder - * @param original_image_size A size of the original image - * @param image_newline An image newline tensor with a shape (embed_dim) - * @return A tensor with a shape (1, new_seq_len, embed_dim) - */ - ov::Tensor pack_image_features_llava_next(const EncodedImage& encoded_image, - const ImageSize& original_image_size, - const ov::Tensor& image_newline) { + * @brief Processes base and patches image features extracted from encoded image. + * Used in getting inputs embeds for llava_next models. + * + * @param encoded_image An encoded image retrieved from vision encoder + * @param original_image_size A size of the original image + * @param image_newline An image newline tensor with a shape (embed_dim) + * @return A tensor with a shape (1, new_seq_len, embed_dim) + */ + ov::Tensor pack_image_features_llava_next( + const EncodedImage& encoded_image, + const ImageSize& original_image_size, + const ov::Tensor& image_newline + ) { auto image_feature = encoded_image.resized_source; auto image_feature_shape = image_feature.get_shape(); size_t num_patches = image_feature_shape[0]; @@ -813,12 +800,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(src_data, src_data + patch_seq_len * embed_dim, dst_data); // Extract other grid patches - ov::Tensor patches_image_feature(image_feature.get_element_type(), - {num_patches - 1, patch_seq_len, embed_dim}); + ov::Tensor patches_image_feature(image_feature.get_element_type(), {num_patches - 1, patch_seq_len, embed_dim}); dst_data = patches_image_feature.data(); std::copy(src_data + patch_seq_len * embed_dim, - src_data + num_patches * patch_seq_len * embed_dim, - dst_data); + src_data + num_patches * patch_seq_len * embed_dim, + dst_data); // Process grid patches image feature size_t height = encoded_image.resized_source_size.height; @@ -826,11 +812,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t num_patch_height = encoded_image.patches_grid.first; size_t num_patch_width = encoded_image.patches_grid.second; - ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, - num_patch_height, - num_patch_width, - height, - width); + ov::Tensor reshaped_image_feature = reshape_and_rearrange_image_feature(patches_image_feature, num_patch_height, num_patch_width, height, width); ov::Tensor unpadded_image_feature = unpad_image(reshaped_image_feature, original_image_size); @@ -838,8 +820,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor processed_image_feature = flatten_and_transpose(image_feature_with_newline); - // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, - // embed_dim]) + // Concatenate base image feature ([1, seq_len_1, emded_dim]) and patches image feature ([seq_len_2, embed_dim]) auto base_shape = base_image_feature.get_shape(); auto processed_shape = processed_image_feature.get_shape(); @@ -851,30 +832,32 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::copy(base_data, base_data + base_shape[1] * embed_dim, result.data()); // Copy processed image feature data std::copy(processed_data, - processed_data + processed_shape[0] * embed_dim, - result.data() + base_shape[1] * embed_dim); + processed_data + processed_shape[0] * embed_dim, + 
result.data() + base_shape[1] * embed_dim); return result; } else { // If there is only one patch, return the original (base) image feature concatenated with image_newline ov::Tensor result(image_feature.get_element_type(), {1, patch_seq_len + 1, embed_dim}); // Copy base image feature data std::copy(image_feature_data + embed_dim, - image_feature_data + patch_seq_len * embed_dim, - result.data()); + image_feature_data + patch_seq_len * embed_dim, + result.data()); // Append image_newline data - std::copy(newline_data, newline_data + embed_dim, result.data() + patch_seq_len * embed_dim); + std::copy(newline_data, + newline_data + embed_dim, + result.data() + patch_seq_len * embed_dim); return result; } } /** - * @brief Adds image newline tensor to patches image feature tensor. - * Used for packing image features of llava_next models. - * - * @param image_feature A tensor with a shape (embed_dim, height, width) - * @param image_newline A tensor with a shape (embed_dim) - * @return A tensor with a shape (embed_dim, height, width + 1) - */ + * @brief Adds image newline tensor to patches image feature tensor. + * Used for packing image features of llava_next models. + * + * @param image_feature A tensor with a shape (embed_dim, height, width) + * @param image_newline A tensor with a shape (embed_dim) + * @return A tensor with a shape (embed_dim, height, width + 1) + */ ov::Tensor add_image_newline(const ov::Tensor& image_feature, const ov::Tensor& image_newline) { auto shape = image_feature.get_shape(); @@ -884,8 +867,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t height = shape[1]; size_t width = shape[2]; - OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, - "image_newline dimension must match embed_dim of image_feature"); + OPENVINO_ASSERT(image_newline.get_shape()[0] == embed_dim, "image_newline dimension must match embed_dim of image_feature"); const float* image_feature_data = image_feature.data(); const float* newline_data = image_newline.data(); @@ -896,9 +878,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (size_t h = 0; h < height; ++h) { // Copy original image feature data - std::copy(image_feature_data + (e * height * width + h * width), - image_feature_data + (e * height * width + (h + 1) * width), - feature_with_newline_data + (e * height * (width + 1) + h * (width + 1))); + std::copy( + image_feature_data + (e * height * width + h * width), + image_feature_data + (e * height * width + (h + 1) * width), + feature_with_newline_data + (e * height * (width + 1) + h * (width + 1)) + ); // Add image newline feature_with_newline_data[e * height * (width + 1) + h * (width + 1) + width] = newline_data[e]; } @@ -908,12 +892,12 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { } /** - * @brief Flattens and transposes tensor. - * Used for packing image features of llava_next models. - * - * @param tensor A tensor with a shape (embed_dim, height, width) - * @return A tensor with a shape (height * width, embed_dim) - */ + * @brief Flattens and transposes tensor. + * Used for packing image features of llava_next models. 
+ * + * @param tensor A tensor with a shape (embed_dim, height, width) + * @return A tensor with a shape (height * width, embed_dim) + */ ov::Tensor flatten_and_transpose(const ov::Tensor& tensor) { auto shape = tensor.get_shape(); OPENVINO_ASSERT(shape.size() == 3, "Flattening tensor must have 3 dimensions"); @@ -937,6 +921,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { return flatten_feature; } + ov::Tensor reshape_and_rearrange_image_feature(const ov::Tensor& image_feature, int num_patch_height, int num_patch_width, @@ -949,11 +934,15 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { size_t patch_seq_len = shape[1]; size_t embed_dim = shape[2]; - OPENVINO_ASSERT(num_patches == num_patch_height * num_patch_width, - "Number of patches does not match the specified grid size"); + OPENVINO_ASSERT( + num_patches == num_patch_height * num_patch_width, + "Number of patches does not match the specified grid size" + ); - OPENVINO_ASSERT(patch_seq_len == height * width, - "Patch sequence length does not match the specified height and width"); + OPENVINO_ASSERT( + patch_seq_len == height * width, + "Patch sequence length does not match the specified height and width" + ); // Reshape tensor data and permute dimensions // [num_patches, patch_seq_len, embed_dim] -> [embed_dim, num_patch_height, height, num_patch_width, width] @@ -976,19 +965,20 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor result(image_feature.get_element_type(), {static_cast(embed_dim), static_cast(num_patch_height * height), - static_cast(num_patch_width * width)}); + static_cast(num_patch_width * width)} + ); std::copy(reshaped_data.begin(), reshaped_data.end(), result.data()); return result; } /** - * @brief Unpads an image tensor of a padded and resized image. - * Used for packing image features of llava_next models. - * - * @param tensor An image tensor with a shape (embed_dim, height, width) - * @param original_size A size of original image - * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) - */ + * @brief Unpads an image tensor of a padded and resized image. + * Used for packing image features of llava_next models. 
+ * + * @param tensor An image tensor with a shape (embed_dim, height, width) + * @param original_size A size of original image + * @return An unpadded image tensor with a shape (embed_dim, new_height, new_width) + */ ov::Tensor unpad_image(const ov::Tensor& tensor, const ImageSize& original_size) { size_t original_height = original_size.height; size_t original_width = original_size.width; @@ -1013,9 +1003,9 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (int h = 0; h < unpadded_height_dim; ++h) { std::copy( tensor.data() + (e * current_height * current_width + (padding + h) * current_width), - tensor.data() + - (e * current_height * current_width + (padding + h) * current_width + current_width), - unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width)); + tensor.data() + (e * current_height * current_width + (padding + h) * current_width + current_width), + unpadded_tensor.data() + (e * unpadded_height_dim * current_width + h * current_width) + ); } } } else { @@ -1027,11 +1017,11 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { for (size_t e = 0; e < embed_dim; ++e) { for (int h = 0; h < current_height; ++h) { - std::copy(tensor.data() + (e * current_height * current_width + h * current_width + padding), - tensor.data() + (e * current_height * current_width + h * current_width + padding + - unpadded_width_dim), - unpadded_tensor.data() + - (e * current_height * unpadded_width_dim + h * unpadded_width_dim)); + std::copy( + tensor.data() + (e * current_height * current_width + h * current_width + padding), + tensor.data() + (e * current_height * current_width + h * current_width + padding + unpadded_width_dim), + unpadded_tensor.data() + (e * current_height * unpadded_width_dim + h * unpadded_width_dim) + ); } } } @@ -1042,40 +1032,40 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { public: - InputsEmbedderInternVLChat(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config) {} - - InputsEmbedderInternVLChat(const VLMConfig& vlm_config, - const ModelsMap& models_map, - const Tokenizer& tokenizer, - const std::filesystem::path& config_dir_path, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} - - virtual ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderInternVLChat( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, model_dir, device, device_config) { } + + InputsEmbedderInternVLChat( + const VLMConfig& vlm_config, + const ModelsMap& models_map, + const Tokenizer& tokenizer, + const std::filesystem::path& config_dir_path, + const std::string& device, + const ov::AnyMap device_config) : + IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } + + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { std::string image_start_token = m_vlm_config.image_start_token; std::string image_context_token = m_vlm_config.image_context_token; std::string 
image_end_token = m_vlm_config.image_end_token; - + std::vector single_images = to_single_image_tensors(images); std::string formatted_prompt; std::vector image_embeds; image_embeds.reserve(single_images.size()); - + for (const auto& image : single_images) { EncodedImage encoded_image = m_vision_encoder.encode(image); ov::Tensor single_image_embeds = encoded_image.resized_source; const size_t num_patches = single_image_embeds.get_shape().at(0); const size_t num_image_tokens = single_image_embeds.get_shape().at(1); - + formatted_prompt += image_start_token; for (int i = 0; i < num_patches * num_image_tokens; ++i) { formatted_prompt += image_context_token; @@ -1093,22 +1083,21 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { return text_embeds; } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor encoded_image_context_token = - m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_context_token = m_tokenizer.encode(image_context_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); - metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += - ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - int64_t image_context_token_id = - encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; + metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + int64_t image_context_token_id = encoded_image_context_token.data()[encoded_image_context_token.get_size() - 1]; return merge_text_and_image_embeddings_internvl(input_ids, text_embeds, image_embeds, image_context_token_id); } protected: - ov::Tensor merge_text_and_image_embeddings_internvl(const ov::Tensor& input_ids, - const ov::Tensor& text_embeds, - const std::vector& image_embeds, - int64_t image_context_token_id) { + ov::Tensor merge_text_and_image_embeddings_internvl( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const std::vector& image_embeds, + int64_t image_context_token_id + ) { auto text_embeds_shape = text_embeds.get_shape(); size_t batch_size = text_embeds_shape.at(0); size_t seq_len = text_embeds_shape.at(1); @@ -1142,14 +1131,12 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { if (image_context_tokens_mask[flat_idx]) { const ov::Tensor& single_image_embeds = image_embeds[image_idx]; - const size_t num_all_image_tokens = - single_image_embeds.get_shape().at(0) * - single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens + const size_t num_all_image_tokens = single_image_embeds.get_shape().at(0) * single_image_embeds.get_shape().at(1); // num_patches * num_image_tokens const float* image_embeds_data = single_image_embeds.data(); std::copy_n(image_embeds_data + image_context_token_idx * embed_dim, embed_dim, merged_embeds_data + offset); - + ++image_context_token_idx; if (image_context_token_idx == num_all_image_tokens) { @@ -1290,23 +1277,30 @@ ov::InferRequest create_hd_feature_transformer() { // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] // t29 = opset.Constant(model, 29, # -> i64[1]([2]) // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = 
opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] - // -> i64[6] t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> - // f32[?,12,2,12,2,1024] t33 = opset.Constant(model, 33, t34 = opset.Transpose([t32, t33], {}, # - // f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] t35 = opset.Constant(model, 35, # -> i64[1]([-1]) t36 = - // opset.Constant(model, 36, # -> i64[1]([4]) t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # - // i64[1], i64[1] -> i64[1] t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] t39 - // = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] t40 = - // opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] t41 = opset.Convert([t40], - // {'destination_type': 'i64'}, # i32[] -> i64[] t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', - // 'm_pythondiv': True}, # i64[], i64[] -> i64[] t43 = opset.Floor([t42], {}, # i64[] -> i64[] t44 = - // opset.Constant(model, 44, # -> i32[](0) t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] t46 = - // opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] t47 = opset.Unsqueeze([t46, t44], {}, # - // i64[], i32[] -> i64[1] t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] t49 = - // opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] t50 = opset.Constant(model, 50, # -> i64[1]([-1]) t51 - // = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> - // i64[6] t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] t53 - // = opset.Constant(model, 53, t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] 
+ // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] + // t33 = opset.Constant(model, 33, + // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] + // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) + // t36 = opset.Constant(model, 36, # -> i64[1]([4]) + // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] + // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] + // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> f32[?,?,4096] + // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] + // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] + // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] + // t43 = opset.Floor([t42], {}, # i64[] -> i64[] + // t44 = opset.Constant(model, 44, # -> i32[](0) + // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] + // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] + // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] + // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] + // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] + // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) + // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] + // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] + // t53 = opset.Constant(model, 53, + // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] // t57 = opset.Constant(model, 57, # -> i64[](2) @@ -1323,8 +1317,9 @@ ov::InferRequest create_hd_feature_transformer() { // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
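    // Summary of the graph above (shapes inferred from the listing, for reference):
    // the transformer takes image_features f32[N, 576, 1024] plus the scalar crop
    // counts h_crop and w_crop, reshapes the 576 tokens of each crop into a 24x24
    // grid, merges every 2x2 block of 1024-dim patch features into one 4096-dim
    // vector, and regroups the N = num_images * h_crop * w_crop crops into
    // f32[num_images, h_crop*12, w_crop*12, 4096].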
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = - utils::singleton_core().compile_model(model, "CPU").create_infer_request(); + ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + model, "CPU" + ).create_infer_request(); // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); // ov::Tensor h_crop = ov::Tensor{i32, {}}; // h_crop.data()[0] = 1; @@ -1337,10 +1332,7 @@ ov::InferRequest create_hd_feature_transformer() { return hd_feature_transformer; } -ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, - size_t h_crop, - size_t w_crop, - InferRequest& hd_feature_transformer) { +ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { ov::Shape shape = image_features.get_shape(); OPENVINO_ASSERT(3 == shape.size()); OPENVINO_ASSERT(24 * 24 == shape.at(1)); @@ -1364,24 +1356,23 @@ ov::Tensor add_image_newline(const ov::Tensor& image_features_hd, const std::vec for (size_t batch_id = 0; batch_id < nhwc.at(0); ++batch_id) { for (size_t row_id = 0; row_id < nhwc.at(1); ++row_id) { for (size_t col_id = 0; col_id < nhwc.at(2); ++col_id) { - std::copy_n(in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + - col_id * nhwc.at(3), - nhwc.at(3), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + - row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3)); + std::copy_n( + in + batch_id * nhwc.at(1) * nhwc.at(2) * nhwc.at(3) + row_id * nhwc.at(2) * nhwc.at(3) + col_id * nhwc.at(3), + nhwc.at(3), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + col_id * nhwc.at(3) + ); } - std::copy(sub_GN.begin(), - sub_GN.end(), - out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + - row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3)); + std::copy( + sub_GN.begin(), + sub_GN.end(), + out + batch_id * nhwc.at(1) * (nhwc.at(2) + 1) * nhwc.at(3) + row_id * (nhwc.at(2) + 1) * nhwc.at(3) + nhwc.at(2) * nhwc.at(3) + ); } } return image_features_hd_new_line; } -ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, - const std::vector& second_f, - const ov::Tensor& third_1lf) { +ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, const std::vector& second_f, const ov::Tensor& third_1lf) { size_t first_l = first_1lf.get_shape().at(1); constexpr size_t second_l = 1; size_t third_l = third_1lf.get_shape().at(1); @@ -1396,20 +1387,12 @@ ov::Tensor concatenate_2d(const ov::Tensor& first_1lf, } // image_features.resized_source: (num_crops+1, 24*24, 1024) -ov::Tensor hd_feature_transform(const EncodedImage& image_features, - InferRequest& hd_feature_transformer, - const std::vector& sub_GN, - const std::vector& glb_GN, - ov::InferRequest& vision_projection) { +ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest& hd_feature_transformer, const std::vector& sub_GN, const std::vector& glb_GN, ov::InferRequest& vision_projection) { const ov::Shape& image_features_shape = image_features.resized_source.get_shape(); - ov::Tensor global_image_features{ov::element::f32, - {1, image_features_shape.at(1), image_features_shape.at(2)}, - image_features.resized_source.data()}; + ov::Tensor global_image_features{ov::element::f32, {1, image_features_shape.at(1), image_features_shape.at(2)}, image_features.resized_source.data()}; // global feature can be 
viewed as a special HD case with num_crops 1x1 - ov::Tensor global_image_features_hd = - reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); - ov::Tensor global_image_features_hd_newline = - add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] + ov::Tensor global_image_features_hd = reshape_hd_patches_2x2merge(global_image_features, 1, 1, hd_feature_transformer); + ov::Tensor global_image_features_hd_newline = add_image_newline(global_image_features_hd, sub_GN); // [1,12*(12+1),4096] constexpr size_t INPUT_IMAGE_SIZE = 336; size_t h_crop = image_features.resized_source_size.height / INPUT_IMAGE_SIZE; size_t w_crop = image_features.resized_source_size.width / INPUT_IMAGE_SIZE; @@ -1417,23 +1400,27 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, // NOTE: real num_crops is padded // (num_crops, 24*24, 1024) - ov::Tensor sub_image_features{ - ov::element::f32, - {num_crops, image_features_shape.at(1), image_features_shape.at(2)}, - image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; - ov::Tensor sub_image_features_hd = - reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] - ov::Tensor sub_image_features_hd_newline = - add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] + ov::Tensor sub_image_features{ov::element::f32, { + num_crops, + image_features_shape.at(1), + image_features_shape.at(2) + }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; + ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] + ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { constexpr int make_suffix_iterator = -1; std::regex rgx{R"(<\|image_\d+\|>)"}; - std::sregex_token_iterator iter{text.begin(), text.end(), rgx, make_suffix_iterator}; + std::sregex_token_iterator iter{ + text.begin(), + text.end(), + rgx, + make_suffix_iterator + }; std::vector tokenized; - for (; iter != std::sregex_token_iterator{}; ++iter) { + for ( ; iter != std::sregex_token_iterator{}; ++iter) { if (iter->str().empty()) { continue; } @@ -1442,19 +1429,16 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token return tokenized; } -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, -// ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { +// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { // ov::Tensor encoded_input_ids; // if (is_chat_conversation) { // // KV cache in model already contains prompts and answers from previous iterations. // // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns // // token_ids = {, ...}. So if tokenizer applies only to the new prompt, // // will be inserted on every iteration. 
-// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new -// prompt +// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt // // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return -// token, but +// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but // // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. // m_history.push_back({{"role", "user"}, {"content", prompt}}); // constexpr bool add_generation_prompt = true; @@ -1463,8 +1447,7 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); // } catch (const std::exception& error) { // // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, -// chat_template_fallback); +// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); // } // auto start_tokenizer_time = std::chrono::steady_clock::now(); // ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; @@ -1481,19 +1464,18 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token // ).input_ids; // } // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - -// start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); +// m_templated_chat_history = std::move(new_templated_chat_history); // } else { // auto start_tokenizer_time = std::chrono::steady_clock::now(); // encoded_input_ids = m_tokenizer.encode(prompt).input_ids; // auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - -// start_tokenizer_time)); +// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); // } // return encoded_input_ids; // } -} // namespace phi3_v -} // namespace +} +} class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: @@ -1502,31 +1484,24 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // Used to insert <|image_i|>\n per image (not a slice). 
size_t m_image_id = 1; - InputsEmbedderPhi3V(const VLMConfig& vlm_config, - const std::filesystem::path& model_dir, - const std::string& device, - const ov::AnyMap device_config) - : IInputsEmbedder(vlm_config, model_dir, device, device_config), - m_image_id{0}, - m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core() - .compile_model(model_dir / "openvino_vision_projection_model.xml", device) - .create_infer_request()} {} - - ov::Tensor get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) override { + InputsEmbedderPhi3V( + const VLMConfig& vlm_config, + const std::filesystem::path& model_dir, + const std::string& device, + const ov::AnyMap device_config + ): + IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, + m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} + + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { // TODO: perfmetrics - OPENVINO_ASSERT(!std::regex_search(prompt, std::regex{R"(<\|image_\d+\|>)"}), "<|image_i|> can't be used in the prompt because it's reserved for images"); + std::cout << prompt<<'\n'; std::stringstream images_prompt; std::vector images_features_proj; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, - m_hd_feature_transformer, - m_vlm_config.sub_GN, - m_vlm_config.glb_GN, - m_vision_projection)); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); images_prompt << "<|image_" << m_image_id << "|>\n"; ++m_image_id; } @@ -1589,26 +1564,27 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { // int64_t* end = ids + encoded_input_size; // float* inputs_embeds_data = inputs_embeds.data(); // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, - // {encoded_image.resized_source_size}); float* emb = resampled_source.data(); ids = std::find(ids, - // end, im_start_id); OPENVINO_ASSERT(end != ids); + // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); + // float* emb = resampled_source.data(); + // ids = std::find(ids, end, im_start_id); + // OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * - // m_vlm_config.hidden_size); ids += m_vlm_config.query_num; if (encoded_image.slices) { + // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; + // if (encoded_image.slices) { // size_t token_idx = 0; // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); // for (size_t i = 0; i < slices_shape.at(0); ++i) { // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { // size_t d2 = slices_shape.at(2); // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + - // (i * slices_shape.at(1) + ja) * d2 * 
d3}; const ov::Tensor& vision_embed_tensor_i_j = - // resample(encoded_view, {encoded_image.slices_size}); ids = std::find(ids, end, - // slice_start_id); OPENVINO_ASSERT(end != ids); + // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); + // ids = std::find(ids, end, slice_start_id); + // OPENVINO_ASSERT(end != ids); // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), - // inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); ids += - // m_vlm_config.query_num; + // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); + // ids += m_vlm_config.query_num; // } // } // } @@ -1647,8 +1623,7 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, } else if (vlm_config.model_type == VLMModelType::PHI3_V) { m_impl = std::make_shared(vlm_config, model_dir, device, device_config); } else { - OPENVINO_THROW( - "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); } } @@ -1659,42 +1634,19 @@ InputsEmbedder::InputsEmbedder(const VLMConfig& vlm_config, const std::string& device, const ov::AnyMap device_config) { if (vlm_config.model_type == VLMModelType::MINICPM) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::LLAVA_NEXT) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else if (vlm_config.model_type == VLMModelType::INTERNVL_CHAT) { - m_impl = std::make_shared(vlm_config, - models_map, - tokenizer, - config_dir_path, - device, - device_config); + m_impl = std::make_shared(vlm_config, models_map, tokenizer, config_dir_path, device, device_config); } else { - OPENVINO_THROW( - "Unsupported model type in VLM InputsEmbedder class. Please, create feature request on new model support"); + OPENVINO_THROW("Unsupported model type in VLM InputsEmbedder class. 
Please, create feature request on new model support"); } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, - const std::vector& images, - ov::genai::VLMPerfMetrics& metrics) { +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) { return m_impl->get_inputs_embeds(prompt, images, metrics); } @@ -1706,10 +1658,7 @@ std::vector InputsEmbedder::get_tokenized_history() const { return m_impl->get_tokenized_history(); } -void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, - std::optional last_disappeared_token, - bool is_beam_search, - size_t last_answer_len) { +void InputsEmbedder::update_tokenized_history(const std::vector& encoded_result, std::optional last_disappeared_token, bool is_beam_search, size_t last_answer_len) { return m_impl->update_tokenized_history(encoded_result, last_disappeared_token, is_beam_search, last_answer_len); } @@ -1733,4 +1682,4 @@ void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } -} // namespace ov::genai +} // namespace ov::genai From edb2dc13e68724a875ddd5332d136ac54289fa95 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 14:41:15 +0400 Subject: [PATCH 11/28] working chat --- .../src/visual_language/inputs_embedder.cpp | 214 ++++++------------ 1 file changed, 74 insertions(+), 140 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b1027c533b..1f1e162127 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -676,7 +676,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { } size_t merged_seq_length = text_embeds_seq_length + total_image_seq_length - num_image_tokens; - constexpr size_t BATCH_SIZE = 1; + constexpr size_t BATCH_SIZE = 1; ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); float* merged_data = merged_embeds.data(); @@ -1407,7 +1407,13 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest }, image_features.resized_source.data() + image_features_shape.at(1) * image_features_shape.at(2)}; ov::Tensor sub_image_features_hd = reshape_hd_patches_2x2merge(sub_image_features, h_crop, w_crop, hd_feature_transformer); // [1, 24, 24, 4096] ov::Tensor sub_image_features_hd_newline = add_image_newline(sub_image_features_hd, sub_GN); // [1,h_crop*12*(w_crop*12+1), 4096] - return concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] + ov::Tensor image_embeddings = concatenate_2d(sub_image_features_hd_newline, glb_GN, global_image_features_hd_newline); // [1,l,4096] + vision_projection.set_input_tensor(image_embeddings); + vision_projection.infer(); + ov::Tensor out = vision_projection.get_output_tensor(); + ov::Tensor res{out.get_element_type(), out.get_shape()}; + out.copy_to(res); + return res; } std::vector split_tokenize(const std::string& text, ov::genai::Tokenizer& tokenizer) { @@ -1428,52 +1434,6 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token } return tokenized; } - -// ov::Tensor apply_template_and_tokenize(bool is_chat_conversation, const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { -// ov::Tensor encoded_input_ids; -// if (is_chat_conversation) { -// // KV cache in model already contains prompts and answers from 
previous iterations. -// // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns -// // token_ids = {, ...}. So if tokenizer applies only to the new prompt, -// // will be inserted on every iteration. -// // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt -// // and takes only the difference between them. -// // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but -// // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. -// m_history.push_back({{"role", "user"}, {"content", prompt}}); -// constexpr bool add_generation_prompt = true; -// std::string new_templated_chat_history; -// try { -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); -// } catch (const std::exception& error) { -// // Use fallback chat template if it was not found in tokenizer_config.json -// new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); -// } -// auto start_tokenizer_time = std::chrono::steady_clock::now(); -// ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history).input_ids; -// if (m_is_cache_empty) { -// encoded_input_ids = new_chat_tokens; -// // after first `get_inputs_embeds` is called, we supposed LLM is inferred and cache is not empty -// m_is_cache_empty = false; -// } else { -// TokenizedInputs prev_chat_tokens = m_tokenizer.encode( -// m_templated_chat_history -// ); -// encoded_input_ids = utils::subtract_chat_tokenized_inputs( -// {new_chat_tokens}, prev_chat_tokens -// ).input_ids; -// } -// auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// m_templated_chat_history = std::move(new_templated_chat_history); -// } else { -// auto start_tokenizer_time = std::chrono::steady_clock::now(); -// encoded_input_ids = m_tokenizer.encode(prompt).input_ids; -// auto end_tokenizer_time = std::chrono::steady_clock::now(); -// metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); -// } -// return encoded_input_ids; -// } } } @@ -1495,100 +1455,74 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { - // TODO: perfmetrics - std::cout << prompt<<'\n'; - std::stringstream images_prompt; + OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; - for (const ov::Tensor& image : to_single_image_tensors(images)) { - EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); - images_prompt << "<|image_" << m_image_id << "|>\n"; - ++m_image_id; + std::vector tokens; + if (m_history.empty()) { + std::stringstream images_prompt; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = 
m_vision_encoder.encode(image); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; + } + images_prompt << prompt; + std::string new_templated_chat_history; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); + constexpr bool add_generation_prompt = true; + m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } else { + m_templated_chat_history = images_prompt.str(); + } + auto start_tokenizer_time = std::chrono::steady_clock::now(); + tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); + + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + if (m_is_chat_conversation) { + for (const ov::Tensor& chunk : tokens) { + m_tokenized_history.insert(m_tokenized_history.end(), chunk.data(), chunk.data() + chunk.get_size()); + } + } + } else { + tokens = {get_encoded_input_ids(prompt, metrics)}; + } + OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); + size_t features_length = 0; + for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { + size_t text_length = tokens.at(im_id).get_shape().at(1); + size_t im_length = images_features_proj.at(im_id).get_shape().at(1); + features_length += text_length + im_length; } - images_prompt << prompt; - phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); - - ov::Tensor inputs_embeds; - // if (m_vlm_config.use_image_id) { - // images_prompt += m_vlm_config.im_id_start + std::to_string(m_image_id) + m_vlm_config.im_id_end; - // ++m_image_id; - // } - // std::string unk64; - // for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - // unk64 += m_vlm_config.unk; - // } - // images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - // if (encoded_image.slices) { - // ov::Shape slices_shape = encoded_image.slices.get_shape(); - // for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - // for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - // images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - // } - // images_prompt += '\n'; - // } - // } - // if ('\n' != *(images_prompt.end() - 1)) { - // // Image wasn't sliced, add \n to the end of image anyway. - // // Strangely, \n isn't placed between . - // images_prompt += '\n'; - // } - // embeds.push_back(std::move(encoded_image)); - // } - // images_prompt += prompt; - - // ov::Tensor encoded_input = get_encoded_input_ids(images_prompt); - - // ov::Tensor inputs_embeds = m_embedding.infer(encoded_input); - // OPENVINO_ASSERT( - // m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - // "Unexpected embedding size" - //; - // ov::Tensor special_tokens = m_tokenizer.encode( - // m_vlm_config.im_start - // + m_vlm_config.im_end - // + m_vlm_config.slice_start - // + m_vlm_config.slice_end - //.input_ids; - // OPENVINO_ASSERT( - // 4 == special_tokens.get_shape().at(1), - // "Every special token must be represented with a single int." 
- //; - // int64_t im_start_id = special_tokens.data()[0]; - // int64_t im_end_id = special_tokens.data()[1]; - // int64_t slice_start_id = special_tokens.data()[2]; - // int64_t slice_end_id = special_tokens.data()[3]; - // int64_t im_start_pos = 0, slice_start_pos = 0; - // int64_t* begin = encoded_input.data(); - // int64_t* ids = begin; - // size_t encoded_input_size = encoded_input.get_size(); - // int64_t* end = ids + encoded_input_size; - // float* inputs_embeds_data = inputs_embeds.data(); - // for (const EncodedImage& encoded_image : embeds) { - // const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size}); - // float* emb = resampled_source.data(); - // ids = std::find(ids, end, im_start_id); - // OPENVINO_ASSERT(end != ids); - // ++ids; - // std::copy_n(emb, resampled_source.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // if (encoded_image.slices) { - // size_t token_idx = 0; - // const ov::Shape& slices_shape = encoded_image.slices.get_shape(); - // for (size_t i = 0; i < slices_shape.at(0); ++i) { - // for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { - // size_t d2 = slices_shape.at(2); - // size_t d3 = slices_shape.at(3); - // ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - // const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size}); - // ids = std::find(ids, end, slice_start_id); - // OPENVINO_ASSERT(end != ids); - // ++ids; - // std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds_data + std::distance(begin, ids) * m_vlm_config.hidden_size); - // ids += m_vlm_config.query_num; - // } - // } - // } - // } + features_length += tokens.back().get_shape().at(1); + ov::Tensor inputs_embeds{ov::element::f32, {1, features_length, m_vlm_config.hidden_size}}; + size_t offset = 0; + for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { + const ov::Tensor& text_embeds = m_embedding.infer(tokens.at(im_id)); + const ov::Tensor& image_embeds = images_features_proj.at(im_id); + size_t text_length = text_embeds.get_shape().at(1); + size_t im_length = image_embeds.get_shape().at(1); + std::copy_n( + text_embeds.data(), + text_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); + offset += text_length; + std::copy_n( + image_embeds.data(), + image_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); + offset += im_length; + } + const ov::Tensor& text_embeds = m_embedding.infer(tokens.back()); + size_t text_length = text_embeds.get_shape().at(1); + std::copy_n( + text_embeds.data(), + text_embeds.get_size(), + inputs_embeds.data() + offset * m_vlm_config.hidden_size + ); if (!m_is_chat_conversation) { m_image_id = 0; From f4c8bb8096a604cd09274a0d5be1738f383b781a Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 14:49:19 +0400 Subject: [PATCH 12/28] Put resize back --- src/cpp/src/visual_language/vision_encoder.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 0ca433b992..36de524b54 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -697,8 +697,7 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { 
return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; - // bilinear_resize(src, dst, new_w, new_h); - dst = src; // TODO: put resize back + bilinear_resize(src, dst, new_w, new_h); return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } From 2d988ab5a091d89d05b4569045674e5fc13d1420 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 15:59:39 +0400 Subject: [PATCH 13/28] clean up --- .../visual_language_chat.cpp | 22 ++--- .../src/visual_language/inputs_embedder.cpp | 86 +------------------ .../src/visual_language/vision_encoder.cpp | 36 +------- tests/python_tests/test_vlm_pipeline.py | 23 +++-- 4 files changed, 30 insertions(+), 137 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 186e58df9e..e426965e66 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) try { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) { "question:\n"; } pipe.finish_chat(); -// } catch (const std::exception& error) { -// try { -// std::cerr << error.what() << '\n'; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; -// } catch (...) { -// try { -// std::cerr << "Non-exception object thrown\n"; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) 
{ + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 1f1e162127..a8ac0f119c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1245,91 +1245,10 @@ ov::InferRequest create_hd_feature_transformer() { auto t67 = make_shared(t66, t60); auto t68 = make_shared(NodeVector{t45, t61, t67, t37}, 0); auto t69 = make_shared(t54, t68, false); - - // t0 = opset.Parameter({'shape': [-1, 576, 1024], 'element_type': 'f32'}, # -> f32[?,576,1024] - // t1 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] - // t2 = opset.Parameter({'shape': [], 'element_type': 'i32'}, # -> i32[] - // t3 = opset.ShapeOf([t0], {'output_type': 'i64'}, # f32[?,576,1024] -> i64[3] - // t4 = opset.Constant(model, 4, # -> i64[](0) - // t5 = opset.Constant(model, 5, # -> i64[](0) - // t6 = opset.Gather([t3, t4, t5], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] - // t7 = opset.Constant(model, 7, # -> i64[1]([1]) - // t8 = opset.Reshape([t6, t7], {'special_zero': False}, # i64[], i64[1] -> i64[1] - // t9 = opset.Constant(model, 9, # -> i64[](1) - // t10 = opset.Constant(model, 10, # -> i64[](0) - // t11 = opset.Gather([t3, t9, t10], {'batch_dims': 0}, # i64[3], i64[], i64[] -> i64[] - // t12 = opset.Convert([t11], {'destination_type': 'f32'}, # i64[] -> f32[] - // t13 = opset.Constant(model, 13, # -> f32[](0.5) - // t14 = opset.Power([t12, t13], {'auto_broadcast': 'numpy'}, # f32[], f32[] -> f32[] - // t15 = opset.Convert([t14], {'destination_type': 'i32'}, # f32[] -> i32[] - // t16 = opset.Convert([t15], {'destination_type': 'i64'}, # i32[] -> i64[] - // t17 = opset.Constant(model, 17, # -> i32[](0) - // t18 = opset.Unsqueeze([t16, t17], {}, # i64[], i32[] -> i64[1] - // t19 = opset.Constant(model, 19, # -> i64[1]([2]) - // t20 = opset.Constant(model, 20, # -> i64[](0) - // t21 = opset.Gather([t3, t19, t20], {'batch_dims': 0}, # i64[3], i64[1], i64[] -> i64[1] - // t22 = opset.Concat([t8, t18, t18, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] - // t23 = opset.Reshape([t0, t22], {'special_zero': False}, # f32[?,576,1024], i64[4] -> f32[?,24,24,1024] - // t24 = opset.Constant(model, 24, # -> i64[](2) - // t25 = opset.Divide([t16, t24], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t26 = opset.Floor([t25], {}, # i64[] -> i64[] - // t27 = opset.Constant(model, 27, # -> i32[](0) - // t28 = opset.Unsqueeze([t26, t27], {}, # i64[], i32[] -> i64[1] - // t29 = opset.Constant(model, 29, # -> i64[1]([2]) - // t30 = opset.Constant(model, 30, # -> i64[1]([2]) - // t31 = opset.Concat([t8, t28, t29, t28, t30, t21], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t32 = opset.Reshape([t23, t31], {'special_zero': False}, # f32[?,24,24,1024], i64[6] -> f32[?,12,2,12,2,1024] - // t33 = opset.Constant(model, 33, - // t34 = opset.Transpose([t32, t33], {}, # f32[?,12,2,12,2,1024], i64[6] -> f32[?,12,12,2,2,1024] - // t35 = opset.Constant(model, 35, # -> i64[1]([-1]) - // t36 = opset.Constant(model, 36, # -> i64[1]([4]) - // t37 = opset.Multiply([t21, t36], {'auto_broadcast': 'numpy'}, # i64[1], i64[1] -> i64[1] - // t38 = opset.Concat([t8, t35, t37], {'axis': 0}, # i64[1], i64[1], i64[1] -> i64[3] - // t39 = opset.Reshape([t34, t38], {'special_zero': False}, # f32[?,12,12,2,2,1024], i64[3] -> 
f32[?,?,4096] - // t40 = opset.Multiply([t1, t2], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t41 = opset.Convert([t40], {'destination_type': 'i64'}, # i32[] -> i64[] - // t42 = opset.Divide([t6, t41], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t43 = opset.Floor([t42], {}, # i64[] -> i64[] - // t44 = opset.Constant(model, 44, # -> i32[](0) - // t45 = opset.Unsqueeze([t43, t44], {}, # i64[], i32[] -> i64[1] - // t46 = opset.Convert([t1], {'destination_type': 'i64'}, # i32[] -> i64[] - // t47 = opset.Unsqueeze([t46, t44], {}, # i64[], i32[] -> i64[1] - // t48 = opset.Convert([t2], {'destination_type': 'i64'}, # i32[] -> i64[] - // t49 = opset.Unsqueeze([t48, t44], {}, # i64[], i32[] -> i64[1] - // t50 = opset.Constant(model, 50, # -> i64[1]([-1]) - // t51 = opset.Concat([t45, t47, t49, t28, t28, t50], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1], i64[1], i64[1] -> i64[6] - // t52 = opset.Reshape([t39, t51], {'special_zero': False}, # f32[?,?,4096], i64[6] -> f32[?,?,?,?,?,?] - // t53 = opset.Constant(model, 53, - // t54 = opset.Transpose([t52, t53], {}, # f32[?,?,?,?,?,?], i64[6] -> f32[?,?,?,?,?,?] - // t55 = opset.Multiply([t1, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t56 = opset.Convert([t55], {'destination_type': 'i64'}, # i32[] -> i64[] - // t57 = opset.Constant(model, 57, # -> i64[](2) - // t58 = opset.Divide([t56, t57], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t59 = opset.Floor([t58], {}, # i64[] -> i64[] - // t60 = opset.Constant(model, 60, # -> i32[](0) - // t61 = opset.Unsqueeze([t59, t60], {}, # i64[], i32[] -> i64[1] - // t62 = opset.Multiply([t2, t15], {'auto_broadcast': 'numpy'}, # i32[], i32[] -> i32[] - // t63 = opset.Convert([t62], {'destination_type': 'i64'}, # i32[] -> i64[] - // t64 = opset.Constant(model, 64, # -> i64[](2) - // t65 = opset.Divide([t63, t64], {'auto_broadcast': 'numpy', 'm_pythondiv': True}, # i64[], i64[] -> i64[] - // t66 = opset.Floor([t65], {}, # i64[] -> i64[] - // t67 = opset.Unsqueeze([t66, t60], {}, # i64[], i32[] -> i64[1] - // t68 = opset.Concat([t45, t61, t67, t37], {'axis': 0}, # i64[1], i64[1], i64[1], i64[1] -> i64[4] - // t69 = opset.Reshape([t54, t68], {'special_zero': False}, # f32[?,?,?,?,?,?], i64[4] -> f32[?,?,?,?] 
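        // In short, the graph assembled above (t0..t69) performs the 2x2 patch merge: CLIP features of
        // shape [N, 576, 1024] are viewed as [N, 24, 24, 1024], merged 2x2 into [N, 144, 4096], and
        // regrouped per image into [num_images, h_crop * 12, w_crop * 12, 4096], where
        // N == num_images * h_crop * w_crop.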
shared_ptr model = make_shared(make_shared(t69), ParameterVector{t0, t1, t2}); - ov::InferRequest hd_feature_transformer = utils::singleton_core().compile_model( + return utils::singleton_core().compile_model( model, "CPU" ).create_infer_request(); - // hd_feature_transformer.set_input_tensor(0, ov::Tensor{f32, {1, 576, 1024}}); - // ov::Tensor h_crop = ov::Tensor{i32, {}}; - // h_crop.data()[0] = 1; - // hd_feature_transformer.set_input_tensor(1, h_crop); - // ov::Tensor w_crop = ov::Tensor{i32, {}}; - // w_crop.data()[0] = 1; - // hd_feature_transformer.set_input_tensor(2, w_crop); - // hd_feature_transformer.infer(); - // std::cout << hd_feature_transformer.get_output_tensor().get_shape() << '\n'; // [1,24,24,4096] - return hd_feature_transformer; } ov::Tensor reshape_hd_patches_2x2merge(const ov::Tensor& image_features, size_t h_crop, size_t w_crop, InferRequest& hd_feature_transformer) { @@ -1458,7 +1377,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; std::vector tokens; - if (m_history.empty()) { + if (!images.empty()) { std::stringstream images_prompt; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); @@ -1467,7 +1386,6 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ++m_image_id; } images_prompt << prompt; - std::string new_templated_chat_history; if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); constexpr bool add_generation_prompt = true; diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 36de524b54..931849916f 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -824,7 +824,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) return padded; } -std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { +std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. 
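    // What follows: the HD-transformed image is also resized to a single 336x336 "global" view,
    // sliced into 336x336 crops, and the global view plus crops are concatenated and padded up to
    // config.phi3_v.num_crops before being passed to the vision encoder as pixel_values.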
ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; @@ -838,8 +838,7 @@ std::tuple get_pixel_values_phi3_v(const ov::Tens ov::Tensor slices = slice_image(hd_image); ov::Tensor concatenated = concatenate_batch(global_image, slices); ov::Tensor pixel_values = pad_to_max_num_crops_tensor(concatenated, config.phi3_v.num_crops); - size_t num_img_tokens = (image_size.height / INPUT_IMAGE_SIZE) * (image_size.width / INPUT_IMAGE_SIZE) * config.phi3_v.num_img_tokens + 1 + (image_size.height / INPUT_IMAGE_SIZE + 1) * size_t(std::sqrt(config.phi3_v.num_img_tokens)); - return {std::move(pixel_values), image_size, num_img_tokens}; + return {std::move(pixel_values), image_size}; } } // namespace phi3_v } // anonymous namespace @@ -955,37 +954,8 @@ EncodedImage VisionEncoder::encode_internvl(const ov::Tensor& image, const Proce } EncodedImage VisionEncoder::encode_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - // TODO: drop num_img_tokens - const auto& [pixel_values, image_size, num_img_tokens] = phi3_v::get_pixel_values_phi3_v(image, config); - // std::cout << pixel_values.data()[3*336*336+0] << '\n'; - // std::cout << pixel_values.data()[3*336*336+1] << '\n'; - // std::cout << pixel_values.data()[3*336*336+100] << '\n'; -// -1.79226 -// -1.74847 -// -1.14993 -// 0.645675 -// 0.660273 -// 1.09823 + const auto& [pixel_values, image_size] = phi3_v::get_pixel_values_phi3_v(image, config); m_vision_encoder.set_input_tensor(pixel_values); m_vision_encoder.infer(); - // std::cout << pixel_values.get_shape() << ' ' << m_vision_encoder.get_output_tensor().get_shape() << '\n'; - // ov::Tensor out = m_vision_encoder.get_output_tensor(); - // std::cout << out.data()[576*1024 + 0] << '\n'; - // std::cout << out.data()[576*1024 + 1] << '\n'; - // std::cout << out.data()[576*1024 + 1025] << '\n'; - // std::cout << out.data()[576*1024 + 4090] << '\n'; - // std::cout << out.data()[576*1024 + 80000] << '\n'; -// [5,3,336,336] [5,576,1024] -// 0.134461 -// -0.867309 -// -0.274503 -// 1.73786 -// 0.13117 -// [5,3,336,336] [5,576,1024] -// -1.01567 -// -0.291421 -// -0.260488 -// 0.743025 -// 1.4099 return {m_vision_encoder.get_output_tensor(), image_size}; } diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 81c181bc54..53f678fc6a 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -9,17 +9,17 @@ from openvino_genai import VLMPipeline, GenerationConfig from common import get_image_by_link, get_beam_search, get_multinomial_all_parameters -def get_ov_model(cache): - model_dir = cache.mkdir("tiny-random-minicpmv-2_6") +def get_ov_model(model_id, cache): + model_dir = cache.mkdir(model_id.split('/')[-1]) if (model_dir / "openvino_language_model.xml").exists(): return model_dir - model_id = "katuni4ka/tiny-random-minicpmv-2_6" processor = transformers.AutoProcessor.from_pretrained(model_id, trust_remote_code=True) processor.tokenizer.save_pretrained(model_dir) ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(processor.tokenizer, with_detokenizer=True) openvino.save_model(ov_tokenizer, model_dir / "openvino_tokenizer.xml") openvino.save_model(ov_detokenizer, model_dir / "openvino_detokenizer.xml") model = OVModelForVisualCausalLM.from_pretrained(model_id, compile=False, device="CPU", export=True, trust_remote_code=True) + 
processor.chat_template = processor.tokenizer.chat_template # It seems that tiny-random-phi3-vision is saved incorrectly. That line works this around. processor.save_pretrained(model_dir) model.save_pretrained(model_dir) return model_dir @@ -44,12 +44,16 @@ def get_ov_model(cache): @pytest.mark.precommit @pytest.mark.nightly -def test_vlm_pipeline(cache): +@pytest.mark.parametrize("model_id", [ + "katuni4ka/tiny-random-minicpmv-2_6", + "katuni4ka/tiny-random-phi3-vision", +]) +def test_vlm_pipeline(model_id, cache): def streamer(word: str) -> bool: return False - models_path = get_ov_model(cache) - generation_config = GenerationConfig(max_new_tokens=30) + models_path = get_ov_model(model_id, cache) + generation_config = GenerationConfig(max_new_tokens=100) for links in image_links_for_testing: images = [] @@ -70,7 +74,7 @@ def streamer(word: str) -> bool: @pytest.mark.precommit @pytest.mark.nightly def test_vlm_get_tokenizer(cache): - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) pipe = VLMPipeline(models_path, "CPU") tokenizer = pipe.get_tokenizer() tokenizer.encode("") @@ -83,15 +87,16 @@ def test_vlm_get_tokenizer(cache): get_multinomial_all_parameters(), ]) def test_sampling(config, cache): - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) image = get_image_by_link(image_links[0]) pipe = VLMPipeline(models_path, "CPU") pipe.generate(prompts[0], image=image, generation_config=config) @pytest.mark.precommit +@pytest.mark.nightly def test_perf_metrics(cache): import numpy as np - models_path = get_ov_model(cache) + models_path = get_ov_model("katuni4ka/tiny-random-minicpmv-2_6", cache) images = [get_image_by_link(image_links[0])] From 6e24a25c8ab094f1b583fe7b06f2884bff20e6e4 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 16:07:19 +0400 Subject: [PATCH 14/28] clean up --- SUPPORTED_MODELS.md | 1 + src/cpp/src/visual_language/vision_encoder.cpp | 3 +-- tests/python_tests/test_vlm_pipeline.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md index 71f382d529..9f404f7a1c 100644 --- a/SUPPORTED_MODELS.md +++ b/SUPPORTED_MODELS.md @@ -361,6 +361,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize Phi3VForCausalLM phi3_v + Not supported
  • microsoft/Phi-3-vision-128k-instruct
  • diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 931849916f..c5ca2b1025 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -651,7 +651,6 @@ constexpr size_t INPUT_IMAGE_SIZE = 336; ov::Tensor padding_336(const ov::Tensor& unpadded) { ov::Shape _1ss3 = unpadded.get_shape(); size_t s1 = _1ss3.at(1), s2 = _1ss3.at(2); - // TODO: test horizontal and vertical images if (s1 < s2) { size_t tar = size_t(std::ceil(float(s1) / INPUT_IMAGE_SIZE) * INPUT_IMAGE_SIZE); size_t top_padding = (tar - s1) / 2; @@ -825,7 +824,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) } std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { - ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); // TODO: this is just resize_and_pad_image() from clip.hpp. + ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; clip_image_u8 dst; diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 52345cbd06..3867806fe4 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -53,7 +53,7 @@ def streamer(word: str) -> bool: return False models_path = get_ov_model(model_id, cache) - generation_config = GenerationConfig(max_new_tokens=100) + generation_config = GenerationConfig(max_new_tokens=30) for links in image_links_for_testing: images = [] From 3fd78e461290d88fecaa3413079ed967a422ac16 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 16:24:06 +0400 Subject: [PATCH 15/28] fix compilation --- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- src/cpp/src/visual_language/vision_encoder.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index a8ac0f119c..8755f3683c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1371,7 +1371,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ): IInputsEmbedder(vlm_config, model_dir, device, device_config), m_image_id{0}, m_hd_feature_transformer{phi3_v::create_hd_feature_transformer()}, - m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device).create_infer_request()} {} + m_vision_projection{utils::singleton_core().compile_model(model_dir / "openvino_vision_projection_model.xml", device, {}).create_infer_request()} {} ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index c5ca2b1025..56165e392c 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -691,11 +691,11 @@ ov::Tensor HD_transform(const ov::Tensor& uint8, size_t num_crops) { clip_image_u8 src{}, dst{}; uint8_t* uint8_data = uint8.data(); if (trans) { - src = 
clip_image_u8{height, width, {uint8_data, uint8_data + uint8.get_size()}}; + src = clip_image_u8{int(height), int(width), {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_h, new_w); return padding_336(ov::Tensor{ov::element::u8, {1, new_w, new_h, 3}, dst.buf.data()}); } - src = clip_image_u8{width, height, {uint8_data, uint8_data + uint8.get_size()}}; + src = clip_image_u8{int(width), int(height), {uint8_data, uint8_data + uint8.get_size()}}; bilinear_resize(src, dst, new_w, new_h); return padding_336(ov::Tensor{ov::element::u8, {1, new_h, new_w, 3}, dst.buf.data()}); } @@ -826,7 +826,7 @@ ov::Tensor pad_to_max_num_crops_tensor(const ov::Tensor& nchw, size_t max_crops) std::tuple get_pixel_values_phi3_v(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor hd_image = HD_transform(image, config.phi3_v.num_crops); ImageSize image_size{hd_image.get_shape().at(2), hd_image.get_shape().at(1)}; - clip_image_u8 img{hd_image.get_shape().at(2), hd_image.get_shape().at(1), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; + clip_image_u8 img{int(hd_image.get_shape().at(2)), int(hd_image.get_shape().at(1)), {hd_image.data(), hd_image.data() + hd_image.get_size()}}; clip_image_u8 dst; bicubic_resize(img, dst, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE); ov::Tensor global_image{ov::element::u8, {1, INPUT_IMAGE_SIZE, INPUT_IMAGE_SIZE, 3}, dst.buf.data()}; From 91b170fa9f34c9556f9c0169c1b51e6c0641e53b Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Thu, 9 Jan 2025 17:39:43 +0400 Subject: [PATCH 16/28] fix prefix --- .../src/visual_language/inputs_embedder.cpp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 8755f3683c..bf51abe3f9 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { ov::Tensor encoded_input_ids; if (m_is_chat_conversation) { // KV cache in model already contains prompts and answers from previous iterations. 
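            // Note on the new add_special_tokens_for_chat flag: the default (false) keeps the previous
            // behaviour for all models, while the Phi-3.5-vision path below passes true so that the
            // tokenizer's special prefix tokens are kept when re-encoding the chat history - presumably
            // the prefix issue this patch addresses.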
@@ -165,8 +165,8 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)); + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)); // some symbols combinations can be encoded by the tokenizer in different ways // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history @@ -1349,7 +1349,8 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token if (iter->str().empty()) { continue; } - tokenized.push_back(tokenizer.encode(*iter).input_ids); + std::string substr = *iter; + tokenized.push_back(tokenizer.encode(substr, ov::genai::add_special_tokens(true)).input_ids); } return tokenized; } @@ -1377,7 +1378,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; std::vector tokens; - if (!images.empty()) { + if (m_history.empty()) { std::stringstream images_prompt; for (const ov::Tensor& image : to_single_image_tensors(images)) { EncodedImage encoded_image = m_vision_encoder.encode(image); @@ -1394,17 +1395,18 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { m_templated_chat_history = images_prompt.str(); } auto start_tokenizer_time = std::chrono::steady_clock::now(); + ov::Tensor unmodified_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(true)).input_ids; tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); if (m_is_chat_conversation) { - for (const ov::Tensor& chunk : tokens) { - m_tokenized_history.insert(m_tokenized_history.end(), chunk.data(), chunk.data() + chunk.get_size()); - } + m_tokenized_history = std::vector{unmodified_tokens.data(), unmodified_tokens.data() + unmodified_tokens.get_size()}; } } else { - tokens = {get_encoded_input_ids(prompt, metrics)}; + constexpr char ignored[] = ""; + constexpr bool add_special_tokens = true; + tokens = {get_encoded_input_ids(prompt, metrics, ignored, add_special_tokens)}; } OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); size_t features_length = 0; From 793e4c828feee5e1c2e7adcf2cd0ba7c2a6b8d20 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 10 Jan 2025 14:32:16 +0400 Subject: [PATCH 17/28] Add instructions to reproduce --- .../src/visual_language/inputs_embedder.cpp | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index bf51abe3f9..bae27533a3 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1170,6 
+1170,39 @@ namespace phi3_v { // .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 // .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 // ) +// Obtained in the following way +// import torch +// import openvino as ov +// import numpy as np +// class Model(torch.nn.Module): +// def forward(self, image_features, h_crop, w_crop): +// """ +// image_features: (num_images*num_crops, 24*24, 1024) +// output: (num_images, h_crop*12, w_crop*12, 4096), h_crop*w_crop == num_crops +// """ +// N, L, C = image_features.shape +// num_images = N // (h_crop * w_crop) +// H = (torch.tensor(L, dtype=torch.float32)**0.5).int() +// image_features_hd = ( +// image_features.reshape(N, H, H, C) # N, 24, 24, 1024 +// .reshape(N, H // 2, 2, H // 2, 2, C) # N, 12, 2, 12, 2, 1024 +// .permute(0, 1, 3, 2, 4, 5) # N, 12, 12, 2, 2, 1024 +// .reshape(N, -1, 4 * C) # N, 144, 4096 +// .reshape(num_images, h_crop, w_crop, H // 2, H // 2, -1) # n_img, h_crop, w_crop, 12, 12, 4096 +// .permute(0, 1, 3, 2, 4, 5) # n_img, h_crop, 12, w_crop, 12, 4096 +// .reshape(num_images, h_crop * H // 2, w_crop * H // 2, 4 * C) # n_img, h_crop*12, w_crop*12, 4096 +// return {"o": image_features_hd} +// model = Model() +// example_input = {"image_features": torch.rand((4, 576, 1024), dtype=torch.float32), "h_crop": torch.tensor(2, dtype=torch.int32), "w_crop": torch.tensor(2, dtype=torch.int32)} +// ov_model = ov.convert_model(model, example_input=example_input, input=ov.PartialShape([-1, 576, 1024])) +// # ov_model.outputs[0].get_tensor().set_names({"out"}) +// ov.save_model(ov_model, "reshape_hd_patches_2x2merge.xml") +// inp = np.arange(4 * 576 * 1024).reshape([4, 576, 1024]) +// test = ov.Core().compile_model(ov_model, "CPU") +// print(ov_model) +// print(test([inp, 2, 2])["o"].flatten()) +// 2. Run https://github.com/slyalin/openvino_devtools/blob/bcd4a51b1354b24b2316ac3e1c77b2f87ae7a497/openvino_devtools/ov2py.py with the IR. +// 3. Translate the printed Python implementation to C++. ov::InferRequest create_hd_feature_transformer() { using namespace ov; using namespace element; From bab2d46d751df925c562b6373a56e0f6e0e9b89c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 14 Jan 2025 17:38:25 +0400 Subject: [PATCH 18/28] Split get_encoded_input_ids --- .../src/visual_language/inputs_embedder.cpp | 57 ++++++++++++------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 5639f46537..448155d7a5 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,17 +144,8 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = false) { - ov::Tensor encoded_input_ids; + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {} bool add_special_tokens_for_chat = false) { if (m_is_chat_conversation) { - // KV cache in model already contains prompts and answers from previous iterations. - // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns - // token_ids = {, ...}. So if tokenizer applies only to the new prompt, - // will be inserted on every iteration. 
- // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; std::string new_templated_chat_history; @@ -166,7 +157,31 @@ class InputsEmbedder::IInputsEmbedder { } auto start_tokenizer_time = std::chrono::steady_clock::now(); ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; - TokenizedInputs prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)); + ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + m_templated_chat_history = std::move(new_templated_chat_history); + return {new_chat_tokens, prev_chat_tokens}; + } else { + auto start_tokenizer_time = std::chrono::steady_clock::now(); + ov::Tensor encoded_input_ids = m_tokenizer.encode(prompt).input_ids; + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); + return {encoded_input_ids, ov::Tensor()}; + } + } + + ov::Tensor update_history(const ov::Tensor& new_chat_tokens, const ov::Tensor& prev_chat_tokens) { + if (m_is_chat_conversation) { + ov::Tensor encoded_input_ids; + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. 
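            // Net effect in the common case: only the tail of new_chat_tokens that extends past the
            // trusted/previous history is returned (optionally prefixed with the last disappeared
            // token), so just the newest turn gets embedded and appended to the KV cache.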
// some symbols combinations can be encoded by the tokenizer in different ways // if we met sequence with such combination of symbols, we cannot correctly subtract the new history from the old history @@ -174,7 +189,7 @@ class InputsEmbedder::IInputsEmbedder { size_t trusted_history_length = 0; if (!m_tokenized_history.empty()) { std::set stop_tokens = {m_tokenizer.get_eos_token_id()}; - trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens.input_ids, m_tokenized_history, stop_tokens); + trusted_history_length = ov::genai::utils::get_first_history_difference(prev_chat_tokens, m_tokenized_history, stop_tokens); } if (m_tokenized_history.empty()) { @@ -200,27 +215,25 @@ class InputsEmbedder::IInputsEmbedder { new_tensor.copy_to(encoded_input_ids); } else { encoded_input_ids = utils::subtract_chat_tokenized_inputs( - {new_chat_tokens}, prev_chat_tokens + {new_chat_tokens}, {prev_chat_tokens} ).input_ids; if (m_last_disappeared_token.has_value()) encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } - auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - m_templated_chat_history = std::move(new_templated_chat_history); m_tokenized_history.clear(); std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + return encoded_input_ids; } else { - auto start_tokenizer_time = std::chrono::steady_clock::now(); - encoded_input_ids = m_tokenizer.encode(prompt).input_ids; - auto end_tokenizer_time = std::chrono::steady_clock::now(); - metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_tokenized_history.clear(); - std::copy_n(encoded_input_ids.data(), encoded_input_ids.get_size(), std::back_inserter(m_tokenized_history)); + std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); + return new_chat_tokens; } + } - return encoded_input_ids; + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { + const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback, add_special_tokens_for_chat); + return update_history(new_chat_tokens, prev_chat_tokens); } /** From b3ca05aeec0e8c9f88dcd14844b87e130b30019c Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 14 Jan 2025 18:12:12 +0400 Subject: [PATCH 19/28] sintax --- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 448155d7a5..4e417ddb34 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {} bool add_special_tokens_for_chat = false) { + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = 
false) { if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; From 02d36b52a94e4ca12a2d0981d7f17df14358581b Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 15 Jan 2025 14:38:40 +0400 Subject: [PATCH 20/28] Phi-3.5-vision-instruc history --- .../visual_language_chat.cpp | 22 ++-- .../src/visual_language/inputs_embedder.cpp | 103 +++++++++++++----- 2 files changed, 87 insertions(+), 38 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index e426965e66..186e58df9e 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) try { +int main(int argc, char* argv[]) { if (3 != argc) { throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) try { "question:\n"; } pipe.finish_chat(); -} catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; -} catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; +// } catch (const std::exception& error) { +// try { +// std::cerr << error.what() << '\n'; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; +// } catch (...) { +// try { +// std::cerr << "Non-exception object thrown\n"; +// } catch (const std::ios_base::failure&) {} +// return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 4e417ddb34..d52490c1b6 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1400,6 +1400,53 @@ std::vector split_tokenize(const std::string& text, ov::genai::Token } return tokenized; } + +ov::Tensor insert_image_placeholders(const std::vector& chunks, size_t tokens_per_image) { + size_t merged_length = 0; + for (const ov::Tensor& chunk : chunks) { + merged_length += chunk.get_shape().at(1); + } + merged_length += chunks.empty() ? 
0 : (chunks.size() - 1) * tokens_per_image; + ov::Tensor merged{ov::element::i64, {1, merged_length}}; + size_t offset = 0; + int64_t image_id = -1; + for (const ov::Tensor& chunk : chunks) { + size_t length = chunk.get_shape().at(1); + std::copy_n( + chunk.data(), + length, + merged.data() + offset + ); + offset += length; + if (offset < merged_length) { + std::fill_n( + merged.data() + offset, + tokens_per_image, + image_id + ); + offset += tokens_per_image; + --image_id; + } + } + return merged; +} + +std::vector drop_image_placeholders(const ov::Tensor& tokens) { + std::vector chunks; + size_t offset = 0; + while (offset < tokens.get_shape().at(1)) { + size_t length = 0; + while (offset + length < tokens.get_shape().at(1) && tokens.data()[offset + length] >= 0) { + ++length; + } + chunks.emplace_back(ov::element::i64, ov::Shape{1, length}, tokens.data() + offset); + offset += length; + while (offset < tokens.get_shape().at(1) && tokens.data()[offset] < 0) { + ++offset; + } + } + return chunks; +} } } @@ -1423,42 +1470,44 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics) override { OPENVINO_ASSERT(images.empty() || m_history.empty(), "Images can only be provided for initial prompt"); std::vector images_features_proj; - std::vector tokens; - if (m_history.empty()) { - std::stringstream images_prompt; - for (const ov::Tensor& image : to_single_image_tensors(images)) { - EncodedImage encoded_image = m_vision_encoder.encode(image); - images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); - images_prompt << "<|image_" << m_image_id << "|>\n"; - ++m_image_id; - } - images_prompt << prompt; - if (m_is_chat_conversation) { - m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); - constexpr bool add_generation_prompt = true; - m_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); - } else { - m_templated_chat_history = images_prompt.str(); - } + std::stringstream images_prompt; + for (const ov::Tensor& image : to_single_image_tensors(images)) { + EncodedImage encoded_image = m_vision_encoder.encode(image); + images_features_proj.push_back(phi3_v::hd_feature_transform(encoded_image, m_hd_feature_transformer, m_vlm_config.sub_GN, m_vlm_config.glb_GN, m_vision_projection)); + images_prompt << "<|image_" << m_image_id << "|>\n"; + ++m_image_id; + } + images_prompt << prompt; + std::vector new_chat_tokens; + std::vector prev_chat_tokens; + if (m_is_chat_conversation) { + m_history.push_back({{"role", "user"}, {"content", images_prompt.str()}}); + constexpr bool add_generation_prompt = true; + std::string new_templated_chat_history; + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor unmodified_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(true)).input_ids; - tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); - + new_chat_tokens = phi3_v::split_tokenize(new_templated_chat_history, m_tokenizer); + prev_chat_tokens = phi3_v::split_tokenize(m_templated_chat_history, m_tokenizer); auto end_tokenizer_time = std::chrono::steady_clock::now(); 
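            // (The placeholder round trip used below: split_tokenize() yields one token chunk per text
            // segment around each <|image_i|> tag, insert_image_placeholders() glues the chunks together
            // with tokens_per_image negative ids per image - e.g. chunks {A, B} with tokens_per_image
            // == 2 become [A, -1, -1, B] - so the regular history-diff logic can run on a single tensor,
            // and drop_image_placeholders() then splits the result back into per-chunk tensors.)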
metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); - if (m_is_chat_conversation) { - m_tokenized_history = std::vector{unmodified_tokens.data(), unmodified_tokens.data() + unmodified_tokens.get_size()}; - } + m_templated_chat_history = std::move(new_templated_chat_history); } else { - constexpr char ignored[] = ""; - constexpr bool add_special_tokens = true; - tokens = {get_encoded_input_ids(prompt, metrics, ignored, add_special_tokens)}; + auto start_tokenizer_time = std::chrono::steady_clock::now(); + new_chat_tokens = phi3_v::split_tokenize(images_prompt.str(), m_tokenizer); + auto end_tokenizer_time = std::chrono::steady_clock::now(); + metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); } - OPENVINO_ASSERT(tokens.size() - 1 == images_features_proj.size()); + size_t tokens_per_image = images_features_proj.empty() ? 0 : images_features_proj.at(0).get_shape().at(1); + ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, tokens_per_image); + ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, tokens_per_image); + ov::Tensor new_tokens = update_history(new_merged_tokens, prev_merged_tokens); + std::vector tokens = phi3_v::drop_image_placeholders(new_tokens); + OPENVINO_ASSERT(tokens.size() == images_features_proj.size() + 1); size_t features_length = 0; for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { size_t text_length = tokens.at(im_id).get_shape().at(1); size_t im_length = images_features_proj.at(im_id).get_shape().at(1); + OPENVINO_ASSERT(im_length == tokens_per_image); features_length += text_length + im_length; } features_length += tokens.back().get_shape().at(1); From 9336dac8d348a413f54eca82888062810e05dedf Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Fri, 17 Jan 2025 12:04:43 +0400 Subject: [PATCH 21/28] Save tokens per image --- src/cpp/src/visual_language/inputs_embedder.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d52490c1b6..4f788a6e73 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1456,6 +1456,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { ov::InferRequest m_vision_projection; // Used to insert <|image_i|>\n per image (not a slice). size_t m_image_id = 1; + size_t m_tokens_per_image = 0; InputsEmbedderPhi3V( const VLMConfig& vlm_config, @@ -1497,9 +1498,11 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); } - size_t tokens_per_image = images_features_proj.empty() ? 
0 : images_features_proj.at(0).get_shape().at(1); - ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, tokens_per_image); - ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, tokens_per_image); + if (0 == m_tokens_per_image && !images_features_proj.empty()) { + m_tokens_per_image = images_features_proj.at(0).get_shape().at(1); + } + ov::Tensor new_merged_tokens = phi3_v::insert_image_placeholders(new_chat_tokens, m_tokens_per_image); + ov::Tensor prev_merged_tokens = phi3_v::insert_image_placeholders(prev_chat_tokens, m_tokens_per_image); ov::Tensor new_tokens = update_history(new_merged_tokens, prev_merged_tokens); std::vector tokens = phi3_v::drop_image_placeholders(new_tokens); OPENVINO_ASSERT(tokens.size() == images_features_proj.size() + 1); @@ -1507,7 +1510,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { for (size_t im_id = 0; im_id < images_features_proj.size(); ++im_id) { size_t text_length = tokens.at(im_id).get_shape().at(1); size_t im_length = images_features_proj.at(im_id).get_shape().at(1); - OPENVINO_ASSERT(im_length == tokens_per_image); + OPENVINO_ASSERT(im_length == m_tokens_per_image); features_length += text_length + im_length; } features_length += tokens.back().get_shape().at(1); From 0652749d0dd120dc5696d7f725feaa8d66e1d95e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 11:20:18 +0400 Subject: [PATCH 22/28] Resolve merge conflict --- src/cpp/src/visual_language/inputs_embedder.cpp | 4 ---- thirdparty/openvino_tokenizers | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index c7ecba38cb..24e932c0fd 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -221,10 +221,6 @@ class InputsEmbedder::IInputsEmbedder { if (m_last_disappeared_token.has_value()) encoded_input_ids = ov::genai::utils::push_front_inputs(encoded_input_ids, *m_last_disappeared_token); } -<<<<<<< HEAD -======= - m_templated_chat_history = std::move(new_templated_chat_history); ->>>>>>> phi-3.5-vision-instruct m_tokenized_history.clear(); std::copy_n(new_chat_tokens.data(), new_chat_tokens.get_size(), std::back_inserter(m_tokenized_history)); return encoded_input_ids; diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 708712d84d..d5f0abf827 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 708712d84d3201f816c5e44532c9e1b14e4d8be8 +Subproject commit d5f0abf8271f3cd8fc98d747b3e569fbeacca532 From d546486a234729edd33da607975f79554e49b2d6 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 13:45:28 +0400 Subject: [PATCH 23/28] clean up --- .../visual_language_chat.cpp | 22 +++++++++---------- .../src/visual_language/inputs_embedder.cpp | 10 ++++----- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp index 186e58df9e..e426965e66 100644 --- a/samples/cpp/visual_language_chat/visual_language_chat.cpp +++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp @@ -9,7 +9,7 @@ bool print_subword(std::string&& subword) { return !(std::cout << subword << std::flush); } -int main(int argc, char* argv[]) { +int main(int argc, char* argv[]) try { if (3 != argc) { throw 
std::runtime_error(std::string{"Usage "} + argv[0] + " "); } @@ -48,14 +48,14 @@ int main(int argc, char* argv[]) { "question:\n"; } pipe.finish_chat(); -// } catch (const std::exception& error) { -// try { -// std::cerr << error.what() << '\n'; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; -// } catch (...) { -// try { -// std::cerr << "Non-exception object thrown\n"; -// } catch (const std::ios_base::failure&) {} -// return EXIT_FAILURE; +} catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; +} catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 24e932c0fd..eddb0eaa95 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -144,7 +144,7 @@ class InputsEmbedder::IInputsEmbedder { ), m_tokenizer(tokenizer) { } - std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}, bool add_special_tokens_for_chat = false) { + std::pair apply_chat_template_tokenize(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = {}) { if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", prompt}}); constexpr bool add_generation_prompt = true; @@ -156,8 +156,8 @@ class InputsEmbedder::IInputsEmbedder { new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt, chat_template_fallback); } auto start_tokenizer_time = std::chrono::steady_clock::now(); - ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; - ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(add_special_tokens_for_chat)).input_ids; + ov::Tensor new_chat_tokens = m_tokenizer.encode(new_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor prev_chat_tokens = m_tokenizer.encode(m_templated_chat_history, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); metrics.raw_metrics.tokenization_durations.emplace_back(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); m_templated_chat_history = std::move(new_templated_chat_history); @@ -231,8 +231,8 @@ class InputsEmbedder::IInputsEmbedder { } } - ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "", bool add_special_tokens_for_chat = false) { - const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback, add_special_tokens_for_chat); + ov::Tensor get_encoded_input_ids(const std::string& prompt, ov::genai::VLMPerfMetrics& metrics, const std::string& chat_template_fallback = "") { + const auto [new_chat_tokens, prev_chat_tokens] = apply_chat_template_tokenize(prompt, metrics, chat_template_fallback); return update_history(new_chat_tokens, prev_chat_tokens); } From 98f73e23a7ddb04b80e74b75257167a05e0ad30e Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Mon, 20 Jan 2025 14:47:04 +0400 Subject: [PATCH 24/28] clean up --- 
src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index d36b16b6f2..66b17e5804 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -1454,7 +1454,7 @@ std::vector drop_image_placeholders(const ov::Tensor& tokens) { return chunks; } } // namespace phi3_v -} +} // anonymous namespace class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { public: From a5c63dd5f1ad87e66238152c6ac9b021e203eb43 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Tue, 21 Jan 2025 11:48:56 +0400 Subject: [PATCH 25/28] Remove comment --- src/cpp/src/visual_language/vision_encoder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/vision_encoder.cpp b/src/cpp/src/visual_language/vision_encoder.cpp index 520262f3f8..04ddd63145 100644 --- a/src/cpp/src/visual_language/vision_encoder.cpp +++ b/src/cpp/src/visual_language/vision_encoder.cpp @@ -992,7 +992,7 @@ ov::Tensor transpose_image_patches_qwen2vl(const ov::Tensor& reshaped_patches) { return transposed_patches; } -} // anonymous namespace +} VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config) : model_type(model_type) { From 0d08310ca1cc021fa984ef1207a77c9b6dc1f462 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 22 Jan 2025 19:03:10 +0400 Subject: [PATCH 26/28] Freeze mac OV_BRANCH --- .github/workflows/mac.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index f377d3e6a5..ecc73cc369 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -17,7 +17,7 @@ concurrency: env: PYTHON_VERSION: '3.10' - OV_BRANCH: 'master' + OV_BRANCH: 7f56fcd4658c6a427111ac835e809ddd87f0cad2 OV_TARBALL: '' jobs: From 066d972f2fcb19a5657d30443347c6a84ac0c292 Mon Sep 17 00:00:00 2001 From: Vladimir Zlobin Date: Wed, 22 Jan 2025 20:05:29 +0400 Subject: [PATCH 27/28] Add notes --- SUPPORTED_MODELS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md index a13d1f6d7c..82b43a2fa1 100644 --- a/SUPPORTED_MODELS.md +++ b/SUPPORTED_MODELS.md @@ -312,6 +312,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize Models LoRA support Example HuggingFace Models + Notes InternVL2 @@ -329,6 +330,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • OpenGVLab/InternVL2_5-8B
+
LLaVA
@@ -339,6 +341,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • llava-hf/llava-1.5-7b-hf
  • +
LLaVA-NeXT
@@ -351,6 +354,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • llava-hf/llama3-llava-next-8b-hf
  • +
MiniCPMV
@@ -361,6 +365,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • openbmb/MiniCPM-V-2_6
  • +
Phi3VForCausalLM
@@ -372,6 +377,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • microsoft/Phi-3.5-vision-instruct
  • + GPU is not supported
Qwen2-VL
@@ -383,6 +389,7 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • Qwen/Qwen2-VL-7B-Instruct
  • +

From 03a29fc504668f84096e13ea04d5a04d2a41041c Mon Sep 17 00:00:00 2001
From: Vladimir Zlobin
Date: Thu, 23 Jan 2025 01:13:42 +0400
Subject: [PATCH 28/28] Extend notes

---
 SUPPORTED_MODELS.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/SUPPORTED_MODELS.md b/SUPPORTED_MODELS.md
index 82b43a2fa1..3064fb58c1 100644
--- a/SUPPORTED_MODELS.md
+++ b/SUPPORTED_MODELS.md
@@ -377,7 +377,10 @@ In addition to image generation models, `InpaintingPipeline` supports specialize
  • microsoft/Phi-3.5-vision-instruct
  • - GPU is not supported + +
  • GPU isn't supported
  • +
  • These models' configs aren't consistent. It's required to override the default eos_token_id with the one from a tokenizer: generation_config.set_eos_token_id(pipe.get_tokenizer().get_eos_token_id()).
  • +
Qwen2-VL