From 841fae001ffa125b7dcd851ee3906485fe00fd6a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 1 Aug 2024 11:37:17 +0200 Subject: [PATCH 1/7] Set seq len axis based on model type --- .../prompt_lookup_decoding_lm/CMakeLists.txt | 15 +++- .../prompt_lookup_decoding_lm.cpp | 55 ++++++++++--- .../speculative_decoding_lm/CMakeLists.txt | 17 +++- .../speculative_decoding_lm.cpp | 80 ++++++++++++++----- 4 files changed, 132 insertions(+), 35 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 087c95bfc4..a504a4a7cb 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -6,16 +6,29 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) + add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) + +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) + set_target_properties(prompt_lookup_decoding_lm PROPERTIES COMPILE_PDB_NAME prompt_lookup_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) + target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) + install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 51ac654aac..88ea135b36 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,17 +1,47 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include +#include +#include #include #include +#include namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], -// threfore usually SEQ_LEN_AXIS = 2 -constexpr size_t SEQ_LEN_AXIS = 2; + +size_t get_seq_len_axis_from_config(const std::string model_dir) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 + constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; + + std::ifstream f(model_dir + "/config.json"); + + if (!f.is_open()) { + return DEFAILT_SEQ_LEN_AXIS; + } + + nlohmann::json data = nlohmann::json::parse(f); + + if (!data.contains("model_type")) { + return DEFAILT_SEQ_LEN_AXIS; + } + + const std::string model_type = data["model_type"].get(); + + const std::map model_type_to_seq_len_axis{ + {"chatglm", 0}, + {"llama", 2}, + }; + + if (!model_type_to_seq_len_axis.count(model_type)) { + return 
DEFAILT_SEQ_LEN_AXIS; + } + + return model_type_to_seq_len_axis.at(model_type); +} std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -58,7 +88,7 @@ struct TextStreamer { void end() { std::string text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +105,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +113,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor; @@ -228,6 +257,8 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + const size_t seq_len_axis = get_seq_len_axis_from_config(model_dir); + // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. int max_sequence_length = 100; @@ -288,7 +319,7 @@ int main(int argc, char* argv[]) try { // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
seq_len += accepted_tokens_number; - update_kv_cache(model, SEQ_LEN_AXIS, seq_len); + update_kv_cache(model, seq_len_axis, seq_len); first_token = out_token; } diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index b30905bdb9..9a6a145d09 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -6,16 +6,29 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) + add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) + +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) + set_target_properties(speculative_decoding_lm PROPERTIES COMPILE_PDB_NAME speculative_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") + + target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) + install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 4927b7d795..bf1750c40b 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -2,17 +2,47 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include +#include #include #include #include +namespace { + constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], -// threfore usually SEQ_LEN_AXIS = 2 -constexpr size_t SEQ_LEN_AXIS = 2; +size_t get_seq_len_axis_from_config(const std::string model_dir) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 + constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; + + std::ifstream f(model_dir + "/config.json"); + + if (!f.is_open()) { + return DEFAILT_SEQ_LEN_AXIS; + } + + nlohmann::json data = nlohmann::json::parse(f); + + if (!data.contains("model_type")) { + return DEFAILT_SEQ_LEN_AXIS; + } + + const std::string model_type = data["model_type"].get(); + + const std::map model_type_to_seq_len_axis{ + {"chatglm", 0}, + {"llama", 2}, + }; + + if (!model_type_to_seq_len_axis.count(model_type)) { + return DEFAILT_SEQ_LEN_AXIS; + } + + return model_type_to_seq_len_axis.at(model_type); +} -namespace { std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); tokenizer.infer(); @@ -58,7 +88,7 @@ struct TextStreamer { void end() { std::string 
text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +105,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +113,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor; @@ -114,14 +143,19 @@ class AssistedCandidateGenerator { size_t max_seq_length; size_t num_pred_tokens = 5; const size_t max_pred_tokens = 10; + const size_t seq_len_axis; int64_t out_of_kv_cache_token = -1; size_t draft_model_seq_length = 0; public: - AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) + AssistedCandidateGenerator(ov::InferRequest draft_model, + const size_t max_seq_length, + const size_t num_pred_tokens, + const size_t seq_len_axis) : draft_model{draft_model}, max_seq_length{max_seq_length}, - num_pred_tokens{num_pred_tokens} {}; + num_pred_tokens{num_pred_tokens}, + seq_len_axis{seq_len_axis} {}; int64_t generate_next_token(const std::vector tokens) { size_t tokens_size = tokens.size(); @@ -199,7 +233,7 @@ class AssistedCandidateGenerator { } out_of_kv_cache_token = -1; - ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); + ::update_kv_cache(draft_model, seq_len_axis, seq_length); draft_model_seq_length = seq_length; } }; @@ -224,27 +258,32 @@ int main(int argc, char* argv[]) try { // tokenizer model ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); + + const std::string draft_model_dir = std::string{argv[1]}; + + auto tokenizer_model = core.read_model(draft_model_dir + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); ov::InferRequest detokenizer = - core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + core.compile_model(draft_model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; // draft model (which is smaller, less accurate but faster) ov::InferRequest draft_model = - core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(draft_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); uint64_t seq_len = input_ids.get_shape()[1]; + const std::string main_model_dir = std::string{argv[2]}; // main model (which is bigger, more accurate but slower) 
ov::InferRequest main_model = - core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(main_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); size_t max_sequence_length = 100; - AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; + const size_t draft_model_seq_len_axis = get_seq_len_axis_from_config(draft_model_dir); + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; main_model.set_tensor("input_ids", input_ids); main_model.set_tensor("attention_mask", attention_mask); @@ -275,6 +314,7 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + const size_t main_model_seq_len_axis = get_seq_len_axis_from_config(main_model_dir); /* Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these @@ -347,7 +387,7 @@ int main(int argc, char* argv[]) try { } candidateGenerator.update_kv_cache(seq_len); - update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); + update_kv_cache(main_model, main_model_seq_len_axis, seq_len); candidates.clear(); } From 06c7a9156ead9b92b4bfed7baeb1825dabfc0545 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 1 Aug 2024 12:02:14 +0200 Subject: [PATCH 2/7] Rename function --- .../prompt_lookup_decoding_lm.cpp | 7 +++++-- .../speculative_decoding_lm/speculative_decoding_lm.cpp | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 88ea135b36..ca30b51640 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -12,7 +12,10 @@ namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; -size_t get_seq_len_axis_from_config(const std::string model_dir) { +size_t get_seq_len_axis(const std::string model_dir) { + // get sequence length axis based on config.json model_type + // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; @@ -257,7 +260,7 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t seq_len_axis = get_seq_len_axis_from_config(model_dir); + const size_t seq_len_axis = get_seq_len_axis(model_dir); // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. 
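The get_seq_len_axis() helper renamed above (introduced in PATCH 1) picks the KV-cache sequence-length axis from the model's config.json. For reference, a self-contained sketch of that lookup, assuming nlohmann/json and the chatglm -> 0 / default -> 2 mapping used in these samples; the names resolve_seq_len_axis and default_axis are illustrative and not part of the change.

#include <fstream>
#include <map>
#include <string>
#include <nlohmann/json.hpp>

// Sketch only: resolve the KV-cache sequence-length axis from <model_dir>/config.json,
// falling back to axis 2 when the file or the model_type field is missing.
size_t resolve_seq_len_axis(const std::string& model_dir) {
    // Most decoder KV tensors are laid out [batch, num_kv_heads, seq_len, head_size] -> axis 2.
    constexpr size_t default_axis = 2;
    std::ifstream config(model_dir + "/config.json");
    if (!config.is_open())
        return default_axis;
    nlohmann::json data = nlohmann::json::parse(config, /*cb=*/nullptr, /*allow_exceptions=*/false);
    if (data.is_discarded() || !data.contains("model_type"))
        return default_axis;
    const auto model_type = data["model_type"].get<std::string>();
    // chatglm keeps seq_len on axis 0 of its KV tensors; the other known types use axis 2.
    static const std::map<std::string, size_t> axis_by_type{{"chatglm", 0}, {"llama", 2}};
    const auto it = axis_by_type.find(model_type);
    return it == axis_by_type.end() ? default_axis : it->second;
}

With exceptions disabled, a malformed config.json degrades to the default axis instead of terminating the sample.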
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index bf1750c40b..6df5d2ada9 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -12,7 +12,10 @@ namespace { constexpr size_t BATCH_SIZE = 1; -size_t get_seq_len_axis_from_config(const std::string model_dir) { +size_t get_seq_len_axis(const std::string model_dir) { + // get sequence length axis based on config.json model_type + // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; @@ -282,7 +285,7 @@ int main(int argc, char* argv[]) try { size_t max_sequence_length = 100; - const size_t draft_model_seq_len_axis = get_seq_len_axis_from_config(draft_model_dir); + const size_t draft_model_seq_len_axis = get_seq_len_axis(draft_model_dir); AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; main_model.set_tensor("input_ids", input_ids); @@ -314,7 +317,7 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t main_model_seq_len_axis = get_seq_len_axis_from_config(main_model_dir); + const size_t main_model_seq_len_axis = get_seq_len_axis(main_model_dir); /* Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these From 478de3096acd61dc76cb4066f346e68790fccac8 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 10:30:49 +0200 Subject: [PATCH 3/7] Add guard and policy --- .../cpp/prompt_lookup_decoding_lm/CMakeLists.txt | 14 ++++++++++---- samples/cpp/speculative_decoding_lm/CMakeLists.txt | 14 ++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index a504a4a7cb..73ec2cc25b 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -9,10 +9,16 @@ endif() include(FetchContent) -FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) -FetchContent_MakeAvailable(nlohmann_json) +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 9a6a145d09..20bc8ef386 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -9,10 +9,16 @@ endif() include(FetchContent) -FetchContent_Declare(nlohmann_json - URL 
https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) -FetchContent_MakeAvailable(nlohmann_json) +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) From 1b6350daf8a16ac40013d29a1061bf282a1adcab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 17:56:29 +0200 Subject: [PATCH 4/7] Update only trim fn --- .github/workflows/causal_lm_cpp.yml | 4 +- .../prompt_lookup_decoding_lm/CMakeLists.txt | 23 +------ .../prompt_lookup_decoding_lm.cpp | 43 ++---------- .../speculative_decoding_lm/CMakeLists.txt | 25 +------ .../speculative_decoding_lm.cpp | 69 ++++--------------- 5 files changed, 24 insertions(+), 140 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 527259f203..5ac1299706 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -420,7 +420,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 73ec2cc25b..08674906f3 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -6,36 +6,17 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -if(NOT TARGET nlohmann_json) - FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) - FetchContent_MakeAvailable(nlohmann_json) -endif() - find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) - add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) - -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) - 
+target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) set_target_properties(prompt_lookup_decoding_lm PROPERTIES COMPILE_PDB_NAME prompt_lookup_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) - install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index ca30b51640..dc70d25c2c 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,8 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include #include #include #include @@ -11,40 +9,9 @@ namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; - -size_t get_seq_len_axis(const std::string model_dir) { - // get sequence length axis based on config.json model_type - // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type - - // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], - // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 - constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; - - std::ifstream f(model_dir + "/config.json"); - - if (!f.is_open()) { - return DEFAILT_SEQ_LEN_AXIS; - } - - nlohmann::json data = nlohmann::json::parse(f); - - if (!data.contains("model_type")) { - return DEFAILT_SEQ_LEN_AXIS; - } - - const std::string model_type = data["model_type"].get(); - - const std::map model_type_to_seq_len_axis{ - {"chatglm", 0}, - {"llama", 2}, - }; - - if (!model_type_to_seq_len_axis.count(model_type)) { - return DEFAILT_SEQ_LEN_AXIS; - } - - return model_type_to_seq_len_axis.at(model_type); -} +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], +// threfore usually SEQ_LEN_AXIS = 2 +constexpr size_t SEQ_LEN_AXIS = 2; std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -260,8 +227,6 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t seq_len_axis = get_seq_len_axis(model_dir); - // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. int max_sequence_length = 100; @@ -322,7 +287,7 @@ int main(int argc, char* argv[]) try { // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
seq_len += accepted_tokens_number; - update_kv_cache(model, seq_len_axis, seq_len); + update_kv_cache(model, SEQ_LEN_AXIS, seq_len); first_token = out_token; } diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 20bc8ef386..8fc21b59d8 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -6,36 +6,17 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -if(NOT TARGET nlohmann_json) - FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) - FetchContent_MakeAvailable(nlohmann_json) -endif() - find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) - add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) - -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) - +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) set_target_properties(speculative_decoding_lm PROPERTIES COMPILE_PDB_NAME speculative_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - - target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) - install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 6df5d2ada9..10b3972fd6 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include #include #include #include @@ -11,40 +9,9 @@ namespace { constexpr size_t BATCH_SIZE = 1; - -size_t get_seq_len_axis(const std::string model_dir) { - // get sequence length axis based on config.json model_type - // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type - - // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], - // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 - constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; - - std::ifstream f(model_dir + "/config.json"); - - if (!f.is_open()) { - return DEFAILT_SEQ_LEN_AXIS; - } - - nlohmann::json data = nlohmann::json::parse(f); - - if (!data.contains("model_type")) { - return DEFAILT_SEQ_LEN_AXIS; - } - - const std::string model_type = data["model_type"].get(); - - const std::map model_type_to_seq_len_axis{ - {"chatglm", 0}, - {"llama", 2}, - }; - - if (!model_type_to_seq_len_axis.count(model_type)) { - return DEFAILT_SEQ_LEN_AXIS; - } - - return model_type_to_seq_len_axis.at(model_type); -} +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, 
num_kv_heads, seq_len, head_size], +// threfore usually SEQ_LEN_AXIS = 2 +constexpr size_t SEQ_LEN_AXIS = 2; std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -146,19 +113,14 @@ class AssistedCandidateGenerator { size_t max_seq_length; size_t num_pred_tokens = 5; const size_t max_pred_tokens = 10; - const size_t seq_len_axis; int64_t out_of_kv_cache_token = -1; size_t draft_model_seq_length = 0; public: - AssistedCandidateGenerator(ov::InferRequest draft_model, - const size_t max_seq_length, - const size_t num_pred_tokens, - const size_t seq_len_axis) + AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) : draft_model{draft_model}, max_seq_length{max_seq_length}, - num_pred_tokens{num_pred_tokens}, - seq_len_axis{seq_len_axis} {}; + num_pred_tokens{num_pred_tokens} {}; int64_t generate_next_token(const std::vector tokens) { size_t tokens_size = tokens.size(); @@ -236,7 +198,7 @@ class AssistedCandidateGenerator { } out_of_kv_cache_token = -1; - ::update_kv_cache(draft_model, seq_len_axis, seq_length); + ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); draft_model_seq_length = seq_length; } }; @@ -262,31 +224,27 @@ int main(int argc, char* argv[]) try { ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - const std::string draft_model_dir = std::string{argv[1]}; - - auto tokenizer_model = core.read_model(draft_model_dir + "/openvino_tokenizer.xml"); + auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); ov::InferRequest detokenizer = - core.compile_model(draft_model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; // draft model (which is smaller, less accurate but faster) ov::InferRequest draft_model = - core.compile_model(draft_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); uint64_t seq_len = input_ids.get_shape()[1]; - const std::string main_model_dir = std::string{argv[2]}; // main model (which is bigger, more accurate but slower) ov::InferRequest main_model = - core.compile_model(main_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - size_t max_sequence_length = 100; + size_t max_sequence_length = 100 + seq_len; - const size_t draft_model_seq_len_axis = get_seq_len_axis(draft_model_dir); - AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; main_model.set_tensor("input_ids", input_ids); main_model.set_tensor("attention_mask", attention_mask); @@ -317,7 +275,6 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t main_model_seq_len_axis = get_seq_len_axis(main_model_dir); /* 
Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these @@ -390,7 +347,7 @@ int main(int argc, char* argv[]) try { } candidateGenerator.update_kv_cache(seq_len); - update_kv_cache(main_model, main_model_seq_len_axis, seq_len); + update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); candidates.clear(); } From a3fdc043812d5c6b67305bdb6758f7e2dc6b2365 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 17:58:59 +0200 Subject: [PATCH 5/7] Fix formatting --- samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt | 2 +- samples/cpp/speculative_decoding_lm/CMakeLists.txt | 2 +- .../cpp/speculative_decoding_lm/speculative_decoding_lm.cpp | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 08674906f3..087c95bfc4 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -19,4 +19,4 @@ target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 8fc21b59d8..b30905bdb9 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -19,4 +19,4 @@ target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 10b3972fd6..ba610574e8 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -223,7 +223,6 @@ int main(int argc, char* argv[]) try { // tokenizer model ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); @@ -242,7 +241,7 @@ int main(int argc, char* argv[]) try { ov::InferRequest main_model = core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - size_t max_sequence_length = 100 + seq_len; + size_t max_sequence_length = 100; AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; From 8dfc1abf73377e4e266e230d169b13aa0b3654cd Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 5 Aug 2024 08:30:28 +0200 Subject: [PATCH 6/7] Fix command --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 5ac1299706..9d7fc7ae86 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm 
./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() From 61a30814d46c2a8d1c20ac237ee86a3b430cebab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 5 Aug 2024 09:03:57 +0200 Subject: [PATCH 7/7] Fix main model --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 9d7fc7ae86..d9f886760f 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline()
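The part of this series that survives into the final state is the axis-agnostic KV-cache trim in trimm_tensor(): the shape is copied, the sequence axis is shrunk, and the tensor is either reshaped in place (axis 0, where the kept data stays contiguous) or wrapped in a region-of-interest view (any other axis). A condensed sketch of that step, assuming a 4-D KV tensor; the name trim_kv is illustrative.

#include <openvino/openvino.hpp>

// Sketch mirroring trimm_tensor() from the samples: trim a KV-cache tensor
// to new_seq_len along seq_len_axis without copying the kept data.
ov::Tensor trim_kv(ov::Tensor& tensor, size_t seq_len_axis, size_t new_seq_len) {
    ov::Shape shape = tensor.get_shape();
    OPENVINO_ASSERT(new_seq_len <= shape[seq_len_axis]);
    if (shape[seq_len_axis] == new_seq_len)
        return tensor;
    shape[seq_len_axis] = new_seq_len;
    if (seq_len_axis == 0) {
        // A leading-axis trim keeps the remaining rows contiguous, so reshaping in place is enough.
        tensor.set_shape(shape);
        return tensor;
    }
    // Otherwise take the region [0, new_seq_len) along the sequence axis as a view.
    ov::Coordinate begin{0, 0, 0, 0};
    ov::Coordinate end{shape};
    return ov::Tensor(tensor, begin, end);
}

The samples' update_kv_cache() applies this trim to each key/value state tensor of the request before the next infer() call.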