From 841fae001ffa125b7dcd851ee3906485fe00fd6a Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 1 Aug 2024 11:37:17 +0200 Subject: [PATCH 1/7] Set seq len axis based on model type --- .../prompt_lookup_decoding_lm/CMakeLists.txt | 15 +++- .../prompt_lookup_decoding_lm.cpp | 55 ++++++++++--- .../speculative_decoding_lm/CMakeLists.txt | 17 +++- .../speculative_decoding_lm.cpp | 80 ++++++++++++++----- 4 files changed, 132 insertions(+), 35 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 087c95bfc4..a504a4a7cb 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -6,16 +6,29 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) + add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) + +target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) + set_target_properties(prompt_lookup_decoding_lm PROPERTIES COMPILE_PDB_NAME prompt_lookup_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) + target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) + install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 51ac654aac..88ea135b36 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,17 +1,47 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include +#include +#include #include #include +#include namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], -// threfore usually SEQ_LEN_AXIS = 2 -constexpr size_t SEQ_LEN_AXIS = 2; + +size_t get_seq_len_axis_from_config(const std::string model_dir) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 + constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; + + std::ifstream f(model_dir + "/config.json"); + + if (!f.is_open()) { + return DEFAILT_SEQ_LEN_AXIS; + } + + nlohmann::json data = nlohmann::json::parse(f); + + if (!data.contains("model_type")) { + return DEFAILT_SEQ_LEN_AXIS; + } + + const std::string model_type = data["model_type"].get(); + + const std::map model_type_to_seq_len_axis{ + {"chatglm", 0}, + {"llama", 2}, + }; + + if (!model_type_to_seq_len_axis.count(model_type)) { + return 
DEFAILT_SEQ_LEN_AXIS; + } + + return model_type_to_seq_len_axis.at(model_type); +} std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -58,7 +88,7 @@ struct TextStreamer { void end() { std::string text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +105,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +113,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor; @@ -228,6 +257,8 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + const size_t seq_len_axis = get_seq_len_axis_from_config(model_dir); + // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. int max_sequence_length = 100; @@ -288,7 +319,7 @@ int main(int argc, char* argv[]) try { // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
seq_len += accepted_tokens_number; - update_kv_cache(model, SEQ_LEN_AXIS, seq_len); + update_kv_cache(model, seq_len_axis, seq_len); first_token = out_token; } diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index b30905bdb9..9a6a145d09 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -6,16 +6,29 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() + +include(FetchContent) + +FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) +FetchContent_MakeAvailable(nlohmann_json) + find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) + add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) + +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) + set_target_properties(speculative_decoding_lm PROPERTIES COMPILE_PDB_NAME speculative_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) -target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") + + target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) + install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 4927b7d795..bf1750c40b 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -2,17 +2,47 @@ // SPDX-License-Identifier: Apache-2.0 #include +#include +#include #include #include #include +namespace { + constexpr size_t BATCH_SIZE = 1; -// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], -// threfore usually SEQ_LEN_AXIS = 2 -constexpr size_t SEQ_LEN_AXIS = 2; +size_t get_seq_len_axis_from_config(const std::string model_dir) { + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], + // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 + constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; + + std::ifstream f(model_dir + "/config.json"); + + if (!f.is_open()) { + return DEFAILT_SEQ_LEN_AXIS; + } + + nlohmann::json data = nlohmann::json::parse(f); + + if (!data.contains("model_type")) { + return DEFAILT_SEQ_LEN_AXIS; + } + + const std::string model_type = data["model_type"].get(); + + const std::map model_type_to_seq_len_axis{ + {"chatglm", 0}, + {"llama", 2}, + }; + + if (!model_type_to_seq_len_axis.count(model_type)) { + return DEFAILT_SEQ_LEN_AXIS; + } + + return model_type_to_seq_len_axis.at(model_type); +} -namespace { std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); tokenizer.infer(); @@ -58,7 +88,7 @@ struct TextStreamer { void end() { std::string 
text = detokenize(detokenizer, token_cache); if (text.size() <= print_len) - return ; + return; std::cout << std::string_view{text.data() + print_len, text.size() - print_len} << '\n'; token_cache.clear(); print_len = 0; @@ -75,10 +105,7 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ auto old_tensor_data = tensor.data(); auto shape = tensor.get_shape(); - size_t batch_size = shape[0]; - size_t num_kv_heads = shape[1]; - size_t old_seq_len = shape[2]; - size_t head_size = shape[3]; + size_t old_seq_len = shape[seq_len_axis]; OPENVINO_ASSERT(new_seq_len <= old_seq_len); @@ -86,14 +113,16 @@ ov::Tensor trimm_tensor(ov::Tensor& tensor, uint64_t seq_len_axis, uint64_t new_ if (old_seq_len == new_seq_len) return tensor; + shape[seq_len_axis] = new_seq_len; + if (seq_len_axis == 0) { - shape[0] = new_seq_len; tensor.set_shape(shape); return tensor; } ov::Coordinate new_shape_begin{0, 0, 0, 0}; - ov::Coordinate new_shape_end{batch_size, num_kv_heads, new_seq_len, head_size}; + ov::Coordinate new_shape_end{shape}; + auto new_tensor = ov::Tensor(tensor, new_shape_begin, new_shape_end); return new_tensor; @@ -114,14 +143,19 @@ class AssistedCandidateGenerator { size_t max_seq_length; size_t num_pred_tokens = 5; const size_t max_pred_tokens = 10; + const size_t seq_len_axis; int64_t out_of_kv_cache_token = -1; size_t draft_model_seq_length = 0; public: - AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) + AssistedCandidateGenerator(ov::InferRequest draft_model, + const size_t max_seq_length, + const size_t num_pred_tokens, + const size_t seq_len_axis) : draft_model{draft_model}, max_seq_length{max_seq_length}, - num_pred_tokens{num_pred_tokens} {}; + num_pred_tokens{num_pred_tokens}, + seq_len_axis{seq_len_axis} {}; int64_t generate_next_token(const std::vector tokens) { size_t tokens_size = tokens.size(); @@ -199,7 +233,7 @@ class AssistedCandidateGenerator { } out_of_kv_cache_token = -1; - ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); + ::update_kv_cache(draft_model, seq_len_axis, seq_length); draft_model_seq_length = seq_length; } }; @@ -224,27 +258,32 @@ int main(int argc, char* argv[]) try { // tokenizer model ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); + + const std::string draft_model_dir = std::string{argv[1]}; + + auto tokenizer_model = core.read_model(draft_model_dir + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); ov::InferRequest detokenizer = - core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + core.compile_model(draft_model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; // draft model (which is smaller, less accurate but faster) ov::InferRequest draft_model = - core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(draft_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); uint64_t seq_len = input_ids.get_shape()[1]; + const std::string main_model_dir = std::string{argv[2]}; // main model (which is bigger, more accurate but slower) 
ov::InferRequest main_model = - core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(main_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); size_t max_sequence_length = 100; - AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; + const size_t draft_model_seq_len_axis = get_seq_len_axis_from_config(draft_model_dir); + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; main_model.set_tensor("input_ids", input_ids); main_model.set_tensor("attention_mask", attention_mask); @@ -275,6 +314,7 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); + const size_t main_model_seq_len_axis = get_seq_len_axis_from_config(main_model_dir); /* Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these @@ -347,7 +387,7 @@ int main(int argc, char* argv[]) try { } candidateGenerator.update_kv_cache(seq_len); - update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); + update_kv_cache(main_model, main_model_seq_len_axis, seq_len); candidates.clear(); } From 06c7a9156ead9b92b4bfed7baeb1825dabfc0545 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Thu, 1 Aug 2024 12:02:14 +0200 Subject: [PATCH 2/7] Rename function --- .../prompt_lookup_decoding_lm.cpp | 7 +++++-- .../speculative_decoding_lm/speculative_decoding_lm.cpp | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index 88ea135b36..ca30b51640 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -12,7 +12,10 @@ namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; -size_t get_seq_len_axis_from_config(const std::string model_dir) { +size_t get_seq_len_axis(const std::string model_dir) { + // get sequence length axis based on config.json model_type + // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; @@ -257,7 +260,7 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t seq_len_axis = get_seq_len_axis_from_config(model_dir); + const size_t seq_len_axis = get_seq_len_axis(model_dir); // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. 
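The get_seq_len_axis() helper renamed above (introduced in PATCH 1) picks the KV-cache sequence-length axis from the model's config.json. For reference, a self-contained sketch of that lookup, assuming nlohmann/json and the chatglm -> 0 / default -> 2 mapping used in these samples; the names resolve_seq_len_axis and default_axis are illustrative and not part of the change.

#include <fstream>
#include <map>
#include <string>
#include <nlohmann/json.hpp>

// Sketch only: resolve the KV-cache sequence-length axis from <model_dir>/config.json,
// falling back to axis 2 when the file or the model_type field is missing.
size_t resolve_seq_len_axis(const std::string& model_dir) {
    // Most decoder KV tensors are laid out [batch, num_kv_heads, seq_len, head_size] -> axis 2.
    constexpr size_t default_axis = 2;
    std::ifstream config(model_dir + "/config.json");
    if (!config.is_open())
        return default_axis;
    nlohmann::json data = nlohmann::json::parse(config, /*cb=*/nullptr, /*allow_exceptions=*/false);
    if (data.is_discarded() || !data.contains("model_type"))
        return default_axis;
    const auto model_type = data["model_type"].get<std::string>();
    // chatglm keeps seq_len on axis 0 of its KV tensors; the other known types use axis 2.
    static const std::map<std::string, size_t> axis_by_type{{"chatglm", 0}, {"llama", 2}};
    const auto it = axis_by_type.find(model_type);
    return it == axis_by_type.end() ? default_axis : it->second;
}

With exceptions disabled, a malformed config.json degrades to the default axis instead of terminating the sample.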
diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index bf1750c40b..6df5d2ada9 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -12,7 +12,10 @@ namespace { constexpr size_t BATCH_SIZE = 1; -size_t get_seq_len_axis_from_config(const std::string model_dir) { +size_t get_seq_len_axis(const std::string model_dir) { + // get sequence length axis based on config.json model_type + // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type + // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; @@ -282,7 +285,7 @@ int main(int argc, char* argv[]) try { size_t max_sequence_length = 100; - const size_t draft_model_seq_len_axis = get_seq_len_axis_from_config(draft_model_dir); + const size_t draft_model_seq_len_axis = get_seq_len_axis(draft_model_dir); AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; main_model.set_tensor("input_ids", input_ids); @@ -314,7 +317,7 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t main_model_seq_len_axis = get_seq_len_axis_from_config(main_model_dir); + const size_t main_model_seq_len_axis = get_seq_len_axis(main_model_dir); /* Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these From 478de3096acd61dc76cb4066f346e68790fccac8 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 10:30:49 +0200 Subject: [PATCH 3/7] Add guard and policy --- .../cpp/prompt_lookup_decoding_lm/CMakeLists.txt | 14 ++++++++++---- samples/cpp/speculative_decoding_lm/CMakeLists.txt | 14 ++++++++++---- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index a504a4a7cb..73ec2cc25b 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -9,10 +9,16 @@ endif() include(FetchContent) -FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) -FetchContent_MakeAvailable(nlohmann_json) +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 9a6a145d09..20bc8ef386 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -9,10 +9,16 @@ endif() include(FetchContent) -FetchContent_Declare(nlohmann_json - URL 
https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) -FetchContent_MakeAvailable(nlohmann_json) +if(POLICY CMP0135) + cmake_policy(SET CMP0135 NEW) +endif() + +if(NOT TARGET nlohmann_json) + FetchContent_Declare(nlohmann_json + URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz + URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) + FetchContent_MakeAvailable(nlohmann_json) +endif() find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) From 1b6350daf8a16ac40013d29a1061bf282a1adcab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 17:56:29 +0200 Subject: [PATCH 4/7] Update only trim fn --- .github/workflows/causal_lm_cpp.yml | 4 +- .../prompt_lookup_decoding_lm/CMakeLists.txt | 23 +------ .../prompt_lookup_decoding_lm.cpp | 43 ++---------- .../speculative_decoding_lm/CMakeLists.txt | 25 +------ .../speculative_decoding_lm.cpp | 69 ++++--------------- 5 files changed, 24 insertions(+), 140 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 527259f203..5ac1299706 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() @@ -420,7 +420,7 @@ jobs: A:' > ./prompt.txt ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_prompt_lookup.txt - ./build/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./TinyLlama-1.1B-Chat-v1.0/ "$( predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 73ec2cc25b..08674906f3 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -6,36 +6,17 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -if(NOT TARGET nlohmann_json) - FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) - FetchContent_MakeAvailable(nlohmann_json) -endif() - find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) - add_executable(prompt_lookup_decoding_lm prompt_lookup_decoding_lm.cpp) - -target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) - 
+target_link_libraries(prompt_lookup_decoding_lm PRIVATE openvino::runtime TBB::tbb) set_target_properties(prompt_lookup_decoding_lm PROPERTIES COMPILE_PDB_NAME prompt_lookup_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - target_compile_definitions(prompt_lookup_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) - install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp index ca30b51640..dc70d25c2c 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp +++ b/samples/cpp/prompt_lookup_decoding_lm/prompt_lookup_decoding_lm.cpp @@ -1,8 +1,6 @@ // Copyright (C) 2023-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 -#include -#include #include #include #include @@ -11,40 +9,9 @@ namespace { // only batch_size = 1 currently supported constexpr size_t BATCH_SIZE = 1; - -size_t get_seq_len_axis(const std::string model_dir) { - // get sequence length axis based on config.json model_type - // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type - - // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], - // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 - constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; - - std::ifstream f(model_dir + "/config.json"); - - if (!f.is_open()) { - return DEFAILT_SEQ_LEN_AXIS; - } - - nlohmann::json data = nlohmann::json::parse(f); - - if (!data.contains("model_type")) { - return DEFAILT_SEQ_LEN_AXIS; - } - - const std::string model_type = data["model_type"].get(); - - const std::map model_type_to_seq_len_axis{ - {"chatglm", 0}, - {"llama", 2}, - }; - - if (!model_type_to_seq_len_axis.count(model_type)) { - return DEFAILT_SEQ_LEN_AXIS; - } - - return model_type_to_seq_len_axis.at(model_type); -} +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], +// threfore usually SEQ_LEN_AXIS = 2 +constexpr size_t SEQ_LEN_AXIS = 2; std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -260,8 +227,6 @@ int main(int argc, char* argv[]) try { const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t seq_len_axis = get_seq_len_axis(model_dir); - // Prompt lookup decoding is a speculative decoding technic where the draft model replaced // with string matching in the prompt to generate candidate token sequences. int max_sequence_length = 100; @@ -322,7 +287,7 @@ int main(int argc, char* argv[]) try { // Increment the sequence length by the number of matched tokens, and // trim the KV cache to match the new sequence length. 
seq_len += accepted_tokens_number; - update_kv_cache(model, seq_len_axis, seq_len); + update_kv_cache(model, SEQ_LEN_AXIS, seq_len); first_token = out_token; } diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 20bc8ef386..8fc21b59d8 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -6,36 +6,17 @@ if(TARGET openvino_tokenizers) else() message(FATAL_ERROR "multinomial_causal_lm must be compiled as part of OpenVIINOGenAI to have the path to openvino_tokenizers hardcoded.") endif() - -include(FetchContent) - -if(POLICY CMP0135) - cmake_policy(SET CMP0135 NEW) -endif() - -if(NOT TARGET nlohmann_json) - FetchContent_Declare(nlohmann_json - URL https://github.com/nlohmann/json/archive/refs/tags/v3.11.3.tar.gz - URL_HASH SHA256=0d8ef5af7f9794e3263480193c491549b2ba6cc74bb018906202ada498a79406) - FetchContent_MakeAvailable(nlohmann_json) -endif() - find_package(OpenVINO REQUIRED COMPONENTS Runtime) find_package(TBB REQUIRED COMPONENTS tbb) - add_executable(speculative_decoding_lm speculative_decoding_lm.cpp) - -target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb nlohmann_json::nlohmann_json) - +target_link_libraries(speculative_decoding_lm PRIVATE openvino::runtime TBB::tbb) set_target_properties(speculative_decoding_lm PROPERTIES COMPILE_PDB_NAME speculative_decoding_lm # Ensure out of box LC_RPATH on macOS with SIP INSTALL_RPATH_USE_LINK_PATH ON) - - target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") +target_compile_definitions(speculative_decoding_lm PRIVATE OPENVINO_TOKENIZERS_PATH="${OPENVINO_TOKENIZERS_PATH}") target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) - install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) + EXCLUDE_FROM_ALL) \ No newline at end of file diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 6df5d2ada9..10b3972fd6 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -2,8 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 #include -#include -#include #include #include #include @@ -11,40 +9,9 @@ namespace { constexpr size_t BATCH_SIZE = 1; - -size_t get_seq_len_axis(const std::string model_dir) { - // get sequence length axis based on config.json model_type - // return DEFAILT_SEQ_LEN_AXIS if no model_type found or if there is no predefined seq len axis for this model type - - // sequence length axis in key/values tensors, for most cases [BATCH_SIZE, num_kv_heads, seq_len, head_size], - // threfore usually DEFAILT_SEQ_LEN_AXIS = 2 - constexpr size_t DEFAILT_SEQ_LEN_AXIS = 2; - - std::ifstream f(model_dir + "/config.json"); - - if (!f.is_open()) { - return DEFAILT_SEQ_LEN_AXIS; - } - - nlohmann::json data = nlohmann::json::parse(f); - - if (!data.contains("model_type")) { - return DEFAILT_SEQ_LEN_AXIS; - } - - const std::string model_type = data["model_type"].get(); - - const std::map model_type_to_seq_len_axis{ - {"chatglm", 0}, - {"llama", 2}, - }; - - if (!model_type_to_seq_len_axis.count(model_type)) { - return DEFAILT_SEQ_LEN_AXIS; - } - - return model_type_to_seq_len_axis.at(model_type); -} +// sequence length axis in key/values tensors, for most cases [BATCH_SIZE, 
num_kv_heads, seq_len, head_size], +// threfore usually SEQ_LEN_AXIS = 2 +constexpr size_t SEQ_LEN_AXIS = 2; std::pair tokenize(ov::InferRequest& tokenizer, std::string&& prompt) { tokenizer.set_input_tensor(ov::Tensor{ov::element::string, {BATCH_SIZE}, &prompt}); @@ -146,19 +113,14 @@ class AssistedCandidateGenerator { size_t max_seq_length; size_t num_pred_tokens = 5; const size_t max_pred_tokens = 10; - const size_t seq_len_axis; int64_t out_of_kv_cache_token = -1; size_t draft_model_seq_length = 0; public: - AssistedCandidateGenerator(ov::InferRequest draft_model, - const size_t max_seq_length, - const size_t num_pred_tokens, - const size_t seq_len_axis) + AssistedCandidateGenerator(ov::InferRequest draft_model, const size_t max_seq_length, const size_t num_pred_tokens) : draft_model{draft_model}, max_seq_length{max_seq_length}, - num_pred_tokens{num_pred_tokens}, - seq_len_axis{seq_len_axis} {}; + num_pred_tokens{num_pred_tokens} {}; int64_t generate_next_token(const std::vector tokens) { size_t tokens_size = tokens.size(); @@ -236,7 +198,7 @@ class AssistedCandidateGenerator { } out_of_kv_cache_token = -1; - ::update_kv_cache(draft_model, seq_len_axis, seq_length); + ::update_kv_cache(draft_model, SEQ_LEN_AXIS, seq_length); draft_model_seq_length = seq_length; } }; @@ -262,31 +224,27 @@ int main(int argc, char* argv[]) try { ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - const std::string draft_model_dir = std::string{argv[1]}; - - auto tokenizer_model = core.read_model(draft_model_dir + "/openvino_tokenizer.xml"); + auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); auto [input_ids, attention_mask] = tokenize(tokenizer, argv[3]); ov::InferRequest detokenizer = - core.compile_model(draft_model_dir + "/openvino_detokenizer.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[1]} + "/openvino_detokenizer.xml", "CPU").create_infer_request(); TextStreamer text_streamer{std::move(detokenizer)}; // draft model (which is smaller, less accurate but faster) ov::InferRequest draft_model = - core.compile_model(draft_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[1]} + "/openvino_model.xml", "CPU").create_infer_request(); uint64_t seq_len = input_ids.get_shape()[1]; - const std::string main_model_dir = std::string{argv[2]}; // main model (which is bigger, more accurate but slower) ov::InferRequest main_model = - core.compile_model(main_model_dir + "/openvino_model.xml", "CPU").create_infer_request(); + core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - size_t max_sequence_length = 100; + size_t max_sequence_length = 100 + seq_len; - const size_t draft_model_seq_len_axis = get_seq_len_axis(draft_model_dir); - AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5, draft_model_seq_len_axis}; + AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; main_model.set_tensor("input_ids", input_ids); main_model.set_tensor("attention_mask", attention_mask); @@ -317,7 +275,6 @@ int main(int argc, char* argv[]) try { text_streamer.put(out_token); const int64_t EOS_TOKEN = get_eos_token(tokenizer_model); - const size_t main_model_seq_len_axis = get_seq_len_axis(main_model_dir); /* 
Speculative decoding works the following way. The draft model predicts the next K tokens one by one in an autoregressive manner, while the main model validates these @@ -390,7 +347,7 @@ int main(int argc, char* argv[]) try { } candidateGenerator.update_kv_cache(seq_len); - update_kv_cache(main_model, main_model_seq_len_axis, seq_len); + update_kv_cache(main_model, SEQ_LEN_AXIS, seq_len); candidates.clear(); } From a3fdc043812d5c6b67305bdb6758f7e2dc6b2365 Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Fri, 2 Aug 2024 17:58:59 +0200 Subject: [PATCH 5/7] Fix formatting --- samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt | 2 +- samples/cpp/speculative_decoding_lm/CMakeLists.txt | 2 +- .../cpp/speculative_decoding_lm/speculative_decoding_lm.cpp | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt index 08674906f3..087c95bfc4 100644 --- a/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt +++ b/samples/cpp/prompt_lookup_decoding_lm/CMakeLists.txt @@ -19,4 +19,4 @@ target_compile_features(prompt_lookup_decoding_lm PRIVATE cxx_std_17) install(TARGETS prompt_lookup_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/CMakeLists.txt b/samples/cpp/speculative_decoding_lm/CMakeLists.txt index 8fc21b59d8..b30905bdb9 100644 --- a/samples/cpp/speculative_decoding_lm/CMakeLists.txt +++ b/samples/cpp/speculative_decoding_lm/CMakeLists.txt @@ -19,4 +19,4 @@ target_compile_features(speculative_decoding_lm PRIVATE cxx_std_17) install(TARGETS speculative_decoding_lm RUNTIME DESTINATION samples_bin/ COMPONENT samples_bin - EXCLUDE_FROM_ALL) \ No newline at end of file + EXCLUDE_FROM_ALL) diff --git a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp index 10b3972fd6..ba610574e8 100644 --- a/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp +++ b/samples/cpp/speculative_decoding_lm/speculative_decoding_lm.cpp @@ -223,7 +223,6 @@ int main(int argc, char* argv[]) try { // tokenizer model ov::Core core; core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt - auto tokenizer_model = core.read_model(std::string{argv[1]} + "/openvino_tokenizer.xml"); // tokenizer and detokenizer work on CPU only ov::InferRequest tokenizer = core.compile_model(tokenizer_model, "CPU").create_infer_request(); @@ -242,7 +241,7 @@ int main(int argc, char* argv[]) try { ov::InferRequest main_model = core.compile_model(std::string{argv[2]} + "/openvino_model.xml", "CPU").create_infer_request(); - size_t max_sequence_length = 100 + seq_len; + size_t max_sequence_length = 100; AssistedCandidateGenerator candidateGenerator{draft_model, max_sequence_length, 5}; From 8dfc1abf73377e4e266e230d169b13aa0b3654cd Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 5 Aug 2024 08:30:28 +0200 Subject: [PATCH 6/7] Fix command --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 5ac1299706..9d7fc7ae86 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm 
./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline() From 61a30814d46c2a8d1c20ac237ee86a3b430cebab Mon Sep 17 00:00:00 2001 From: Alexander Suvorov Date: Mon, 5 Aug 2024 09:03:57 +0200 Subject: [PATCH 7/7] Fix main model --- .github/workflows/causal_lm_cpp.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 9d7fc7ae86..d9f886760f 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -376,7 +376,7 @@ jobs: run: | source ./ov/setupvars.sh ./build/samples/cpp/speculative_decoding_lm/speculative_decoding_lm ./dolly-v2-3b/ ./dolly-v2-7b/ "Alan Turing was a" > predictions_speculative.txt - ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-3b/ "Alan Turing was a" > predictions_greedy.txt + ./build/samples/cpp/greedy_causal_lm/greedy_causal_lm ./dolly-v2-7b/ "Alan Turing was a" > predictions_greedy.txt python -c " with open('predictions_greedy.txt', 'r') as f: predicted_greedy = f.readline()
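The part of this series that survives into the final state is the axis-agnostic KV-cache trim in trimm_tensor(): the shape is copied, the sequence axis is shrunk, and the tensor is either reshaped in place (axis 0, where the kept data stays contiguous) or wrapped in a region-of-interest view (any other axis). A condensed sketch of that step, assuming a 4-D KV tensor; the name trim_kv is illustrative.

#include <openvino/openvino.hpp>

// Sketch mirroring trimm_tensor() from the samples: trim a KV-cache tensor
// to new_seq_len along seq_len_axis without copying the kept data.
ov::Tensor trim_kv(ov::Tensor& tensor, size_t seq_len_axis, size_t new_seq_len) {
    ov::Shape shape = tensor.get_shape();
    OPENVINO_ASSERT(new_seq_len <= shape[seq_len_axis]);
    if (shape[seq_len_axis] == new_seq_len)
        return tensor;
    shape[seq_len_axis] = new_seq_len;
    if (seq_len_axis == 0) {
        // A leading-axis trim keeps the remaining rows contiguous, so reshaping in place is enough.
        tensor.set_shape(shape);
        return tensor;
    }
    // Otherwise take the region [0, new_seq_len) along the sequence axis as a view.
    ov::Coordinate begin{0, 0, 0, 0};
    ov::Coordinate end{shape};
    return ov::Tensor(tensor, begin, end);
}

The samples' update_kv_cache() applies this trim to each key/value state tensor of the request before the next infer() call.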