From b8026a92fa125b1decef2b9cb17d6362d816a968 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 4 Apr 2024 16:14:40 +0200
Subject: [PATCH] rename to GenerationConfig

---
 .../generate_pipeline/generate_pipeline.hpp   | 16 ++++----
 .../causal_lm/cpp/generate_pipeline/main.cpp  | 38 ++++++++++++-------
 .../generate_pipeline/sampling_parameters.hpp | 23 +++++------
 3 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
index e3bcc52473..8e3d8109d2 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -14,7 +14,7 @@ using GenerationResult = std::vector>;
 class LLMEngine {
     ov::InferRequest m_model_runner;
 
-    GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) {
         ov::Shape prompts_shape = prompts.get_shape();
         size_t batch_size = prompts_shape[0];
         OPENVINO_ASSERT(batch_size == 1);
@@ -60,7 +60,7 @@ class LLMEngine {
         return results;
     }
 
-    GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) {
         ov::Shape prompts_shape = prompts.get_shape();
         size_t batch_size = prompts_shape[0];
         // todo: implement for batch > 1
@@ -120,7 +120,7 @@ class LLMEngine {
             }
         }
 
-        auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); };
+        auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); };
         std::sort(beams.begin(), beams.end(), compare_scores);
 
         GenerationResult results;
@@ -130,7 +130,7 @@ class LLMEngine {
         return results;
     }
 
-    GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) {
        // todo: implement
        GenerationResult results;
        return results;
@@ -145,7 +145,7 @@ class LLMEngine {
     LLMEngine() = default;
 
     // more high level interface
-    GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) {
         if (sampling_params.is_gready_sampling()) {
             return greedy_search(prompts, sampling_params);
         } else if (sampling_params.is_beam_search()) {
@@ -247,14 +247,14 @@ class LLMPipeline {
     ov::InferRequest m_tokenizer;
     ov::InferRequest m_detokenizer;
     std::string m_path;
-    SamplingParameters m_sampling_parameters;
+    GenerationConfig m_sampling_parameters;
 
 public:
     LLMPipeline(std::string& path) : m_path(path) {
         if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) {
-            m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json");
+            m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json");
         }
-        m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json");
+        m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json");
 
         ov::Core core;
         // The model can be compiled for GPU as well
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
index 12b85bbe02..57b0531cf3 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
@@ -15,23 +15,33 @@ int main(int argc, char* argv[]) try {
     // PIPELINE
     std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
     LLMPipeline pipe(model_path);
-    // std::cout << pipe.call("Alan Turing was a");
+    std::cout << pipe.call("Alan Turing was a");
 
     // GENERATE
-    ov::Core core;
-    core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-    ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
-    ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
-
-    // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
-    std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml");
-    ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
+    // ov::Core core;
+    // core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+
+    // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
+    // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml");
+    // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
 
-    auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
-    SamplingParameters sampling_params = SamplingParameters::beam_search();
-    LLMEngine engine(request);
-    GenerationResult generation_results = engine.generate(input_ids, sampling_params);
-    std::cout << detokenize(detokenizer, generation_results[0]);
+    // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
+    // GenerationConfig sampling_params = GenerationConfig::beam_search();
+    // LLMEngine engine(request);
+    // GenerationResult generation_results = engine.generate(input_ids, sampling_params);
+    // std::cout << detokenize(detokenizer, generation_results[0]);
+
+    // std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
+    // LLMPipeline pipe(model_path);
+    // GenerationConfig params;
+    // std::cout << pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2));
+
+    // LLMEngine engine(request);
+    // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2);
+    // GenerationResult generation_results = engine.generate(input_ids, params);
+    // std::cout << detokenize(detokenizer, generation_results[0]);
 
 } catch (const std::exception& error) {
     std::cerr << error.what() << '\n';
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
index b12a25bbe7..5cd42aa4b9 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
@@ -12,9 +12,9 @@
 // forward declaration
 class Sequence;
 
-// SamplingParameters is similar to HuggingFace GenerationConfig
-// and has parameters that are not present in the original SamplingParameters for continous batching
-struct SamplingParameters {
+// Similar to HuggingFace GenerationConfig
+// but has parameters that are not present in the original SamplingParameters for continous batching
+struct GenerationConfig {
     // Generic
     size_t max_new_tokens = 10;
     size_t max_length = 100; // max_new tokens should have priority over max_new_tokens
@@ -43,9 +43,9 @@ struct SamplingParameters {
     int64_t eos_token_id = 0;
     int64_t pad_token_id = 0;
 
-    SamplingParameters() = default;
+    GenerationConfig() = default;
 
-    SamplingParameters(std::string json_path) {
+    GenerationConfig(std::string json_path) {
         std::ifstream f(json_path);
         nlohmann::json data = nlohmann::json::parse(f);
 
@@ -54,6 +54,7 @@ struct SamplingParameters {
         max_length = data.value("max_length", 0);
         pad_token_id = data.value("pad_token_id", 0);
         num_return_sequences = data.value("num_return_sequences", 1);
+        max_new_tokens = data.value("max_new_tokens", 100);
 
         temperature = data.value("temperature", 0.0f);
         do_sample = data.value("do_sample", false);
@@ -66,15 +67,15 @@ struct SamplingParameters {
         group_size = num_beams / n_groups;
     }
 
-    static SamplingParameters greedy() {
-        SamplingParameters greedy_params;
+    static GenerationConfig greedy() {
+        GenerationConfig greedy_params;
         greedy_params.temperature = 0.0f;
         greedy_params.ignore_eos = true;
         return greedy_params;
     }
 
-    static SamplingParameters beam_search() {
-        SamplingParameters beam_search;
+    static GenerationConfig beam_search() {
+        GenerationConfig beam_search;
         beam_search.n_groups = 3;
         beam_search.group_size = 5;
         beam_search.max_new_tokens = 10;
@@ -82,8 +83,8 @@ struct SamplingParameters {
         return beam_search;
     }
 
-    static SamplingParameters multimomial() {
-        SamplingParameters multimomial;
+    static GenerationConfig multimomial() {
+        GenerationConfig multimomial;
         multimomial.temperature = 0.8f;
         multimomial.top_p = 0.8;
         multimomial.top_k = 20;
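
Usage note (not part of the patch): the sketch below is a minimal, hypothetical example of how the renamed API could be exercised once this change lands. It assumes the generate_pipeline.hpp and sampling_parameters.hpp headers shown above are on the include path, that pipe.call() returns a printable string as in the updated main.cpp, and that the model directory passed on the command line holds the OpenVINO IR plus tokenizer/detokenizer models. The fluent setters (params.temperature(0.2).top_k(4)...) appear only in commented-out code in this patch, so the sketch sticks to the static presets and plain data members that GenerationConfig clearly defines.

// Hypothetical usage sketch of the renamed GenerationConfig / LLMPipeline API.
// Header names and the model-directory layout are assumptions, not part of the patch.
#include <iostream>
#include <string>

#include "generate_pipeline.hpp"    // assumed: defines LLMEngine and LLMPipeline
#include "sampling_parameters.hpp"  // assumed: defines GenerationConfig after the rename

int main(int argc, char* argv[]) try {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <model_dir>\n";
        return 1;
    }
    std::string model_path = argv[1];  // directory with openvino_model.xml and generation_config.json

    // High-level path: the pipeline loads its GenerationConfig from the model directory.
    LLMPipeline pipe(model_path);
    std::cout << pipe.call("Alan Turing was a") << '\n';

    // A preset from the renamed struct; its plain data members can be adjusted directly.
    GenerationConfig config = GenerationConfig::beam_search();
    config.max_new_tokens = 32;
    config.n_groups = 2;
    config.group_size = 4;

    // Lower-level path, as in the commented-out main.cpp code: create an
    // ov::InferRequest for the model, tokenize the prompt, call
    // LLMEngine::generate(input_ids, config), then detokenize the first result.
    // (Omitted here because it needs the tokenizer/detokenizer infer requests.)
    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}

Once the chained setters sketched in main.cpp exist, the same call site could instead pass pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)).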