From b8026a92fa125b1decef2b9cb17d6362d816a968 Mon Sep 17 00:00:00 2001
From: Pavel Esir
Date: Thu, 4 Apr 2024 16:14:40 +0200
Subject: [PATCH] rename to GenerationConfig

---
 .../generate_pipeline/generate_pipeline.hpp   | 16 ++++----
 .../causal_lm/cpp/generate_pipeline/main.cpp  | 38 ++++++++++++-------
 .../generate_pipeline/sampling_parameters.hpp | 23 +++++------
 3 files changed, 44 insertions(+), 33 deletions(-)

diff --git a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
index e3bcc52473..8e3d8109d2 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/generate_pipeline.hpp
@@ -14,7 +14,7 @@ using GenerationResult = std::vector>;
 class LLMEngine {
     ov::InferRequest m_model_runner;
 
-    GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) {
         ov::Shape prompts_shape = prompts.get_shape();
         size_t batch_size = prompts_shape[0];
         OPENVINO_ASSERT(batch_size == 1);
@@ -60,7 +60,7 @@ class LLMEngine {
         return results;
     }
 
-    GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) {
         ov::Shape prompts_shape = prompts.get_shape();
         size_t batch_size = prompts_shape[0];
         // todo: implement for batch > 1
@@ -120,7 +120,7 @@ class LLMEngine {
             }
         }
 
-        auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); };
+        auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); };
         std::sort(beams.begin(), beams.end(), compare_scores);
 
         GenerationResult results;
@@ -130,7 +130,7 @@ class LLMEngine {
         return results;
     }
 
-    GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) {
        // todo: implement
        GenerationResult results;
        return results;
@@ -145,7 +145,7 @@ class LLMEngine {
     LLMEngine() = default;
 
     // more high level interface
-    GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) {
+    GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) {
         if (sampling_params.is_gready_sampling()) {
             return greedy_search(prompts, sampling_params);
         } else if (sampling_params.is_beam_search()) {
@@ -247,14 +247,14 @@ class LLMPipeline {
     ov::InferRequest m_tokenizer;
     ov::InferRequest m_detokenizer;
     std::string m_path;
-    SamplingParameters m_sampling_parameters;
+    GenerationConfig m_sampling_parameters;
 
 public:
     LLMPipeline(std::string& path) : m_path(path) {
         if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) {
-            m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json");
+            m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json");
         }
-        m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json");
+        m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json");
 
         ov::Core core;
         // The model can be compiled for GPU as well
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
index 12b85bbe02..57b0531cf3 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/main.cpp
@@ -15,23 +15,33 @@ int main(int argc, char* argv[]) try {
     // PIPELINE
     std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
     LLMPipeline pipe(model_path);
-    // std::cout << pipe.call("Alan Turing was a");
+    std::cout << pipe.call("Alan Turing was a");
 
     // GENERATE
-    ov::Core core;
-    core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
-    ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
-    ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
-
-    // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
-    std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml");
-    ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
+    // ov::Core core;
+    // core.add_extension(OPENVINO_TOKENIZERS_PATH);  // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+    // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+    // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+
+    // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
+    // std::shared_ptr model = core.read_model(model_path + "/openvino_model.xml");
+    // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
 
-    auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
-    SamplingParameters sampling_params = SamplingParameters::beam_search();
-    LLMEngine engine(request);
-    GenerationResult generation_results = engine.generate(input_ids, sampling_params);
-    std::cout << detokenize(detokenizer, generation_results[0]);
+    // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
+    // GenerationConfig sampling_params = GenerationConfig::beam_search();
+    // LLMEngine engine(request);
+    // GenerationResult generation_results = engine.generate(input_ids, sampling_params);
+    // std::cout << detokenize(detokenizer, generation_results[0]);
+
+    // std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
+    // LLMPipeline pipe(model_path);
+    // GenerationConfig params;
+    // std::cout << pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2));
+
+    // LLMEngine engine(request);
+    // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2);
+    // GenerationResult generation_results = engine.generate(input_ids, params);
+    // std::cout << detokenize(detokenizer, generation_results[0]);
 
 } catch (const std::exception& error) {
     std::cerr << error.what() << '\n';
diff --git a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
index b12a25bbe7..5cd42aa4b9 100644
--- a/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
+++ b/text_generation/causal_lm/cpp/generate_pipeline/sampling_parameters.hpp
@@ -12,9 +12,9 @@
 // forward declaration
 class Sequence;
 
-// SamplingParameters is similar to HuggingFace GenerationConfig
-// and has parameters that are not present in the original SamplingParameters for continous batching
-struct SamplingParameters {
+// Similar to HuggingFace GenerationConfig
+// but has parameters that are not present in the original SamplingParameters for continous batching
+struct GenerationConfig {
     // Generic
     size_t max_new_tokens = 10;
     size_t max_length = 100; // max_new tokens should have priority over max_new_tokens
@@ -43,9 +43,9 @@ struct SamplingParameters {
     int64_t eos_token_id = 0;
     int64_t pad_token_id = 0;
 
-    SamplingParameters() = default;
+    GenerationConfig() = default;
 
-    SamplingParameters(std::string json_path) {
+    GenerationConfig(std::string json_path) {
         std::ifstream f(json_path);
         nlohmann::json data = nlohmann::json::parse(f);
 
@@ -54,6 +54,7 @@ struct SamplingParameters {
         max_length = data.value("max_length", 0);
         pad_token_id = data.value("pad_token_id", 0);
         num_return_sequences = data.value("num_return_sequences", 1);
+        max_new_tokens = data.value("max_new_tokens", 100);
 
         temperature = data.value("temperature", 0.0f);
         do_sample = data.value("do_sample", false);
@@ -66,15 +67,15 @@ struct SamplingParameters {
         group_size = num_beams / n_groups;
     }
 
-    static SamplingParameters greedy() {
-        SamplingParameters greedy_params;
+    static GenerationConfig greedy() {
+        GenerationConfig greedy_params;
         greedy_params.temperature = 0.0f;
         greedy_params.ignore_eos = true;
         return greedy_params;
     }
 
-    static SamplingParameters beam_search() {
-        SamplingParameters beam_search;
+    static GenerationConfig beam_search() {
+        GenerationConfig beam_search;
         beam_search.n_groups = 3;
         beam_search.group_size = 5;
         beam_search.max_new_tokens = 10;
@@ -82,8 +83,8 @@ struct SamplingParameters {
         return beam_search;
     }
 
-    static SamplingParameters multimomial() {
-        SamplingParameters multimomial;
+    static GenerationConfig multimomial() {
+        GenerationConfig multimomial;
         multimomial.temperature = 0.8f;
         multimomial.top_p = 0.8;
         multimomial.top_k = 20;
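
Usage note (not part of the patch): the sketch below is a minimal, hypothetical example of how the renamed API could be exercised once this change lands. It assumes the generate_pipeline.hpp and sampling_parameters.hpp headers shown above are on the include path, that pipe.call() returns a printable string as in the updated main.cpp, and that the model directory passed on the command line holds the OpenVINO IR plus tokenizer/detokenizer models. The fluent setters (params.temperature(0.2).top_k(4)...) appear only in commented-out code in this patch, so the sketch sticks to the static presets and plain data members that GenerationConfig clearly defines.

// Hypothetical usage sketch of the renamed GenerationConfig / LLMPipeline API.
// Header names and the model-directory layout are assumptions, not part of the patch.
#include <iostream>
#include <string>

#include "generate_pipeline.hpp"    // assumed: defines LLMEngine and LLMPipeline
#include "sampling_parameters.hpp"  // assumed: defines GenerationConfig after the rename

int main(int argc, char* argv[]) try {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <model_dir>\n";
        return 1;
    }
    std::string model_path = argv[1];  // directory with openvino_model.xml and generation_config.json

    // High-level path: the pipeline loads its GenerationConfig from the model directory.
    LLMPipeline pipe(model_path);
    std::cout << pipe.call("Alan Turing was a") << '\n';

    // A preset from the renamed struct; its plain data members can be adjusted directly.
    GenerationConfig config = GenerationConfig::beam_search();
    config.max_new_tokens = 32;
    config.n_groups = 2;
    config.group_size = 4;

    // Lower-level path, as in the commented-out main.cpp code: create an
    // ov::InferRequest for the model, tokenize the prompt, call
    // LLMEngine::generate(input_ids, config), then detokenize the first result.
    // (Omitted here because it needs the tokenizer/detokenizer infer requests.)
    return 0;
} catch (const std::exception& error) {
    std::cerr << error.what() << '\n';
    return 1;
}

Once the chained setters sketched in main.cpp exist, the same call site could instead pass pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2)).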