Commit

rename to GenerationConfig
pavel-esir committed Apr 4, 2024
1 parent 670749f commit b8026a9
Showing 3 changed files with 44 additions and 33 deletions.
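In short, every call site that previously took a SamplingParameters now takes a GenerationConfig. A minimal before/after sketch (assuming the LLMEngine/GenerationConfig headers from this change are included, and that `request` and `input_ids` are prepared as in main.cpp below):

    // Before this commit:
    //   SamplingParameters sampling_params = SamplingParameters::beam_search();
    // After this commit:
    GenerationConfig sampling_params = GenerationConfig::beam_search();

    LLMEngine engine(request);                                      // ov::InferRequest for the compiled LLM
    GenerationResult results = engine.generate(input_ids, sampling_params);  // same call, new config type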
@@ -14,7 +14,7 @@ using GenerationResult = std::vector<std::vector<int64_t>>;
class LLMEngine {
ov::InferRequest m_model_runner;

- GenerationResult greedy_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+ GenerationResult greedy_search(ov::Tensor prompts, GenerationConfig sampling_params) {
ov::Shape prompts_shape = prompts.get_shape();
size_t batch_size = prompts_shape[0];
OPENVINO_ASSERT(batch_size == 1);
@@ -60,7 +60,7 @@ class LLMEngine {
return results;
}

- GenerationResult beam_search(ov::Tensor prompts, SamplingParameters sampling_params) {
+ GenerationResult beam_search(ov::Tensor prompts, GenerationConfig sampling_params) {
ov::Shape prompts_shape = prompts.get_shape();
size_t batch_size = prompts_shape[0];
// todo: implement for batch > 1
@@ -120,7 +120,7 @@ class LLMEngine {
}
}

- auto compare_scores = [](Beam left, Beam right) { return (left.score < right.score); };
+ auto compare_scores = [](Beam left, Beam right) { return (left.score > right.score); };
std::sort(beams.begin(), beams.end(), compare_scores);

GenerationResult results;
@@ -130,7 +130,7 @@ class LLMEngine {
return results;
}

- GenerationResult multinomial_sampling(ov::Tensor prompts, SamplingParameters sampling_params) {
+ GenerationResult multinomial_sampling(ov::Tensor prompts, GenerationConfig sampling_params) {
// todo: implement
GenerationResult results;
return results;
@@ -145,7 +145,7 @@ class LLMEngine {
LLMEngine() = default;

// more high level interface
- GenerationResult generate(ov::Tensor prompts, SamplingParameters sampling_params) {
+ GenerationResult generate(ov::Tensor prompts, GenerationConfig sampling_params) {
if (sampling_params.is_gready_sampling()) {
return greedy_search(prompts, sampling_params);
} else if (sampling_params.is_beam_search()) {
@@ -247,14 +247,14 @@ class LLMPipeline {
ov::InferRequest m_tokenizer;
ov::InferRequest m_detokenizer;
std::string m_path;
- SamplingParameters m_sampling_parameters;
+ GenerationConfig m_sampling_parameters;

public:
LLMPipeline(std::string& path) : m_path(path) {
if (std::experimental::filesystem::exists(m_path + "/generation_config.json")) {
- m_sampling_parameters = SamplingParameters(m_path + "/generation_config.json");
+ m_sampling_parameters = GenerationConfig(m_path + "/generation_config.json");
}
- m_sampling_parameters = SamplingParameters(m_path + "/generation_config_beam.json");
+ m_sampling_parameters = GenerationConfig(m_path + "/generation_config_beam.json");

ov::Core core;
// The model can be compiled for GPU as well
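For reference, a minimal sketch of driving the high-level LLMPipeline defined above (the model path is a placeholder; pipe.call() is the entry point exercised in main.cpp below):

    std::string model_path = "<path to the exported model IR>";
    LLMPipeline pipe(model_path);                 // picks up generation_config*.json next to the model
    std::cout << pipe.call("Alan Turing was a");  // generation settings come from m_sampling_parameters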
38 changes: 24 additions & 14 deletions text_generation/causal_lm/cpp/generate_pipeline/main.cpp
@@ -15,23 +15,33 @@ int main(int argc, char* argv[]) try {
// PIPELINE
std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
LLMPipeline pipe(model_path);
- // std::cout << pipe.call("Alan Turing was a");
+ std::cout << pipe.call("Alan Turing was a");

// GENERATE
- ov::Core core;
- core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
- ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
- ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
-
- // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
- std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
- ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();
+ // ov::Core core;
+ // core.add_extension(OPENVINO_TOKENIZERS_PATH); // OPENVINO_TOKENIZERS_PATH is defined in CMakeLists.txt
+ // ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
+ // ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();
+
+ // // todo: beam search does not work properly on GPU, when reshaped from batch=1 to batch=num_beams not broadcasted
+ // std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
+ // ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();

- auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
- SamplingParameters sampling_params = SamplingParameters::beam_search();
- LLMEngine engine(request);
- GenerationResult generation_results = engine.generate(input_ids, sampling_params);
- std::cout << detokenize(detokenizer, generation_results[0]);
+ // auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
+ // GenerationConfig sampling_params = GenerationConfig::beam_search();
+ // LLMEngine engine(request);
+ // GenerationResult generation_results = engine.generate(input_ids, sampling_params);
+ // std::cout << detokenize(detokenizer, generation_results[0]);
+
+ // std::string model_path = "/home/epavel/devel/openvino.genai/text_generation/causal_lm/TinyLlama-1.1B-Chat-v1.0/pytorch/dldt/FP16/";
+ // LLMPipeline pipe(model_path);
+ // GenerationConfig params;
+ // std::cout << pipe("Alan Turing was a", params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2));
+
+ // LLMEngine engine(request);
+ // GenerationConfig params.temperature(0.2).top_k(4).do_sample(true).repetition_penalty(1.2);
+ // GenerationResult generation_results = engine.generate(input_ids, params);
+ // std::cout << detokenize(detokenizer, generation_results[0]);

} catch (const std::exception& error) {
std::cerr << error.what() << '\n';
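Spelled out with the renamed type, the commented-out lower-level path above would look roughly as follows. This is a sketch only: tokenize/detokenize and OPENVINO_TOKENIZERS_PATH come from this sample's sources and CMakeLists.txt, and model_path is as declared at the top of main().

    ov::Core core;
    core.add_extension(OPENVINO_TOKENIZERS_PATH);
    ov::InferRequest tokenizer = core.compile_model(model_path + "/openvino_tokenizer.xml", "CPU").create_infer_request();
    ov::InferRequest detokenizer = core.compile_model(model_path + "/openvino_detokenizer.xml", "CPU").create_infer_request();

    std::shared_ptr<ov::Model> model = core.read_model(model_path + "/openvino_model.xml");
    ov::InferRequest request = core.compile_model(model, "CPU").create_infer_request();

    auto [input_ids, attention_mask] = tokenize(tokenizer, "Alan Turing was a");
    GenerationConfig sampling_params = GenerationConfig::beam_search();
    LLMEngine engine(request);
    GenerationResult generation_results = engine.generate(input_ids, sampling_params);
    std::cout << detokenize(detokenizer, generation_results[0]);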
@@ -12,9 +12,9 @@
// forward declaration
class Sequence;

- // SamplingParameters is similar to HuggingFace GenerationConfig
- // and has parameters that are not present in the original SamplingParameters for continous batching
- struct SamplingParameters {
+ // Similar to HuggingFace GenerationConfig
+ // but has parameters that are not present in the original SamplingParameters for continuous batching
+ struct GenerationConfig {
// Generic
size_t max_new_tokens = 10;
size_t max_length = 100; // max_new_tokens should take priority over max_length
@@ -43,9 +43,9 @@ struct SamplingParameters {
int64_t eos_token_id = 0;
int64_t pad_token_id = 0;

- SamplingParameters() = default;
+ GenerationConfig() = default;

- SamplingParameters(std::string json_path) {
+ GenerationConfig(std::string json_path) {
std::ifstream f(json_path);
nlohmann::json data = nlohmann::json::parse(f);

@@ -54,6 +54,7 @@ struct SamplingParameters {
max_length = data.value("max_length", 0);
pad_token_id = data.value("pad_token_id", 0);
num_return_sequences = data.value("num_return_sequences", 1);
+ max_new_tokens = data.value("max_new_tokens", 100);

temperature = data.value("temperature", 0.0f);
do_sample = data.value("do_sample", false);
@@ -66,24 +67,24 @@ struct SamplingParameters {
group_size = num_beams / n_groups;
}

- static SamplingParameters greedy() {
- SamplingParameters greedy_params;
+ static GenerationConfig greedy() {
+ GenerationConfig greedy_params;
greedy_params.temperature = 0.0f;
greedy_params.ignore_eos = true;
return greedy_params;
}

- static SamplingParameters beam_search() {
- SamplingParameters beam_search;
+ static GenerationConfig beam_search() {
+ GenerationConfig beam_search;
beam_search.n_groups = 3;
beam_search.group_size = 5;
beam_search.max_new_tokens = 10;
beam_search.diversity_penalty = 2.0f;
return beam_search;
}

- static SamplingParameters multimomial() {
- SamplingParameters multimomial;
+ static GenerationConfig multimomial() {
+ GenerationConfig multimomial;
multimomial.temperature = 0.8f;
multimomial.top_p = 0.8;
multimomial.top_k = 20;
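A minimal usage sketch of the renamed struct (assumes this header is included; the JSON path is a placeholder, and every field used below appears in the struct above):

    // Load HuggingFace-style settings shipped next to the model.
    GenerationConfig from_json(model_path + "/generation_config.json");

    // Or start from a preset and tweak individual fields.
    GenerationConfig beam = GenerationConfig::beam_search();
    beam.max_new_tokens = 64;
    beam.diversity_penalty = 1.5f;

    GenerationConfig sampled = GenerationConfig::multimomial(); // spelling as in the header
    sampled.temperature = 0.7f;
    sampled.top_k = 50;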
