Added possibility to pass PAD_TOKEN_ID #28458

Open: wants to merge 1 commit into base: master
@@ -70,6 +70,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
@@ -407,6 +407,16 @@ static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
*/
static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};

+/**
+ * @brief
+ * TODO: Check that it is indeed needed, or remove.
+ * Type: int64_t.
+ * Pad token id, required to fill the input of the prefill model until the
+ * useful tokens are reached.
+ * Default value: 0.
+ */
+static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};

/**
* @brief
* Type: uint32_t.
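For reference, here is a minimal sketch of how a caller could pass the new option alongside the existing NPUW LLM properties. Only the NPUW_LLM_PAD_TOKEN_ID key comes from this PR; the model path, the pad token value, and the surrounding configuration keys are illustrative assumptions.

```cpp
#include <openvino/openvino.hpp>
#include <cstdint>

int main() {
    ov::Core core;

    // Assumed: a stateful LLM IR intended for the NPUW LLM pipeline.
    auto model = core.read_model("openvino_model.xml");

    ov::AnyMap config = {
        {"NPU_USE_NPUW", "YES"},                // route compilation through NPUW (existing option)
        {"NPUW_LLM", "YES"},                    // enable the LLM pipeline (existing option)
        {"NPUW_LLM_PAD_TOKEN_ID", int64_t{2}},  // new option from this PR; value would come from
                                                // the tokenizer's pad_token_id (2 is illustrative)
        {"NPUW_LLM_MAX_PROMPT_LEN", uint32_t{1024}},
        {"NPUW_LLM_MIN_RESPONSE_LEN", uint32_t{128}},
    };

    auto compiled = core.compile_model(model, "NPU", config);
    (void)compiled;
    return 0;
}
```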
src/plugins/intel_npu/src/al/src/config/npuw.cpp (1 change: 1 addition & 0 deletions)
@@ -59,6 +59,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM>();
desc.add<NPUW_LLM_BATCH_DIM>();
desc.add<NPUW_LLM_SEQ_LEN_DIM>();
+desc.add<NPUW_LLM_PAD_TOKEN_ID>();
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
@@ -452,7 +452,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
m_cfg.update(any_copy(npuw_llm_props));

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
-auto kvcache_model = model->clone();
+auto kvcache_model = model;
LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
ov::pass::StatefulToStateless().run_on_model(kvcache_model);
LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
@@ -465,7 +465,7 @@
const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);

-m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
+m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, false};
LOG_DEBUG("4. Make prefill model with static shapes");
reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
LOG_DEBUG("5. Make kvcache model with static shapes");
@@ -713,6 +713,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
BIND(npuw::llm::batch_dim, NPUW_LLM_SEQ_LEN_DIM, get),
+BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get),
BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
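Given the BIND entry added above, the new option should also be readable back from the compiled model like the other NPUW LLM properties. A hedged sketch, assuming a model compiled with the configuration shown earlier (the helper name is made up for illustration):

```cpp
#include <openvino/openvino.hpp>
#include <cstdint>

// Sketch: read the pad token id back from an already compiled NPUW LLM model.
// Whether the internal option is forwarded to the public get_property() is an assumption.
int64_t query_pad_token_id(const ov::CompiledModel& compiled) {
    return compiled.get_property("NPUW_LLM_PAD_TOKEN_ID").as<int64_t>();
}
```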
@@ -124,7 +124,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
+const auto pad_token_id = m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>();
+fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), pad_token_id);
@ilya-lavrenov (Contributor) commented on Jan 15, 2025:

Does it really matter what the pad token value is if it's masked off via attention_mask?

fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
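To make the review question concrete, here is a rough stand-in (not the plugin's actual helper) for what the fill_tensor calls do in prepare_for_new_conversation(): the whole prefill input_ids buffer is preset to pad_token_id while attention_mask is reset to 0, which is exactly why the reviewer asks whether the pad value itself matters.

```cpp
#include <openvino/runtime/tensor.hpp>
#include <algorithm>
#include <cstdint>

// Illustrative stand-in for the plugin's fill_tensor<T>() helper:
// overwrite every element of the tensor with the given value.
template <typename T>
void fill_tensor_sketch(ov::Tensor& tensor, T value) {
    std::fill_n(tensor.data<T>(), tensor.get_size(), value);
}

// Usage mirroring prepare_for_new_conversation(): pad the whole input_ids
// buffer and zero the attention mask, so padded positions are ignored until
// real prompt tokens (and mask values) are written in later.
void reset_prefill_inputs(ov::Tensor& input_ids, ov::Tensor& attention_mask, int64_t pad_token_id) {
    fill_tensor_sketch<int64_t>(input_ids, pad_token_id);
    fill_tensor_sketch<int64_t>(attention_mask, 0);
}
```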