diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 51b6b8ca9fe28c..2510c010397837 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -70,6 +70,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
 DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
 DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
 DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
 DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
index bbf7073a04656b..724386b8ff6af9 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp
@@ -407,6 +407,16 @@ static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
  */
 static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};
 
+/**
+ * @brief
+ * TODO: Check that it is indeed needed, or remove.
+ * Type: int64_t.
+ * Pad token id, required to fill the input of the prefill model until the
+ * useful tokens are provided.
+ * Default value: 0.
+ */
+static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};
+
 /**
  * @brief
  * Type: uint32_t.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index 0ed344596fea3b..9a0cbd4bae3293 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -59,6 +59,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
     desc.add<NPUW_LLM>();
     desc.add<NPUW_LLM_BATCH_DIM>();
     desc.add<NPUW_LLM_SEQ_LEN_DIM>();
+    desc.add<NPUW_LLM_PAD_TOKEN_ID>();
     desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
     desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
     desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 0232811dfb800e..513863a4f939dc 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -452,7 +452,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
     m_cfg.update(any_copy(npuw_llm_props));
 
     LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
-    auto kvcache_model = model->clone();
+    auto kvcache_model = model;
     LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
     ov::pass::StatefulToStateless().run_on_model(kvcache_model);
     LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
@@ -465,7 +465,7 @@
     const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
     const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);
 
-    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
+    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, false};
     LOG_DEBUG("4. Make prefill model with static shapes");
     reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
     LOG_DEBUG("5. Make kvcache model with static shapes");
@@ -713,6 +713,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
     m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
                           BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
                           BIND(npuw::llm::batch_dim, NPUW_LLM_SEQ_LEN_DIM, get),
+                          BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get),
                           BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
                           BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
                           BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
index 2e987036483e34..bcefd335250975 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp
@@ -124,7 +124,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMC
-    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
+    const auto pad_token_id = m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>();
+    fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), pad_token_id);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
     fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
     fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
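
For reviewers, a minimal usage sketch of the new option, not part of this patch. It assumes NPUW is routed through the NPU plugin's NPU_USE_NPUW switch and that properties are passed as plain key/value pairs; "model.xml", the prompt length, and the pad token value 2 are placeholders, while the NPUW_LLM* keys are the ones added or registered by this diff:

    // Hypothetical example: compile an LLM through NPUW with a non-zero pad
    // token id. With this patch, the prefill request's input_ids buffer is
    // pre-filled with NPUW_LLM_PAD_TOKEN_ID instead of a hard-coded 0.
    #include <cstdint>
    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        ov::AnyMap config = {
            {"NPU_USE_NPUW", "YES"},                // route compilation through NPUW
            {"NPUW_LLM", "YES"},                    // enable the LLM-specific pipeline
            {"NPUW_LLM_MAX_PROMPT_LEN", 1024u},     // existing option, default shown
            {"NPUW_LLM_PAD_TOKEN_ID", int64_t{2}},  // new option from this patch
        };
        auto compiled = core.compile_model("model.xml", "NPU", config);
        return 0;
    }

This matters for tokenizers whose pad token is not 0: previously the padded tail of the static-shape prefill input was always filled with token id 0, which may be a real vocabulary entry.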