Added possibility to pass PAD_TOKEN_ID #28458

Open: wants to merge 1 commit into base: master
@@ -70,6 +70,7 @@ DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
DEFINE_OPT(NPUW_LLM_BATCH_DIM, uint32_t, 0, npuw::llm::batch_dim, CompileTime);
DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, CompileTime);
+DEFINE_OPT(NPUW_LLM_PAD_TOKEN_ID, int64_t, 0, npuw::llm::pad_token_id, CompileTime);
DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, false, npuw::llm::optimize_v_tensors, CompileTime);
@@ -407,6 +407,16 @@ static constexpr ov::Property<uint32_t> batch_dim{"NPUW_LLM_BATCH_DIM"};
*/
static constexpr ov::Property<uint32_t> seq_len_dim{"NPUW_LLM_SEQ_LEN_DIM"};

+/**
+ * @brief
+ * TODO: Check that it is indeed needed, or remove.
+ * Type: int64_t.
+ * Pad token id, required to fill the input of the prefill model until the
+ * useful tokens are reached.
+ * Default value: 0.
+ */
+static constexpr ov::Property<int64_t> pad_token_id{"NPUW_LLM_PAD_TOKEN_ID"};

/**
* @brief
* Type: uint32_t.
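For reference, here is a minimal sketch of how a caller could pass the new option alongside the existing NPUW LLM properties. Only the NPUW_LLM_PAD_TOKEN_ID key comes from this PR; the model path, the pad token value, and the surrounding configuration keys are illustrative assumptions.

```cpp
#include <openvino/openvino.hpp>
#include <cstdint>

int main() {
    ov::Core core;

    // Assumed: a stateful LLM IR intended for the NPUW LLM pipeline.
    auto model = core.read_model("openvino_model.xml");

    ov::AnyMap config = {
        {"NPU_USE_NPUW", "YES"},                // route compilation through NPUW (existing option)
        {"NPUW_LLM", "YES"},                    // enable the LLM pipeline (existing option)
        {"NPUW_LLM_PAD_TOKEN_ID", int64_t{2}},  // new option from this PR; value would come from
                                                // the tokenizer's pad_token_id (2 is illustrative)
        {"NPUW_LLM_MAX_PROMPT_LEN", uint32_t{1024}},
        {"NPUW_LLM_MIN_RESPONSE_LEN", uint32_t{128}},
    };

    auto compiled = core.compile_model(model, "NPU", config);
    (void)compiled;
    return 0;
}
```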
src/plugins/intel_npu/src/al/src/config/npuw.cpp (1 change: 1 addition & 0 deletions)
@@ -59,6 +59,7 @@ void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) {
desc.add<NPUW_LLM>();
desc.add<NPUW_LLM_BATCH_DIM>();
desc.add<NPUW_LLM_SEQ_LEN_DIM>();
+desc.add<NPUW_LLM_PAD_TOKEN_ID>();
desc.add<NPUW_LLM_MAX_PROMPT_LEN>();
desc.add<NPUW_LLM_MIN_RESPONSE_LEN>();
desc.add<NPUW_LLM_OPTIMIZE_V_TENSORS>();
@@ -452,7 +452,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
m_cfg.update(any_copy(npuw_llm_props));

LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
-auto kvcache_model = model->clone();
+auto kvcache_model = model;
LOG_DEBUG("2. Transform kvcache model from stateful to stateless.");
ov::pass::StatefulToStateless().run_on_model(kvcache_model);
LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one.");
@@ -465,7 +465,7 @@
const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u);
const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u);

-m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
+m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, false};
LOG_DEBUG("4. Make prefill model with static shapes");
reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes);
LOG_DEBUG("5. Make kvcache model with static shapes");
@@ -713,6 +713,7 @@ void ov::npuw::LLMCompiledModel::implement_properties() {
m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get),
BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get),
BIND(npuw::llm::batch_dim, NPUW_LLM_SEQ_LEN_DIM, get),
+BIND(npuw::llm::pad_token_id, NPUW_LLM_PAD_TOKEN_ID, get),
BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get),
BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get),
BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get),
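Given the BIND entry added above, the new option should also be readable back from the compiled model like the other NPUW LLM properties. A hedged sketch, assuming a model compiled with the configuration shown earlier (the helper name is made up for illustration):

```cpp
#include <openvino/openvino.hpp>
#include <cstdint>

// Sketch: read the pad token id back from an already compiled NPUW LLM model.
// Whether the internal option is forwarded to the public get_property() is an assumption.
int64_t query_pad_token_id(const ov::CompiledModel& compiled) {
    return compiled.get_property("NPUW_LLM_PAD_TOKEN_ID").as<int64_t>();
}
```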
@@ -124,7 +124,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
}

void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
-fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
+const auto pad_token_id = m_npuw_llm_compiled_model->m_cfg.get<::intel_npu::NPUW_LLM_PAD_TOKEN_ID>();
+fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), pad_token_id);
@ilya-lavrenov (Contributor) commented on Jan 15, 2025:

Does it really matter what the pad token value is if it's masked off via attention_mask?

fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
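To make the review question concrete, here is a rough stand-in (not the plugin's actual helper) for what the fill_tensor calls do in prepare_for_new_conversation(): the whole prefill input_ids buffer is preset to pad_token_id while attention_mask is reset to 0, which is exactly why the reviewer asks whether the pad value itself matters.

```cpp
#include <openvino/runtime/tensor.hpp>
#include <algorithm>
#include <cstdint>

// Illustrative stand-in for the plugin's fill_tensor<T>() helper:
// overwrite every element of the tensor with the given value.
template <typename T>
void fill_tensor_sketch(ov::Tensor& tensor, T value) {
    std::fill_n(tensor.data<T>(), tensor.get_size(), value);
}

// Usage mirroring prepare_for_new_conversation(): pad the whole input_ids
// buffer and zero the attention mask, so padded positions are ignored until
// real prompt tokens (and mask values) are written in later.
void reset_prefill_inputs(ov::Tensor& input_ids, ov::Tensor& attention_mask, int64_t pad_token_id) {
    fill_tensor_sketch<int64_t>(input_ids, pad_token_id);
    fill_tensor_sketch<int64_t>(attention_mask, 0);
}
```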