diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
index 3d48f0dc77a3bc..370b429700bcec 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp
@@ -63,6 +63,37 @@ void registerNPUWLLMOptions(OptionsDesc& desc);
         } \
     };
 
+#define DEFINE_ANYMAP_OPT(Name, PropertyKey) /* ov::AnyMap option Name keyed by ov::intel_npu::PropertyKey */ \
+    struct Name final : OptionBase {                                          \
+        static std::string_view key() {                                       \
+            return ov::intel_npu::PropertyKey.name(); /* this option's own key, not a shared one */ \
+        }                                                                     \
+        /* Type name is "::intel_npu::" plus the stringized PropertyKey. */   \
+        static constexpr std::string_view getTypeName() {                     \
+            return "::intel_npu::" #PropertyKey;                              \
+        }                                                                     \
+        /* The default value is an empty map. */                              \
+        static ov::AnyMap defaultValue() {                                    \
+            return {};                                                        \
+        }                                                                     \
+        /* Parsing is delegated to ov::npuw::s11n::stringToAnyMap. */         \
+        static ov::AnyMap parse(std::string_view val) {                       \
+            return ov::npuw::s11n::stringToAnyMap(std::string(val));          \
+        }                                                                     \
+        /* Serialization is delegated to ov::npuw::s11n::anyMapToString. */   \
+        static std::string toString(const ov::AnyMap& val) {                  \
+            return ov::npuw::s11n::anyMapToString(val);                       \
+        }                                                                     \
+        /* Reports OptionMode::RunTime. */                                    \
+        static OptionMode mode() {                                            \
+            return OptionMode::RunTime;                                       \
+        }                                                                     \
+        /* Reports the option as non-public. */                               \
+        static bool isPublic() {                                              \
+            return false;                                                     \
+        }                                                                     \
+    };
+
 DEFINE_OPT(NPU_USE_NPUW, bool, false, use_npuw, RunTime);
 DEFINE_OPT(NPUW_DEVICES, std::string, "NPU,CPU", npuw::devices, RunTime);
 DEFINE_OPT(NPUW_SUBMODEL_DEVICE, std::string, "", npuw::submodel_device, RunTime);
@@ -109,10 +140,16 @@ DEFINE_OPT(NPUW_LLM_SEQ_LEN_DIM, uint32_t, 2, npuw::llm::seq_len_dim, RunTime);
 DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, RunTime);
 DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, RunTime);
 DEFINE_OPT(NPUW_LLM_OPTIMIZE_V_TENSORS, bool, true, npuw::llm::optimize_v_tensors, RunTime);
-DEFINE_OPT(NPUW_LLM_CACHE_ROPE, bool, true, npuw::llm::cache_rope, CompileTime);
+DEFINE_OPT(NPUW_LLM_CACHE_ROPE, bool, true, npuw::llm::cache_rope, RunTime);
 DEFINE_OPT(NPUW_LLM_PREFILL_CHUNK_SIZE, uint64_t, 1024, npuw::llm::prefill_chunk_size, RunTime);
-DEFINE_OPT(NPUW_LLM_SHARED_HEAD, bool, true, npuw::llm::shared_lm_head, CompileTime); +DEFINE_OPT(NPUW_LLM_SHARED_HEAD, bool, true, npuw::llm::shared_lm_head, RunTime); DEFINE_OPT(NPUW_LLM_MAX_LORA_RANK, uint32_t, 32, npuw::llm::max_lora_rank, RunTime); +DEFINE_ANYMAP_OPT(NPUW_LLM_PREFILL_CONFIG, npuw::llm::prefill_config); +DEFINE_ANYMAP_OPT(NPUW_LLM_ADDITIONAL_PREFILL_CONFIG, npuw::llm::additional_prefill_config); +DEFINE_ANYMAP_OPT(NPUW_LLM_GENERATE_CONFIG, npuw::llm::generate_config); +DEFINE_ANYMAP_OPT(NPUW_LLM_ADDITIONAL_GENERATE_CONFIG, npuw::llm::additional_generate_config); +DEFINE_ANYMAP_OPT(NPUW_LLM_SHARED_LM_HEAD_CONFIG, npuw::llm::shared_lm_head_config); +DEFINE_ANYMAP_OPT(NPUW_LLM_ADDITIONAL_SHARED_LM_HEAD_CONFIG, npuw::llm::additional_shared_lm_head_config); namespace npuw { namespace llm { @@ -216,64 +253,4 @@ struct NPUW_LLM_GENERATE_HINT final : OptionBase { - static std::string_view key() { - return ov::intel_npu::npuw::llm::prefill_config.name(); - } - - static constexpr std::string_view getTypeName() { - return "::intel_npu::npuw::llm::prefill_config"; - } - - static ov::AnyMap defaultValue() { - return {}; - } - - static ov::AnyMap parse(std::string_view val) { - return ov::npuw::s11n::stringToAnyMap(std::string(val)); - } - - static std::string toString(const ov::AnyMap& val) { - return ov::npuw::s11n::anyMapToString(val); - } - - static OptionMode mode() { - return OptionMode::RunTime; - } - - static bool isPublic() { - return false; - } -}; - -struct NPUW_LLM_GENERATE_CONFIG final : OptionBase { - static std::string_view key() { - return ov::intel_npu::npuw::llm::generate_config.name(); - } - - static constexpr std::string_view getTypeName() { - return "::intel_npu::npuw::llm::generate_config"; - } - - static ov::AnyMap defaultValue() { - return {}; - } - - static ov::AnyMap parse(std::string_view val) { - return ov::npuw::s11n::stringToAnyMap(std::string(val)); - } - - static std::string toString(const ov::AnyMap& val) { - return 
ov::npuw::s11n::anyMapToString(val); - } - - static OptionMode mode() { - return OptionMode::RunTime; - } - - static bool isPublic() { - return false; - } -}; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/plugin/include/properties.hpp b/src/plugins/intel_npu/src/plugin/include/properties.hpp index d6f0b5f04fa9c3..c5dedacd768eb6 100644 --- a/src/plugins/intel_npu/src/plugin/include/properties.hpp +++ b/src/plugins/intel_npu/src/plugin/include/properties.hpp @@ -113,10 +113,18 @@ class Properties final { ov::intel_npu::npuw::llm::max_prompt_len.name(), ov::intel_npu::npuw::llm::min_response_len.name(), ov::intel_npu::npuw::llm::optimize_v_tensors.name(), + ov::intel_npu::npuw::llm::cache_rope.name(), + ov::intel_npu::npuw::llm::prefill_chunk_size.name(), + ov::intel_npu::npuw::llm::shared_lm_head.name(), + ov::intel_npu::npuw::llm::max_lora_rank.name(), ov::intel_npu::npuw::llm::prefill_hint.name(), ov::intel_npu::npuw::llm::prefill_config.name(), + ov::intel_npu::npuw::llm::additional_prefill_config.name(), ov::intel_npu::npuw::llm::generate_hint.name(), - ov::intel_npu::npuw::llm::generate_config.name()}; + ov::intel_npu::npuw::llm::generate_config.name(), + ov::intel_npu::npuw::llm::additional_generate_config.name(), + ov::intel_npu::npuw::llm::shared_lm_head_config.name(), + ov::intel_npu::npuw::llm::additional_shared_lm_head_config.name()}; const std::vector _internalSupportedProperties = {ov::internal::caching_properties.name(), ov::internal::caching_with_mmap.name()}; diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 4a6206c3731b4e..dd9cb1bbb4d224 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -332,10 +332,17 @@ void Plugin::init_options() { REGISTER_OPTION(NPUW_LLM_MIN_RESPONSE_LEN); REGISTER_OPTION(NPUW_LLM_OPTIMIZE_V_TENSORS); REGISTER_OPTION(NPUW_LLM_CACHE_ROPE); + 
REGISTER_OPTION(NPUW_LLM_PREFILL_CHUNK_SIZE); + REGISTER_OPTION(NPUW_LLM_SHARED_HEAD); + REGISTER_OPTION(NPUW_LLM_MAX_LORA_RANK); REGISTER_OPTION(NPUW_LLM_PREFILL_HINT); REGISTER_OPTION(NPUW_LLM_PREFILL_CONFIG); + REGISTER_OPTION(NPUW_LLM_ADDITIONAL_PREFILL_CONFIG); REGISTER_OPTION(NPUW_LLM_GENERATE_HINT); REGISTER_OPTION(NPUW_LLM_GENERATE_CONFIG); + REGISTER_OPTION(NPUW_LLM_ADDITIONAL_GENERATE_CONFIG); + REGISTER_OPTION(NPUW_LLM_SHARED_LM_HEAD_CONFIG); + REGISTER_OPTION(NPUW_LLM_ADDITIONAL_SHARED_LM_HEAD_CONFIG); } void Plugin::filter_config_by_compiler_support(FilteredConfig& cfg) const { diff --git a/src/plugins/intel_npu/src/plugin/src/properties.cpp b/src/plugins/intel_npu/src/plugin/src/properties.cpp index 507048ac090dcf..f04713980e6ed2 100644 --- a/src/plugins/intel_npu/src/plugin/src/properties.cpp +++ b/src/plugins/intel_npu/src/plugin/src/properties.cpp @@ -448,11 +448,21 @@ void Properties::registerPluginProperties() { TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::cache_rope, NPUW_LLM_CACHE_ROPE); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_chunk_size, NPUW_LLM_PREFILL_CHUNK_SIZE); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::shared_lm_head, NPUW_LLM_SHARED_HEAD); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::max_lora_rank, NPUW_LLM_MAX_LORA_RANK); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_hint, NPUW_LLM_PREFILL_HINT); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::prefill_config, NPUW_LLM_PREFILL_CONFIG); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::additional_prefill_config, + NPUW_LLM_ADDITIONAL_PREFILL_CONFIG); 
TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT); TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::generate_config, NPUW_LLM_GENERATE_CONFIG); - TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::cache_rope, NPUW_LLM_CACHE_ROPE); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::additional_generate_config, + NPUW_LLM_ADDITIONAL_GENERATE_CONFIG); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::shared_lm_head_config, NPUW_LLM_SHARED_LM_HEAD_CONFIG); + TRY_REGISTER_SIMPLE_PROPERTY(ov::intel_npu::npuw::llm::additional_shared_lm_head_config, + NPUW_LLM_ADDITIONAL_SHARED_LM_HEAD_CONFIG); // 2. Metrics (static device and enviroment properties) // ========