From b2eb2b5ad7090ad3b3e002b200104a82eeb2fa7f Mon Sep 17 00:00:00 2001 From: Richard Zou Date: Fri, 18 Jul 2025 14:10:21 -0400 Subject: [PATCH 01/57] [Kernel] Apply torch.Tag.needs_fixed_stride_order only for torch==2.6.0 (#19346) Signed-off-by: rzou --- csrc/torch_bindings.cpp | 12 ++++++++---- vllm/attention/ops/rocm_aiter_mla.py | 8 ++++++-- vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 23e9212a2f1d..79e2575974b5 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -20,13 +20,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops // - // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need + // The default behavior in PyTorch 2.6 was changed to "requires_contiguous", + // so we need // to override this for many GEMMs with the following tag. Otherwise, // torch.compile will force all input tensors to be contiguous(), which // will break many custom ops that require column-major weight matrices. - // TODO: remove this for PyTorch 2.8, when the default is planned to switch - // to match exact eager-mode strides. - at::Tag stride_tag = at::Tag::needs_fixed_stride_order; + // This was a bug and PyTorch 2.7 has since fixed this. +#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6 + #define stride_tag at::Tag::needs_fixed_stride_order +#else + #define stride_tag +#endif ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); diff --git a/vllm/attention/ops/rocm_aiter_mla.py b/vllm/attention/ops/rocm_aiter_mla.py index cce6b4639460..d91cda255ff3 100644 --- a/vllm/attention/ops/rocm_aiter_mla.py +++ b/vllm/attention/ops/rocm_aiter_mla.py @@ -6,7 +6,7 @@ import torch from vllm.platforms import current_platform -from vllm.utils import direct_register_custom_op +from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer def get_aiter_mla_metadata(max_batch_size: int, block_size: int, @@ -93,8 +93,12 @@ def mla_decode_fwd_fake( if current_platform.is_rocm(): + if is_torch_equal_or_newer("2.7.0"): + tags = () + else: + tags = (torch.Tag.needs_fixed_stride_order, ), direct_register_custom_op(op_name="rocm_aiter_mla_decode_fwd", op_func=mla_decode_fwd_impl, mutates_args=["o"], fake_impl=mla_decode_fwd_fake, - tags=[torch.Tag.needs_fixed_stride_order]) + tags=tags) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 459360260073..aec5d7b252e3 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -33,7 +33,7 @@ dequant_mxfp4) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton -from vllm.utils import direct_register_custom_op +from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled @@ -1056,7 +1056,8 @@ def inplace_fused_experts_fake( op_func=inplace_fused_experts, mutates_args=["hidden_states"], fake_impl=inplace_fused_experts_fake, - tags=(torch.Tag.needs_fixed_stride_order, ), + tags=(() if is_torch_equal_or_newer("2.7.0") else + (torch.Tag.needs_fixed_stride_order, )), ) @@ -1122,7 +1123,8 @@ def outplace_fused_experts_fake( op_func=outplace_fused_experts, mutates_args=[], fake_impl=outplace_fused_experts_fake, - 
tags=(torch.Tag.needs_fixed_stride_order, ), + tags=(() if is_torch_equal_or_newer("2.7.0") else + (torch.Tag.needs_fixed_stride_order, )), ) From 0f199f197b4e7a835ccc5b4d15363f8faa7824c8 Mon Sep 17 00:00:00 2001 From: JialinOuyang-Meta Date: Fri, 18 Jul 2025 12:34:40 -0700 Subject: [PATCH 02/57] [Core] Avoid KVCacheBlock.__eq__ invocations in FreeKVCacheBlockQueue (#21005) Signed-off-by: Jialin Ouyang --- benchmarks/kv_cache/benchmark_block_pool.py | 108 ++++++++++++++++++++ tests/v1/core/test_kv_cache_utils.py | 28 ++--- tests/v1/core/test_prefix_caching.py | 26 ++--- vllm/v1/core/kv_cache_utils.py | 106 +++++++++++++------ 4 files changed, 210 insertions(+), 58 deletions(-) create mode 100644 benchmarks/kv_cache/benchmark_block_pool.py diff --git a/benchmarks/kv_cache/benchmark_block_pool.py b/benchmarks/kv_cache/benchmark_block_pool.py new file mode 100644 index 000000000000..134551bb6128 --- /dev/null +++ b/benchmarks/kv_cache/benchmark_block_pool.py @@ -0,0 +1,108 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import gc +import time +from typing import Optional + +from tabulate import tabulate + +from vllm.utils import FlexibleArgumentParser +from vllm.v1.core.block_pool import BlockPool + + +class Metric: + def __init__(self) -> None: + self.cnt: int = 0 + self.sum_v: int = 0 + self.max_v: Optional[int] = None + + def update(self, v: int) -> None: + self.cnt += 1 + self.sum_v += v + if self.max_v is None: + self.max_v = v + else: + self.max_v = max(self.max_v, v) + + def avg_v(self) -> float: + return self.sum_v * 1.0 / self.cnt + + +def main(args): + rows = [] + for allocate_block in args.allocate_blocks: + # Enforce a GC collect ahead to minimize the impact among runs + gc.collect() + block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True) + + get_blocks_metric: Metric = Metric() + free_blocks_metric: Metric = Metric() + for _ in range(args.num_iteration): + t1 = time.monotonic_ns() + blocks = block_pool.get_new_blocks(allocate_block) + t2 = time.monotonic_ns() + block_pool.free_blocks(blocks) + t3 = time.monotonic_ns() + get_blocks_metric.update(t2 - t1) + free_blocks_metric.update(t3 - t2) + + if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None: + rows.append( + [ + get_blocks_metric.cnt, + args.num_gpu_blocks, + allocate_block, + get_blocks_metric.avg_v() / 1000000, + get_blocks_metric.max_v / 1000000.0, + free_blocks_metric.avg_v() / 1000000, + free_blocks_metric.max_v / 1000000.0, + ] + ) + else: + print( + "No valid metrics found." + f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}" + ) + + print( + tabulate( + rows, + headers=[ + "Iterations", + "Total\nBlocks", + "Allocated\nBlocks", + "Get Blocks\nAvg (ms)", + "Get Blocks\nMax (ms)", + "Free Blocks\nAvg (ms)", + "Free Blocks\nMax (ms)", + ], + tablefmt="grid", + floatfmt=".6f", + ) + ) + + +def invoke_main() -> None: + parser = FlexibleArgumentParser( + description="Benchmark the performance of BlockPool for KV Cache." 
+ ) + parser.add_argument("--num-gpu-blocks", type=int, default=100000) + parser.add_argument( + "--num-iteration", + type=int, + default=1000, + help="Number of iterations to run to stablize final data readings", + ) + parser.add_argument( + "--allocate-blocks", + type=int, + nargs="*", + default=[10, 50, 100, 500, 1000], + help="Number of blocks to allocate", + ) + args = parser.parse_args() + main(args) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 0676cb3eb65d..68b060156901 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -132,8 +132,8 @@ def test_free_kv_cache_block_queue_initialization(): block = KVCacheBlock(block_id=0) queue = FreeKVCacheBlockQueue([block]) assert queue.num_free_blocks == 1 - assert queue.free_list_head == block - assert queue.free_list_tail == block + assert queue.fake_free_list_head.next_free_block is block + assert queue.fake_free_list_tail.prev_free_block is block def test_free_kv_cache_block_queue_operations(): @@ -145,36 +145,38 @@ def test_free_kv_cache_block_queue_operations(): # Check initial state assert queue.num_free_blocks == 5 - assert queue.free_list_head == blocks[0] - assert queue.free_list_tail == blocks[4] + assert queue.fake_free_list_head.next_free_block is blocks[0] + assert queue.fake_free_list_tail.prev_free_block is blocks[4] # Pop the first block block1 = queue.popleft() assert block1 == blocks[0] assert queue.num_free_blocks == 4 - assert queue.free_list_head == blocks[1] - assert queue.free_list_tail == blocks[4] + assert queue.fake_free_list_head.next_free_block is blocks[1] + assert queue.fake_free_list_tail.prev_free_block is blocks[4] # Remove a block from the middle block_to_remove = blocks[2] queue.remove(block_to_remove) assert queue.num_free_blocks == 3 - assert blocks[1].next_free_block == blocks[3] - assert blocks[3].prev_free_block == blocks[1] + assert blocks[1].next_free_block is blocks[3] + assert blocks[3].prev_free_block is blocks[1] # Append a block back queue.append(block_to_remove) assert queue.num_free_blocks == 4 - assert queue.free_list_tail == block_to_remove - assert block_to_remove.prev_free_block == blocks[4] - assert block_to_remove.next_free_block is None + assert queue.fake_free_list_tail.prev_free_block is block_to_remove + assert block_to_remove.prev_free_block is blocks[4] + assert block_to_remove.next_free_block is queue.fake_free_list_tail # Pop blocks until empty for _ in range(4): queue.popleft() assert queue.num_free_blocks == 0 - assert queue.free_list_head is None - assert queue.free_list_tail is None + assert (queue.fake_free_list_head.next_free_block + is queue.fake_free_list_tail) + assert (queue.fake_free_list_tail.prev_free_block + is queue.fake_free_list_head) # Attempt to pop from an empty queue with pytest.raises(ValueError) as e: diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index f31bdf74f4a6..b7f583de1f63 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -155,13 +155,14 @@ def test_prefill(hash_algo): assert block.ref_cnt == 2 # At this point, we should have 5 free blocks left. - assert manager.block_pool.free_block_queue.num_free_blocks == 5 + free_block_queue = manager.block_pool.free_block_queue + assert free_block_queue.num_free_blocks == 5 manager.free(req0) manager.free(req1) # All blocks should be available. 
- assert manager.block_pool.free_block_queue.num_free_blocks == 10 + assert free_block_queue.num_free_blocks == 10 # The order should be # [unallocated (6, 7, 8, 9, 10)] # [unique_req0 (4)] @@ -188,14 +189,10 @@ def test_prefill(hash_algo): # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. - assert manager.block_pool.free_block_queue.num_free_blocks == 6 - assert all([ - b.ref_cnt == 0 - for b in manager.block_pool.free_block_queue.get_all_free_blocks() - ]) - assert len([ - b for b in manager.block_pool.free_block_queue.get_all_free_blocks() - ]) == 6 + assert free_block_queue.num_free_blocks == 6 + assert all( + [b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()]) + assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6 manager.free(req2) @@ -209,9 +206,12 @@ def test_prefill(hash_algo): computed_blocks) # This block ID order also checks the eviction order. assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], ) - assert manager.block_pool.free_block_queue.num_free_blocks == 0 - assert manager.block_pool.free_block_queue.free_list_head is None - assert manager.block_pool.free_block_queue.free_list_tail is None + + assert free_block_queue.num_free_blocks == 0 + assert (free_block_queue.fake_free_list_head.next_free_block + is free_block_queue.fake_free_list_tail) + assert (free_block_queue.fake_free_list_tail.prev_free_block + is free_block_queue.fake_free_list_head) def test_prefill_hybrid_model(): diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 6067a127e97f..b1fab0d34de4 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -212,27 +212,65 @@ class FreeKVCacheBlockQueue: def __init__(self, blocks: list[KVCacheBlock]) -> None: self.num_free_blocks = len(blocks) - # Initialize the doubly linked list of free blocks. - self.free_list_head: Optional[KVCacheBlock] = blocks[0] - self.free_list_tail: Optional[KVCacheBlock] = blocks[-1] + # Initialize doubly links of consecutive blocks for i in range(self.num_free_blocks): if i > 0: blocks[i].prev_free_block = blocks[i - 1] if i < self.num_free_blocks - 1: blocks[i].next_free_block = blocks[i + 1] + # Create a fake head and a tail block for the doubly linked list to + # reduce branching in the code + # + # The implementation garenteed that the fake head and tail + # are NEVER got popped, so we could safely assume each real blocks + # in the queue has prev and next blocks. + self.fake_free_list_head = KVCacheBlock(block_id=-1) + self.fake_free_list_tail = KVCacheBlock(block_id=-1) + if self.num_free_blocks > 0: + # Connect fake_head and fake_tail to the first and last block + # respectively. + self.fake_free_list_head.next_free_block = blocks[0] + blocks[0].prev_free_block = self.fake_free_list_head + self.fake_free_list_tail.prev_free_block = blocks[-1] + blocks[-1].next_free_block = self.fake_free_list_tail + else: + # For empty list, simply connect the fake head and tail. + self.fake_free_list_head.next_free_block = self.fake_free_list_tail + self.fake_free_list_tail.prev_free_block = self.fake_free_list_head + def popleft(self) -> KVCacheBlock: """Pop the first free block and reduce num_free_blocks by 1. Returns: The first free block. 
""" - if not self.free_list_head: + if (self.fake_free_list_head.next_free_block + is self.fake_free_list_tail + or self.fake_free_list_head.next_free_block is None): + assert self.num_free_blocks == 0, ( + f"num_free_blocks ({self.num_free_blocks}) is out of sync " + "with the free list.") raise ValueError("No free blocks available") - block = self.free_list_head - self.remove(block) - return block + first_block: KVCacheBlock = self.fake_free_list_head.next_free_block + + if first_block.next_free_block is None: + # This should not happen if the block is from the free list. + # It indicates a bug in the caller's logic. + raise RuntimeError("Invalid block found in popleft() " + "which doesn't have a valid next_free_block") + + # Connect fake_head and the next block of first_block (i.e. second block + # or fake tail). + self.fake_free_list_head.next_free_block = first_block.next_free_block + first_block.next_free_block.prev_free_block = self.fake_free_list_head + + # Remove the block from the linked list. + first_block.prev_free_block = first_block.next_free_block = None + + self.num_free_blocks -= 1 + return first_block def remove(self, block: KVCacheBlock) -> None: """Remove a block in the free list and reduce num_free_blocks by 1. @@ -240,19 +278,15 @@ def remove(self, block: KVCacheBlock) -> None: Args: block: The block to remove. """ - if block.prev_free_block is not None: - # Link the previous block to the next block. - block.prev_free_block.next_free_block = block.next_free_block - if block.next_free_block is not None: - # Link the next block to the previous block. - block.next_free_block.prev_free_block = block.prev_free_block - - if block == self.free_list_head: - # Update the head if the block is the head. - self.free_list_head = block.next_free_block - if block == self.free_list_tail: - # Update the tail if the block is the tail. - self.free_list_tail = block.prev_free_block + if block.prev_free_block is None or block.next_free_block is None: + # This should not happen if the block is from the free list. + # It indicates a bug in the caller's logic. + raise RuntimeError(f"remove() called on an invalid block: {block}") + + # Link the previous block to the next block. + block.prev_free_block.next_free_block = block.next_free_block + # Link the next block to the previous block. + block.next_free_block.prev_free_block = block.prev_free_block # Remove the block from the linked list. block.prev_free_block = block.next_free_block = None @@ -265,17 +299,19 @@ def append(self, block: KVCacheBlock) -> None: Args: block: The block to append. """ - if self.free_list_tail is not None: - # Link the last block to the new block. - self.free_list_tail.next_free_block = block - block.prev_free_block = self.free_list_tail - self.free_list_tail = block - else: - # The free list is empty. - assert self.free_list_head is None - self.free_list_head = self.free_list_tail = block + if self.fake_free_list_tail.prev_free_block is None: + raise RuntimeError( + "prev_free_block of fake_free_list_tail should always exist") + last_block: KVCacheBlock = self.fake_free_list_tail.prev_free_block + + # Connect the new block after the last block. + last_block.next_free_block = block + block.prev_free_block = last_block + + # Connect the fake tail after the new block. 
+ block.next_free_block = self.fake_free_list_tail + self.fake_free_list_tail.prev_free_block = block - block.next_free_block = None self.num_free_blocks += 1 def get_all_free_blocks(self) -> list[KVCacheBlock]: @@ -285,8 +321,14 @@ def get_all_free_blocks(self) -> list[KVCacheBlock]: A list of free blocks. """ ret = [] - curr_block = self.free_list_head - while curr_block is not None: + if self.fake_free_list_head.next_free_block is None: + raise RuntimeError( + "next_free_block of fake_free_list_head should always exist") + # Start from the first block + curr_block: KVCacheBlock = self.fake_free_list_head.next_free_block + # As long as next_free_block is available, we haven't reached to + # the fake tail yet. + while curr_block.next_free_block is not None: ret.append(curr_block) curr_block = curr_block.next_free_block return ret From 5782581acfa4dc89491904c7612f076c0cd5a646 Mon Sep 17 00:00:00 2001 From: hax0r31337 <65506006+hax0r31337@users.noreply.github.com> Date: Sat, 19 Jul 2025 00:40:18 +0200 Subject: [PATCH 03/57] [Bugfix] Voxtral on Blackwell GPUs (RTX 50 series) (#21077) Signed-off-by: hax0r31337 --- vllm/attention/layer.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index f9c2d4f49835..b6b93ff4a0ac 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -16,6 +16,7 @@ has_kv_transfer_group, is_v1_kv_transfer_group) from vllm.forward_context import ForwardContext, get_forward_context +from vllm.logger import init_logger from vllm.model_executor.layers.linear import UnquantizedLinearMethod from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) @@ -23,6 +24,34 @@ from vllm.platforms import _Backend, current_platform from vllm.utils import direct_register_custom_op +logger = init_logger(__name__) +USE_XFORMERS_OPS = None + + +def check_xformers_availability(): + global USE_XFORMERS_OPS + if USE_XFORMERS_OPS is not None: + return USE_XFORMERS_OPS + + if current_platform.is_cuda() and current_platform.has_device_capability( + 100): + # Xformers FA is not compatible with B200 + USE_XFORMERS_OPS = False + else: + try: + from importlib.util import find_spec + + find_spec("xformers.ops") + USE_XFORMERS_OPS = True + except ImportError: + USE_XFORMERS_OPS = False + + # the warning only needs to be shown once + if not USE_XFORMERS_OPS: + logger.warning("Xformers is not available, falling back.") + + return USE_XFORMERS_OPS + class Attention(nn.Module): """Attention layer. 
@@ -314,6 +343,10 @@ def __init__( _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1 } else _Backend.TORCH_SDPA + if (self.attn_backend == _Backend.XFORMERS + and not check_xformers_availability()): + self.attn_backend = _Backend.TORCH_SDPA + def forward( self, query: torch.Tensor, From 217937221b6845913502371aba554a3357fbccfb Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Fri, 18 Jul 2025 17:46:09 -0700 Subject: [PATCH 04/57] Elastic Expert Parallel Initial Support (#20775) Signed-off-by: Rui Qiao --- examples/online_serving/elastic_ep/bench.sh | 57 ++++ examples/online_serving/elastic_ep/scale.py | 53 ++++ .../elastic_ep/serve_deepseek_v2.sh | 72 +++++ tools/ep_kernels/elastic_ep/eep_nvshmem.patch | 92 +++++++ .../elastic_ep/install_eep_libraries.sh | 86 ++++++ vllm/config.py | 13 + vllm/distributed/eplb/eplb_state.py | 252 +++++++++++++++--- vllm/distributed/eplb/rebalance_execute.py | 117 ++++++++ vllm/engine/protocol.py | 6 + vllm/entrypoints/openai/api_server.py | 105 ++++++++ vllm/executor/uniproc_executor.py | 9 + vllm/model_executor/layers/fused_moe/layer.py | 39 ++- vllm/model_executor/models/deepseek_v2.py | 23 +- vllm/model_executor/models/interfaces.py | 7 + vllm/v1/engine/__init__.py | 16 ++ vllm/v1/engine/async_llm.py | 58 ++++ vllm/v1/engine/coordinator.py | 32 ++- vllm/v1/engine/core.py | 69 ++++- vllm/v1/engine/core_client.py | 189 ++++++++++++- vllm/v1/engine/utils.py | 225 +++++++++++++++- vllm/v1/executor/ray_distributed_executor.py | 9 + vllm/v1/worker/cpu_model_runner.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 37 ++- vllm/v1/worker/gpu_worker.py | 159 ++++++++++- 24 files changed, 1659 insertions(+), 68 deletions(-) create mode 100644 examples/online_serving/elastic_ep/bench.sh create mode 100644 examples/online_serving/elastic_ep/scale.py create mode 100644 examples/online_serving/elastic_ep/serve_deepseek_v2.sh create mode 100644 tools/ep_kernels/elastic_ep/eep_nvshmem.patch create mode 100644 tools/ep_kernels/elastic_ep/install_eep_libraries.sh diff --git a/examples/online_serving/elastic_ep/bench.sh b/examples/online_serving/elastic_ep/bench.sh new file mode 100644 index 000000000000..e47631465618 --- /dev/null +++ b/examples/online_serving/elastic_ep/bench.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite" +LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0" +HOST="localhost" +PORT=8006 +NUM_PROMPTS=20 +REQUEST_RATE=5 + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + --model) + MODEL_NAME="$2" + shift 2 + ;; + --local-model) + MODEL_NAME=$LOCAL_MODEL_PATH + shift + ;; + --host) + HOST="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --num-prompts) + NUM_PROMPTS="$2" + shift 2 + ;; + --request-rate) + REQUEST_RATE="$2" + shift 2 + ;; + -h|--help) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --model MODEL_NAME Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)" + echo " --local-model Use local model path (convenience option)" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use -h or --help for usage information" + exit 1 + ;; + esac +done + +vllm bench serve \ + --model $MODEL_NAME \ + --host $HOST \ + --port $PORT \ + --num-prompts $NUM_PROMPTS \ + --request-rate $REQUEST_RATE diff --git a/examples/online_serving/elastic_ep/scale.py b/examples/online_serving/elastic_ep/scale.py new file mode 100644 index 000000000000..a93c299e3234 --- /dev/null 
+++ b/examples/online_serving/elastic_ep/scale.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import argparse +import json +import sys + +import requests + + +def scale(host, port, new_dp_size): + url = f"http://{host}:{port}/scale_elastic_ep" + payload = {"new_data_parallel_size": new_dp_size} + headers = {"Content-Type": "application/json"} + + print(f"Sending scale request to {url}") + print(f"Payload: {json.dumps(payload, indent=2)}") + + try: + response = requests.post(url, json=payload, headers=headers, timeout=300) + + print(f"Status Code: {response.status_code}") + print(f"Response: {response.text}") + + if response.status_code == 200: + print("Scale up/down request successful!") + return True + else: + print("Scale up/down request failed!") + return False + + except requests.exceptions.RequestException as e: + print(f"Request failed: {e}") + return False + + +def main(): + parser = argparse.ArgumentParser(description="Test scale up/down functionality") + parser.add_argument("--host", default="localhost", help="API server host") + parser.add_argument("--port", type=int, default=8006, help="API server port") + parser.add_argument( + "--new-dp-size", type=int, default=2, help="New data parallel size" + ) + + args = parser.parse_args() + + success = scale(args.host, args.port, args.new_dp_size) + sys.exit(0 if success else 1) + + +if __name__ == "__main__": + main() diff --git a/examples/online_serving/elastic_ep/serve_deepseek_v2.sh b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh new file mode 100644 index 000000000000..1234ebba4d81 --- /dev/null +++ b/examples/online_serving/elastic_ep/serve_deepseek_v2.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +HOST="0.0.0.0" +PORT=8006 +DATA_PARALLEL_SIZE=4 +REDUNDANT_EXPERTS=0 +LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0" +MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite" + +while [[ $# -gt 0 ]]; do + case $1 in + --dp) + DATA_PARALLEL_SIZE="$2" + shift 2 + ;; + --re) + REDUNDANT_EXPERTS="$2" + shift 2 + ;; + --host) + HOST="$2" + shift 2 + ;; + --port) + PORT="$2" + shift 2 + ;; + --model) + MODEL_NAME="$2" + shift 2 + ;; + --local-model) + MODEL_NAME=$LOCAL_MODEL_PATH + shift + ;; + -h|--help) + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --dp SIZE Set data parallel size (default: 4)" + echo " --re SIZE Set redundant experts (default: 0)" + echo " --host HOST Set host address (default: 0.0.0.0)" + echo " --port PORT Set port number (default: 8006)" + echo " --model MODEL_NAME Set model name or path" + echo " -h, --help Show this help message" + exit 0 + ;; + *) + echo "Unknown option: $1" + echo "Use -h or --help for usage information" + exit 1 + ;; + esac +done + +echo "Starting vLLM server for $MODEL_NAME with data parallel size: $DATA_PARALLEL_SIZE and redundant experts: $REDUNDANT_EXPERTS" + +export RAY_DEDUP_LOGS=0 +export VLLM_USE_V1=1 +export VLLM_ALL2ALL_BACKEND="pplx" +export VLLM_USE_DEEP_GEMM=1 + +vllm serve $MODEL_NAME \ + --data-parallel-size $DATA_PARALLEL_SIZE \ + --data-parallel-size-local $DATA_PARALLEL_SIZE \ + --data-parallel-backend ray \ + --enforce-eager \ + --enable-expert-parallel \ + --enable-eplb \ + --num-redundant-experts $REDUNDANT_EXPERTS \ + --trust-remote-code \ + --host $HOST \ + --port $PORT diff --git a/tools/ep_kernels/elastic_ep/eep_nvshmem.patch b/tools/ep_kernels/elastic_ep/eep_nvshmem.patch new file mode 100644 index 
000000000000..5ebdaea58dd8 --- /dev/null +++ b/tools/ep_kernels/elastic_ep/eep_nvshmem.patch @@ -0,0 +1,92 @@ +From 18c0599c2f07ec965132efa25961dc8179c2dda3 Mon Sep 17 00:00:00 2001 +From: Yongji Wu +Date: Tue, 20 May 2025 13:41:12 -0700 +Subject: [PATCH] fix reinit issues due to states not cleaned up + +fix double free +--- + src/host/init/init.cu | 10 ++++++++++ + .../internal/host/nvshmemi_mem_transport.hpp | 15 +++++++++++++++ + src/modules/bootstrap/uid/bootstrap_uid.cpp | 5 +++++ + 3 files changed, 30 insertions(+) + +diff --git a/src/host/init/init.cu b/src/host/init/init.cu +index b1c5dbf..1fecb4b 100644 +--- a/src/host/init/init.cu ++++ b/src/host/init/init.cu +@@ -43,6 +43,8 @@ + #include "internal/host/nvshmemi_types.h" + #include "internal/host/shared_memory.h" + #include "internal/host/nvshmemi_symmetric_heap.hpp" ++// eep-dev ++#include "internal/host/nvshmemi_mem_transport.hpp" + + extern __constant__ nvshmemi_device_host_state_t nvshmemi_device_state_d; + static std::map registered_device_states; +@@ -1293,6 +1295,14 @@ void nvshmemid_hostlib_finalize(void *device_ctx, void *transport_device_ctx) { + /* Multi-init Multi-fini*/ + nvshmemi_state = NULL; + nvshmemi_device_state.nvshmemi_is_nvshmem_initialized = 0; ++ ++ // eep-dev ++ nvshmemi_mem_p2p_transport::destroy_instance(); ++ nvshmemi_mem_remote_transport::destroy_instance(); ++ free(nvshmemi_default_session); ++ nvshmemi_default_session = nullptr; ++ nvshmemi_device_state.nvshmemi_is_nvshmem_bootstrapped = false; ++ + nvshmemi_is_device_state_ready = false; + } else + nvshmemi_boot_handle.barrier(&nvshmemi_boot_handle); +diff --git a/src/include/internal/host/nvshmemi_mem_transport.hpp b/src/include/internal/host/nvshmemi_mem_transport.hpp +index 2495844..e4f408a 100644 +--- a/src/include/internal/host/nvshmemi_mem_transport.hpp ++++ b/src/include/internal/host/nvshmemi_mem_transport.hpp +@@ -36,6 +36,13 @@ class nvshmemi_mem_p2p_transport final { + return p2p_objref_; + } + } ++ // eep-dev ++ static void destroy_instance(void) { ++ if (p2p_objref_ != nullptr) { ++ delete p2p_objref_; ++ p2p_objref_ = nullptr; ++ } ++ } + + void print_mem_handle(int pe_id, int transport_idx, nvshmemi_symmetric_heap &obj); + +@@ -87,6 +94,14 @@ class nvshmemi_mem_remote_transport final { + } + } + ++ // eep-dev ++ static void destroy_instance(void) { ++ if (remote_objref_ != nullptr) { ++ delete remote_objref_; ++ remote_objref_ = nullptr; ++ } ++ } ++ + int gather_mem_handles(nvshmemi_symmetric_heap &obj, uint64_t heap_offset, size_t size); + /* On-demand registration and release of memory */ + int register_mem_handle(nvshmem_mem_handle_t *local_handles, int transport_idx, +diff --git a/src/modules/bootstrap/uid/bootstrap_uid.cpp b/src/modules/bootstrap/uid/bootstrap_uid.cpp +index a1fa748..788fa96 100644 +--- a/src/modules/bootstrap/uid/bootstrap_uid.cpp ++++ b/src/modules/bootstrap/uid/bootstrap_uid.cpp +@@ -630,6 +630,11 @@ int nvshmemi_bootstrap_plugin_pre_init(bootstrap_handle_t* handle, const int abi + // Discover the network for bootstrap, if not done previously. 
+ // This code needs to be stateful to be able to be called multiple times by the caller + BOOTSTRAP_CHECK(bootstrap_net_init()); ++ // eep-dev ++ if (handle->pre_init_ops != nullptr) { ++ BOOTSTRAP_PTR_FREE(handle->pre_init_ops); ++ handle->pre_init_ops = nullptr; ++ } + if (handle->pre_init_ops == nullptr) { + BOOTSTRAP_CALLOC(&handle->pre_init_ops, 1); + handle->pre_init_ops->get_unique_id = bootstrap_get_unique_id; +-- +2.43.0 + diff --git a/tools/ep_kernels/elastic_ep/install_eep_libraries.sh b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh new file mode 100644 index 000000000000..9d7dc1032f5e --- /dev/null +++ b/tools/ep_kernels/elastic_ep/install_eep_libraries.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +set -ex + +# Default workspace directory +WORKSPACE=$(pwd)/eep_kernels_workspace +INSTALL_NVSHMEM=true + +# Parse command line arguments +while getopts "w:n" opt; do + case $opt in + w) + WORKSPACE="$OPTARG" + ;; + n) + INSTALL_NVSHMEM=false + ;; + \?) + echo "Invalid option: -$OPTARG" >&2 + exit 1 + ;; + esac +done + +if [ ! -d "$WORKSPACE" ]; then + mkdir -p $WORKSPACE +fi + + +# install dependencies if not installed +pip3 install cmake torch ninja + +# build nvshmem +pushd $WORKSPACE +# Reset NVSHMEM build if requested +if [ "$INSTALL_NVSHMEM" = true ]; then + mkdir -p nvshmem_src + wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz + tar -xvf nvshmem_src_3.2.5-1.txz -C nvshmem_src --strip-components=1 + pushd nvshmem_src + wget https://github.com/deepseek-ai/DeepEP/raw/main/third-party/nvshmem.patch + git init + git apply -vvv nvshmem.patch + git apply --reject --whitespace=fix ../../eep_nvshmem.patch +else + pushd nvshmem_src +fi + +# assume CUDA_HOME is set correctly +if [ -z "$CUDA_HOME" ]; then + echo "CUDA_HOME is not set, please set it to your CUDA installation directory." + exit 1 +fi + +# disable all features except IBGDA +export NVSHMEM_IBGDA_SUPPORT=1 + +export NVSHMEM_SHMEM_SUPPORT=0 +export NVSHMEM_UCX_SUPPORT=0 +export NVSHMEM_USE_NCCL=0 +export NVSHMEM_PMIX_SUPPORT=0 +export NVSHMEM_TIMEOUT_DEVICE_POLLING=0 +export NVSHMEM_USE_GDRCOPY=0 +export NVSHMEM_IBRC_SUPPORT=0 +export NVSHMEM_BUILD_TESTS=0 +export NVSHMEM_BUILD_EXAMPLES=0 +export NVSHMEM_MPI_SUPPORT=0 +export NVSHMEM_BUILD_HYDRA_LAUNCHER=0 +export NVSHMEM_BUILD_TXZ_PACKAGE=0 +export NVSHMEM_TIMEOUT_DEVICE_POLLING=0 + +cmake -G Ninja -S . -B $WORKSPACE/nvshmem_build/ -DCMAKE_INSTALL_PREFIX=$WORKSPACE/nvshmem_install +cmake --build $WORKSPACE/nvshmem_build/ --target install + +popd + +export CMAKE_PREFIX_PATH=$WORKSPACE/nvshmem_install:$CMAKE_PREFIX_PATH + +# build and install pplx, require pytorch installed +pushd $WORKSPACE +git clone https://github.com/ppl-ai/pplx-kernels +cd pplx-kernels +# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925 +# PIP_NO_BUILD_ISOLATION=0 disables build isolation +PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install . 
--no-deps -v + diff --git a/vllm/config.py b/vllm/config.py index f94c08c32536..a415683f4e79 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2008,6 +2008,19 @@ def has_unfinished_dp(dp_group: "ProcessGroup", aggregated_has_unfinished = bool(tensor.item()) return aggregated_has_unfinished + @staticmethod + def sync_kv_cache_memory_size(dp_group: "ProcessGroup", + kv_cache_memory: int) -> int: + if kv_cache_memory == -1: + kv_cache_memory = torch.iinfo(torch.int64).max + tensor = torch.tensor([kv_cache_memory], + dtype=torch.int64, + device="cpu") + # we cannot use broadcast for stateless dp group since it depends + # on global rank + torch.distributed.all_reduce(tensor, op=ReduceOp.MIN, group=dp_group) + return tensor.item() + def compute_hash(self): """ Provide a hash that uniquely identifies all the configs diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 6b0a126ca9b2..af6462084968 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -29,12 +29,15 @@ import time from collections.abc import Sequence from dataclasses import dataclass +from typing import Optional, Union import torch -from torch.distributed import all_gather, all_reduce +from torch.distributed import ProcessGroup, all_gather, all_reduce from vllm.config import ParallelConfig -from vllm.distributed.parallel_state import get_ep_group, get_node_count +from vllm.distributed.parallel_state import (get_ep_group, get_node_count, + in_the_same_node_as) +from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.model_executor.models.interfaces import MixtureOfExperts @@ -172,6 +175,9 @@ def build( model: MixtureOfExperts, device: torch.device, parallel_config: ParallelConfig, + global_expert_load: Optional[torch.Tensor] = None, + old_global_expert_indices: Optional[torch.Tensor] = None, + rank_mapping: Optional[dict[int, int]] = None, ) -> "EplbState": """ Build the initial EPLB state. @@ -185,8 +191,16 @@ def build( physical_to_logical_map_list, device=device, ) + # Assuming 8 GPUs per node, this supports up to + # (1023 + 1) / 8 = 128 nodes for now. 
+ # TODO(rui): make this configurable + MAX_EXPERT_REDUNDANCY = 1023 + assert model.num_redundant_experts <= MAX_EXPERT_REDUNDANCY, ( + f"num_redundant_experts {model.num_redundant_experts} " + f"must be less than or equal to {MAX_EXPERT_REDUNDANCY}") + max_slots_per_logical_expert = MAX_EXPERT_REDUNDANCY + 1 logical_to_physical_map = torch.full( - (model.num_logical_experts, model.num_redundant_experts + 1), + (model.num_logical_experts, max_slots_per_logical_expert), -1, device=device, ) @@ -235,11 +249,63 @@ def build( expert_rearrangement_step = max( 0, eplb_step_interval - eplb_step_interval // 4) + if global_expert_load is not None: + ep_group = get_ep_group().device_group + assert global_expert_load.shape == (model.num_moe_layers, + model.num_logical_experts) + assert global_expert_load.dtype == torch.int64 + + num_replicas = model.num_physical_experts + num_groups = model.num_expert_groups + num_nodes = get_node_count() + num_gpus = ep_group.size() + + if num_gpus % num_nodes != 0: + num_nodes = 1 + logger.warning_once( + f"num_gpus % num_nodes != 0, " + "not using hierarchical rearrangement algorithm.\n" + f"{num_gpus=}, {num_nodes=}") + + # Get new expert mappings + ( + new_physical_to_logical_map, + new_logical_to_physical_map, + new_logical_replica_count, + ) = (rebalance_experts( + global_expert_load, + num_replicas, + num_groups, + num_nodes, + num_gpus, + )) + + max_physical_slots = new_logical_to_physical_map.shape[-1] + assert max_physical_slots <= logical_to_physical_map.shape[-1] + new_logical_to_physical_map = torch.nn.functional.pad( + new_logical_to_physical_map, + (0, logical_to_physical_map.shape[-1] - max_physical_slots), + value=-1, + ) + physical_to_logical_map = new_physical_to_logical_map.to(device) + logical_to_physical_map.copy_(new_logical_to_physical_map) + logical_replica_count.copy_(new_logical_replica_count) + model.set_eplb_state( expert_load_pass, logical_to_physical_map, logical_replica_count, ) + if global_expert_load is not None: + rearrange_expert_weights_inplace( + old_global_expert_indices, + new_physical_to_logical_map, + model.expert_weights, + ep_group, + False, + rank_mapping, + ) + expert_rearrangement_step = 0 return cls( physical_to_logical_map, @@ -337,7 +403,10 @@ def step(self, def rearrange(self, model: MixtureOfExperts, - is_profile: bool = False) -> None: + is_profile: bool = False, + execute_shuffle: bool = True, + global_expert_load: Optional[torch.Tensor] = None, + rank_mapping: Optional[dict[int, int]] = None) -> None: """ Rearrange the experts according to the current load. 
""" @@ -353,42 +422,79 @@ def rearrange(self, logger.info("Rearranging experts %s...", "(profile)" if is_profile else "") - # This mapping is only used here, so we do not store it in the state - physical_expert_start = ep_rank * model.num_local_physical_experts - physical_expert_end = (physical_expert_start + - model.num_local_physical_experts) - # (num_moe_layers, num_local_physical_experts) - local_physical_to_logical_map = self.physical_to_logical_map[ - :, - physical_expert_start:physical_expert_end, - ] + if global_expert_load is None: + # This mapping is only used here, so we do not store it in the state + physical_expert_start = ep_rank * model.num_local_physical_experts + physical_expert_end = (physical_expert_start + + model.num_local_physical_experts) + # (num_moe_layers, num_local_physical_experts) + local_physical_to_logical_map = self.physical_to_logical_map[ + :, + physical_expert_start:physical_expert_end, + ] - # Map the local physical expert load to global logical experts - logical_expert_load_window = torch.zeros( - self.expert_load_window_size, - model.num_moe_layers, - model.num_logical_experts, - dtype=self.expert_load_window.dtype, - device=self.expert_load_window.device, - ) - logical_expert_load_window.scatter_add_( - dim=-1, - index=local_physical_to_logical_map.unsqueeze(0).expand_as( - self.expert_load_window).long(), - src=self.expert_load_window, - ) + # Map the local physical expert load to global logical experts + logical_expert_load_window = torch.zeros( + self.expert_load_window_size, + model.num_moe_layers, + model.num_logical_experts, + dtype=self.expert_load_window.dtype, + device=self.expert_load_window.device, + ) + logical_expert_load_window.scatter_add_( + dim=-1, + index=local_physical_to_logical_map.unsqueeze(0).expand_as( + self.expert_load_window).long(), + src=self.expert_load_window, + ) - # Perform all-reduce to get the expert load across all ranks - global_expert_load_window = logical_expert_load_window.sum(dim=0) - all_reduce(global_expert_load_window, group=ep_group) + if not execute_shuffle: + metadata = torch.tensor( + [ + model.num_moe_layers, model.num_logical_experts, + self.physical_to_logical_map.shape[1] + ], + dtype=torch.int32, + device="cpu", + ) + torch.distributed.broadcast(metadata, + group=get_ep_group().cpu_group, + group_src=0) + + # Perform all-reduce to get the expert load across all ranks + global_expert_load_window = logical_expert_load_window.sum(dim=0) + all_reduce(global_expert_load_window, group=ep_group) + + if not execute_shuffle: + # (num_moe_layers, old_num_physical_experts) + old_global_expert_indices = self.physical_to_logical_map + torch.distributed.broadcast(old_global_expert_indices, + group=ep_group, + group_src=0) + return global_expert_load_window + else: + assert execute_shuffle + global_expert_load_window = global_expert_load # TODO(bowen): Treat differently for prefill and decode nodes num_replicas = model.num_physical_experts num_groups = model.num_expert_groups - num_nodes = get_node_count() - num_gpus = ep_group.size() + if rank_mapping is not None and len(rank_mapping) == ep_group.size(): + # NOTE(yongji): scale down, we need to rebalance the experts on + # remaining GPUs, transfer the experts while we haven't shutdown + # the GPUs to be released. 
+ cpu_group = get_ep_group().cpu_group + num_nodes = _node_count_with_rank_mapping(cpu_group, rank_mapping) + num_gpus = sum(new_rank != -1 + for new_rank in rank_mapping.values()) + num_replicas = num_replicas // ep_group.size( + ) * num_gpus # handle num replicas change + else: + num_nodes = get_node_count() + num_gpus = ep_group.size() if num_gpus % num_nodes != 0: + self.num_nodes = 1 logger.warning_once( f"num_gpus % num_nodes != 0, " "not using hierarchical rearrangement algorithm.\n" @@ -414,10 +520,24 @@ def rearrange(self, model.expert_weights, ep_group, is_profile, + rank_mapping, ) if not is_profile: - self.physical_to_logical_map.copy_(new_physical_to_logical_map) + if self.physical_to_logical_map.shape[ + 1] != new_physical_to_logical_map.shape[1]: + self.physical_to_logical_map = new_physical_to_logical_map.to( + self.physical_to_logical_map.device) + else: + self.physical_to_logical_map.copy_(new_physical_to_logical_map) + max_physical_slots = new_logical_to_physical_map.shape[-1] + assert max_physical_slots <= self.logical_to_physical_map.shape[-1] + new_logical_to_physical_map = torch.nn.functional.pad( + new_logical_to_physical_map, + (0, + self.logical_to_physical_map.shape[-1] - max_physical_slots), + value=-1, + ) self.logical_to_physical_map.copy_(new_logical_to_physical_map) self.logical_replica_count.copy_(new_logical_replica_count) @@ -430,3 +550,69 @@ def rearrange(self, " (profile) " if is_profile else " ", time_end - time_start, ) + + @staticmethod + def recv_state() -> tuple[torch.Tensor, torch.Tensor]: + """ + Receive the expert load and old placement from the master rank. + """ + ep_group = get_ep_group() + metadata = torch.empty(3, dtype=torch.int32, device="cpu") + torch.distributed.broadcast(metadata, + group=ep_group.cpu_group, + group_src=0) + num_moe_layers, num_logical_experts, num_old_physical_experts = ( + metadata.tolist()) + global_expert_load = torch.zeros( + (num_moe_layers, num_logical_experts), + dtype=torch.int64, + device=ep_group.device, + ) + all_reduce(global_expert_load, group=ep_group.device_group) + old_global_expert_indices = torch.empty( + (num_moe_layers, num_old_physical_experts), + dtype=torch.int64, + device=ep_group.device, + ) + torch.distributed.broadcast(old_global_expert_indices, + group=ep_group.device_group, + group_src=0) + + return global_expert_load, old_global_expert_indices + + +def _node_count_with_rank_mapping( + pg: Union[ProcessGroup, StatelessProcessGroup], + rank_mapping: dict[int, int], +) -> int: + if isinstance(pg, ProcessGroup): + world_size = torch.distributed.get_world_size(group=pg) + else: + world_size = pg.world_size + + if world_size == 1: + return 1 + + # Build node assignment map + node_assignment = [0] * world_size # rank -> node_id + next_node_id = 0 + + for current_rank in range(world_size): + if node_assignment[current_rank] != 0: + continue # Already assigned to a node + + assert current_rank in rank_mapping + if rank_mapping[current_rank] == -1: + continue # Pending shutdown + + # Assign current rank to a new node + next_node_id += 1 + node_assignment[current_rank] = next_node_id + + # Find all ranks on the same node as current_rank + same_node_flags = in_the_same_node_as(pg, current_rank) + for other_rank, is_same_node in enumerate(same_node_flags): + if is_same_node and node_assignment[other_rank] == 0: + node_assignment[other_rank] = next_node_id + + return next_node_id diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 
2ef8587b559b..f8a7d1170bb0 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -8,6 +8,7 @@ from collections.abc import Iterable, MutableSequence, Sequence from functools import partial +from typing import Optional import torch from torch.distributed import (P2POp, ProcessGroup, all_gather, @@ -127,6 +128,8 @@ def shuffle_layer( dst_global = local2global(dst) if is_received_locally[dst]: continue + if old_indices[src_global] == -1 or new_indices[dst_global] == -1: + continue if old_indices[src_global] == new_indices[dst_global]: is_received_locally[dst] = True for weight, buffer in zip(expert_weights, @@ -139,6 +142,8 @@ def shuffle_layer( experts_send_loc: dict[int, int] = {} for src in range(num_local_experts): expert = old_indices[local2global(src)] + if expert == -1: + continue if expert in experts_send_loc: continue experts_send_loc[expert] = src @@ -181,6 +186,8 @@ def shuffle_layer( if is_received_locally[dst]: continue expert = new_indices[local2global(dst)] + if expert == -1: + continue if expert in experts_recv_loc: continue experts_recv_loc[expert] = dst @@ -227,6 +234,8 @@ def shuffle_layer( weight[dst].copy_(buffer[dst]) else: expert = new_indices[local2global(dst)] + if expert == -1: + continue src = experts_recv_loc[expert] for weight, buffer in zip(expert_weights, expert_weights_buffer): weight[dst].copy_(buffer[src]) @@ -238,6 +247,7 @@ def rearrange_expert_weights_inplace( expert_weights: Sequence[Iterable[torch.Tensor]], ep_group: ProcessGroup, is_profile: bool = False, + rank_mapping: Optional[dict[int, int]] = None, ) -> None: """ Rearranges the expert weights in place according to the new expert indices. @@ -256,7 +266,28 @@ def rearrange_expert_weights_inplace( is_profile (bool): If `True`, do not perform any actual weight copy. This is used during profile run, where we only perform dummy communications to reserve enough memory for the buffers. + rank_mapping: A dictionary mapping old rank to new rank. """ + if rank_mapping is not None: + if len(rank_mapping) == ep_group.size(): + # scale down + new_global_expert_indices = \ + _map_new_expert_indices_with_rank_mapping( + new_global_expert_indices, + rank_mapping, + ) + else: + # scale up + old_global_expert_indices = \ + _map_old_expert_indices_with_rank_mapping( + old_global_expert_indices, + rank_mapping, + ep_group.size(), + ) + + assert old_global_expert_indices.shape[ + 1] == new_global_expert_indices.shape[1] + num_moe_layers, num_physical_experts = old_global_expert_indices.shape assert len(expert_weights) == num_moe_layers @@ -304,4 +335,90 @@ def rearrange_expert_weights_inplace( ) +def _map_old_expert_indices_with_rank_mapping( + old_global_expert_indices: torch.Tensor, + rank_mapping: dict[int, int], + new_ep_size: int, +) -> torch.Tensor: + """ + Map the old global expert indices to the new global expert indices. + + Args: + old_global_expert_indices: + Shape (num_layers, old_ep_size * num_local_physical_experts). + rank_mapping: Mapping from old rank to new rank. + new_ep_size: New expert parallelism size. + + Returns: + Mapped expert indices with shape + (num_layers, new_ep_size * num_local_physical_experts). 
+ """ + num_layers, old_num_physical_experts = old_global_expert_indices.shape + assert rank_mapping, "Rank mapping is required" + + # Get sizes from parameters and rank_mapping + old_ep_size = len(rank_mapping) + num_local_physical_experts = old_num_physical_experts // old_ep_size + new_num_physical_experts = new_ep_size * num_local_physical_experts + + # Create mapped tensor with new shape, initialized to -1 + mapped_expert_indices = torch.full( + (num_layers, new_num_physical_experts), + fill_value=-1, + dtype=old_global_expert_indices.dtype, + device=old_global_expert_indices.device, + ) + + # Handle rank mapping (scale up/down with rank changes) + for old_rank in range(old_ep_size): + new_rank = rank_mapping.get(old_rank) + if new_rank is not None and new_rank >= 0 and new_rank < new_ep_size: + # This old rank exists in the new configuration + old_start_idx = old_rank * num_local_physical_experts + old_end_idx = (old_rank + 1) * num_local_physical_experts + new_start_idx = new_rank * num_local_physical_experts + new_end_idx = (new_rank + 1) * num_local_physical_experts + + mapped_expert_indices[:, new_start_idx:new_end_idx] = \ + old_global_expert_indices[:, old_start_idx:old_end_idx] + # If new_rank is None or >= new_ep_size, the experts remain -1 + # (scale down case) + + return mapped_expert_indices + + +def _map_new_expert_indices_with_rank_mapping( + new_global_expert_indices: torch.Tensor, + rank_mapping: dict[int, int], +) -> torch.Tensor: + num_layers, new_num_physical_experts = new_global_expert_indices.shape + assert rank_mapping, "Rank mapping is required" + + # Get sizes from parameters and rank_mapping + old_ep_size = len(rank_mapping) + new_ep_size = sum(new_rank != -1 for new_rank in rank_mapping.values()) + num_local_physical_experts = new_num_physical_experts // new_ep_size + old_num_physical_experts = old_ep_size * num_local_physical_experts + + mapped_expert_indices = torch.full( + (num_layers, old_num_physical_experts), + fill_value=-1, + dtype=new_global_expert_indices.dtype, + device=new_global_expert_indices.device, + ) + + for old_rank in range(old_ep_size): + new_rank = rank_mapping[old_rank] + if new_rank >= 0 and new_rank < new_ep_size: + old_start_idx = old_rank * num_local_physical_experts + old_end_idx = (old_rank + 1) * num_local_physical_experts + new_start_idx = new_rank * num_local_physical_experts + new_end_idx = (new_rank + 1) * num_local_physical_experts + + mapped_expert_indices[:, old_start_idx:old_end_idx] = \ + new_global_expert_indices[:, new_start_idx:new_end_idx] + + return mapped_expert_indices + + __all__ = ["rearrange_expert_weights_inplace"] diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 8688fcc82cd9..f5cc9c474051 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -324,3 +324,9 @@ async def is_sleeping(self) -> bool: async def add_lora(self, lora_request: LoRARequest) -> None: """Load a new LoRA adapter into the engine for future requests.""" ... 
+ + async def scale_elastic_ep(self, + new_data_parallel_size: int, + drain_timeout: int = 300) -> None: + """Scale the engine""" + raise NotImplementedError diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index c2185acbf0c0..3f0c1c85dee6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1018,6 +1018,73 @@ async def is_sleeping(raw_request: Request): return JSONResponse(content={"is_sleeping": is_sleeping}) +@router.post("/scale_elastic_ep", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: { + "model": dict + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.REQUEST_TIMEOUT.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) +async def scale_elastic_ep(raw_request: Request): + try: + body = await raw_request.json() + except json.JSONDecodeError as e: + raise HTTPException(status_code=400, + detail="Invalid JSON format") from e # noqa: B904 + + new_data_parallel_size = body.get("new_data_parallel_size") + drain_timeout = body.get("drain_timeout", 120) # Default 2 minutes + + if new_data_parallel_size is None: + raise HTTPException(status_code=400, + detail="new_data_parallel_size is required") + + if not isinstance(new_data_parallel_size, + int) or new_data_parallel_size <= 0: + raise HTTPException( + status_code=400, + detail="new_data_parallel_size must be a positive integer") + + if not isinstance(drain_timeout, int) or drain_timeout <= 0: + raise HTTPException(status_code=400, + detail="drain_timeout must be a positive integer") + + # Set scaling flag to prevent new requests + global _scaling_elastic_ep + _scaling_elastic_ep = True + client = engine_client(raw_request) + try: + await client.scale_elastic_ep(new_data_parallel_size, drain_timeout) + return JSONResponse({ + "message": + f"Scaled to {new_data_parallel_size} " + "data parallel engines", + }) + except TimeoutError as e: + raise HTTPException(status_code=408, + detail="Scale failed due to request drain timeout " + f"after {drain_timeout} seconds") from e + except Exception as e: + logger.error("Scale failed: %s", e) + raise HTTPException(status_code=500, detail="Scale failed") from e + finally: + _scaling_elastic_ep = False + + +@router.post("/is_scaling_elastic_ep") +async def is_scaling_elastic_ep(raw_request: Request): + return JSONResponse({"is_scaling_elastic_ep": _scaling_elastic_ep}) + + # TODO: RequestType = TypeForm[BaseModel] when recognized by type checkers # (requires typing_extensions >= 4.13) RequestType = Any @@ -1216,6 +1283,41 @@ async def send_with_request_id(message: Message) -> None: return self.app(scope, receive, send_with_request_id) +# Global variable to track scaling state +_scaling_elastic_ep = False + + +class ScalingMiddleware: + """ + Middleware that checks if the model is currently scaling and + returns a 503 Service Unavailable response if it is. + + This middleware applies to all HTTP requests and prevents + processing when the model is in a scaling state. 
+ """ + + def __init__(self, app: ASGIApp) -> None: + self.app = app + + def __call__(self, scope: Scope, receive: Receive, + send: Send) -> Awaitable[None]: + if scope["type"] != "http": + return self.app(scope, receive, send) + + # Check global scaling state + global _scaling_elastic_ep + if _scaling_elastic_ep: + # Return 503 Service Unavailable response + response = JSONResponse(content={ + "error": + "The model is currently scaling. Please try again later." + }, + status_code=503) + return response(scope, receive, send) + + return self.app(scope, receive, send) + + def _extract_content_from_chunk(chunk_data: dict) -> str: """Extract content from a streaming response chunk.""" try: @@ -1404,6 +1506,9 @@ async def validation_exception_handler(_: Request, if args.enable_request_id_headers: app.add_middleware(XRequestIdMiddleware) + # Add scaling middleware to check for scaling state + app.add_middleware(ScalingMiddleware) + if envs.VLLM_DEBUG_LOG_API_SERVER_RESPONSE: logger.warning("CAUTION: Enabling log response in the API Server. " "This can include sensitive information and should be " diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 7ebeb4a22556..aabc9ed9b80a 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -12,6 +12,7 @@ from vllm.logger import init_logger from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, run_method) +from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -62,6 +63,14 @@ def check_health(self) -> None: # it's running. return + def reinitialize_distributed( + self, reconfig_request: ReconfigureDistributedRequest) -> None: + self.driver_worker.reinitialize_distributed(reconfig_request) + if reconfig_request.new_data_parallel_rank == \ + ReconfigureRankType.SHUTDOWN_CURRENT_RANK: + self.shutdown() + return + UniProcExecutorAsync = UniProcExecutor diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 4b8a37fcc738..4a6a3b95ec7f 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -265,9 +265,6 @@ def select_gemm_impl( prepare_finalize: FusedMoEPrepareAndFinalize, moe: FusedMoEConfig, ) -> FusedMoEPermuteExpertsUnpermute: - - assert self.fused_experts == fused_experts - if (prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts): logger.debug("BatchedTritonExperts %s", self.moe) @@ -375,8 +372,10 @@ def apply( logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: if enable_eplb: - raise NotImplementedError( - "EPLB not supported for `UnquantizedFusedMoEMethod` yet.") + assert expert_load_view is not None + assert logical_to_physical_map is not None + assert logical_replica_count is not None + assert isinstance(layer, FusedMoE) return self.forward( x=x, @@ -393,7 +392,12 @@ def apply( scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input) + apply_router_weight_on_input=apply_router_weight_on_input, + enable_eplb=enable_eplb, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) def forward_cuda( self, @@ -412,6 +416,10 @@ def forward_cuda( e_score_correction_bias: Optional[torch.Tensor] = None, 
apply_router_weight_on_input: bool = False, activation: str = "silu", + enable_eplb: bool = False, + expert_load_view: Optional[torch.Tensor] = None, + logical_to_physical_map: Optional[torch.Tensor] = None, + logical_replica_count: Optional[torch.Tensor] = None, ) -> torch.Tensor: topk_weights, topk_ids = FusedMoE.select_experts( @@ -425,7 +433,12 @@ def forward_cuda( custom_routing_function=custom_routing_function, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype) + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count) if self.rocm_aiter_moe_enabled: return self.rocm_aiter_fused_experts( @@ -730,7 +743,8 @@ def __init__( if self.enable_eplb: from vllm.model_executor.layers.quantization.fp8 import ( Fp8MoEMethod) - if not isinstance(quant_method, Fp8MoEMethod): + if not isinstance(quant_method, + (Fp8MoEMethod, UnquantizedFusedMoEMethod)): # TODO: Add support for additional quantization methods. # The implementation for other quantization methods does not # contain essential differences, but the current quant API @@ -821,6 +835,15 @@ def use_deepep_ll_kernels(self): def use_flashinfer_cutlass_kernels(self): return self.moe_parallel_config.use_flashinfer_cutlass_kernels + def update_expert_map(self): + # ep_size and ep_rank should already be updated + assert self.expert_map is not None + with self.expert_map.device: + self.local_num_experts, self.expert_map = determine_expert_map( + ep_size=self.ep_size, + ep_rank=self.ep_rank, + global_num_experts=self.global_num_experts) + def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 8d36dda65b5d..5106b9914b5e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -776,6 +776,24 @@ def set_eplb_state( logical_replica_count=logical_replica_count, ) + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = (num_physical_experts - + self.num_logical_experts) + for layer in self.model.layers: + if isinstance(layer.mlp, DeepseekV2MoE): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.model.get_input_embeddings(input_ids) @@ -931,9 +949,8 @@ class DeepseekV3ForCausalLM(DeepseekV2ForCausalLM): def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, weight_name: str) -> Optional[int]: - if hasattr(config, - "num_nextn_predict_layers") and (config.num_nextn_predict_layers - > 0): + if (hasattr(config, "num_nextn_predict_layers") + and config.num_nextn_predict_layers > 0): layer_idx = config.num_hidden_layers for i in range(config.num_nextn_predict_layers): if weight_name.startswith(f"model.layers.{layer_idx+i}."): diff --git a/vllm/model_executor/models/interfaces.py 
b/vllm/model_executor/models/interfaces.py index b60f1a5b6ff2..7f3efde43474 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -543,6 +543,13 @@ def set_eplb_state( """ ... + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + ... + def is_mixture_of_experts(model: object) -> TypeIs[MixtureOfExperts]: return isinstance(model, MixtureOfExperts) diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 921ccd708cdd..79dc80d8fc54 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -177,3 +177,19 @@ class EngineCoreRequestType(enum.Enum): UTILITY = b'\x03' # Sentinel used within EngineCoreProc. EXECUTOR_FAILED = b'\x04' + + +class ReconfigureDistributedRequest(msgspec.Struct): + new_data_parallel_size: int + new_data_parallel_rank: int + new_data_parallel_rank_local: int + new_data_parallel_master_ip: str + new_data_parallel_master_port: int + + +class ReconfigureRankType(enum.IntEnum): + """ + Rank type for reconfiguring distributed request. + """ + KEEP_CURRENT_RANK = -1 + SHUTDOWN_CURRENT_RANK = -2 diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3754570dfaaa..6395d2c1875b 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import time from collections.abc import AsyncGenerator, Mapping from copy import copy from typing import Any, Optional, Union @@ -608,6 +609,63 @@ async def collective_rpc(self, return await self.engine_core.collective_rpc_async( method, timeout, args, kwargs) + async def wait_for_requests_to_drain(self, drain_timeout: int = 300): + """Wait for all requests to be drained.""" + start_time = time.time() + while time.time() - start_time < drain_timeout: + if not self.engine_core.dp_engines_running(): + logger.info("Engines are idle, requests have been drained") + return + + logger.info( + "Engines are still running, waiting for requests to drain...") + await asyncio.sleep(1) # Wait 1 second before checking again + + raise TimeoutError(f"Timeout reached after {drain_timeout} seconds " + "waiting for requests to drain.") + + async def scale_elastic_ep(self, + new_data_parallel_size: int, + drain_timeout: int = 300): + """ + Scale up or down the data parallel size by adding or removing + engine cores. 
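+        In-flight requests are first drained via
+        wait_for_requests_to_drain() before the engine cores are
+        reconfigured.
+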
+ Args: + new_data_parallel_size: The new number of data parallel workers + drain_timeout: + Maximum time to wait for requests to drain (seconds) + """ + old_data_parallel_size = \ + self.vllm_config.parallel_config.data_parallel_size + if old_data_parallel_size == new_data_parallel_size: + logger.info("Data parallel size is already %s, skipping scale", + new_data_parallel_size) + return + logger.info( + "Waiting for requests to drain before " + "scaling up to %s engines...", new_data_parallel_size) + await self.wait_for_requests_to_drain(drain_timeout) + logger.info( + "Requests have been drained, proceeding with scale " + "to %s engines", new_data_parallel_size) + await self.engine_core.scale_elastic_ep(new_data_parallel_size) + self.vllm_config.parallel_config.data_parallel_size = \ + new_data_parallel_size + + # recreate stat loggers + if new_data_parallel_size > old_data_parallel_size: + stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( + vllm_config=self.vllm_config, + log_stats=self.log_stats, + engine_num=new_data_parallel_size, + custom_stat_loggers=None, + ) + num_new_engines = len(stat_loggers) - len(self.stat_loggers) + self.stat_loggers.extend(stat_loggers[-num_new_engines:]) + else: + for _ in range(old_data_parallel_size - new_data_parallel_size): + self.stat_loggers.pop() + @property def is_running(self) -> bool: # Is None before the loop is started. diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index b3e7a2e85b80..005e71647aae 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -200,11 +200,41 @@ def process_input_socket(self, front_publish_address: str, # Ignore subscription messages. continue + decoded = msgspec.msgpack.decode(buffer) + if isinstance(decoded, (list, tuple)) and len( + decoded) == 2 and decoded[0] == "SCALE_ELASTIC_EP": + # Handle scale up notification + new_engine_count = decoded[1] + current_count = len(self.engines) + if new_engine_count > current_count: + for _ in range(new_engine_count - current_count): + self.engines.append(EngineState()) + # NOTE(yongji): handle the case + # where newly started engines have current_wave = 0 + # if existing engines just finished a wave + # and engine_running isn't updated yet at + # CoordinatorProc requests routed to newly started + # engines may not wake up existing engines, as long + # as 0 < request.wave < existing engines' + # current_wave + # we note that 0 is the wave number for the new + # engine + self.engines_running = False + logger.info( + "DPCoordinator scaled up from %s to %s " + "engines", current_count, new_engine_count) + else: + self.engines = self.engines[:new_engine_count] + logger.info( + "DPCoordinator scaled down from %s to %s " + "engines", current_count, new_engine_count) + continue # Skip normal engine notification processing + # We received a message on the front-end XPUB socket, # from an API server sending a new request while the # engines are paused, so that we can wake the other # engines. 
- engine_to_exclude, wave = msgspec.msgpack.decode(buffer) + engine_to_exclude, wave = decoded if not self.engines_running: if wave < self.current_wave: # If the wave number is stale, ensure the message diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index b3210197750b..ca636bf5a6f7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -32,7 +32,9 @@ from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, - EngineCoreRequestType, UtilityOutput) + EngineCoreRequestType, + ReconfigureDistributedRequest, ReconfigureRankType, + UtilityOutput) from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.engine.utils import EngineHandshakeMetadata, EngineZmqAddresses from vllm.v1.executor.abstract import Executor @@ -77,6 +79,8 @@ def __init__(self, self.model_executor.register_failure_callback( executor_fail_callback) + self.available_gpu_memory_for_kv_cache = -1 + # Setup KV Caches and update CacheConfig after profiling. num_gpu_blocks, num_cpu_blocks, kv_cache_config = \ self._initialize_kv_caches(vllm_config) @@ -137,12 +141,23 @@ def _initialize_kv_caches( # Get all kv cache needed by the model kv_cache_specs = self.model_executor.get_kv_cache_specs() - # Profiles the peak memory usage of the model to determine how much - # memory can be allocated for kv cache. has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs) if has_kv_cache: - available_gpu_memory = \ - self.model_executor.determine_available_memory() + if os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1": + dp_group = getattr(self, "dp_group", None) + assert dp_group is not None + self.available_gpu_memory_for_kv_cache = \ + ParallelConfig.sync_kv_cache_memory_size(dp_group, -1) + available_gpu_memory = [ + self.available_gpu_memory_for_kv_cache + ] * len(kv_cache_specs) + else: + # Profiles the peak memory usage of the model to determine how + # much memory can be allocated for kv cache. 
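+                # The profiled value is stored in
+                # available_gpu_memory_for_kv_cache below so it can later
+                # be shared with engine-cores that join during an elastic
+                # EP scale up.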
+ available_gpu_memory = ( + self.model_executor.determine_available_memory()) + self.available_gpu_memory_for_kv_cache = \ + available_gpu_memory[0] else: # Attention free models don't need memory for kv cache available_gpu_memory = [0] * len(kv_cache_specs) @@ -989,6 +1004,50 @@ def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool: return ParallelConfig.has_unfinished_dp(self.dp_group, local_unfinished) + def reinitialize_distributed( + self, reconfig_request: ReconfigureDistributedRequest) -> None: + stateless_destroy_torch_distributed_process_group(self.dp_group) + self.shutdown() + + parallel_config = self.vllm_config.parallel_config + old_dp_size = parallel_config.data_parallel_size + parallel_config.data_parallel_size = \ + reconfig_request.new_data_parallel_size + if reconfig_request.new_data_parallel_rank != -1: + parallel_config.data_parallel_rank = \ + reconfig_request.new_data_parallel_rank + # local rank specifies device visibility, it should not be changed + assert reconfig_request.new_data_parallel_rank_local == \ + ReconfigureRankType.KEEP_CURRENT_RANK + parallel_config.data_parallel_master_ip = \ + reconfig_request.new_data_parallel_master_ip + parallel_config.data_parallel_master_port = \ + reconfig_request.new_data_parallel_master_port + if reconfig_request.new_data_parallel_rank != -2: + self.dp_rank = parallel_config.data_parallel_rank + self.dp_group = parallel_config.stateless_init_dp_group() + reconfig_request.new_data_parallel_master_port = \ + parallel_config.data_parallel_master_port + + self.model_executor.reinitialize_distributed(reconfig_request) + if reconfig_request.new_data_parallel_size > old_dp_size: + assert self.available_gpu_memory_for_kv_cache > 0 + # pass available_gpu_memory_for_kv_cache from existing + # engine-cores to new engine-cores so they can directly + # use it in _initialize_kv_caches() rather than profiling. 
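+            # (the newly launched engine-cores read this value via the
+            # matching sync_kv_cache_memory_size(dp_group, -1) call in
+            # _initialize_kv_caches when VLLM_ELASTIC_EP_SCALE_UP_LAUNCH=1)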
+ ParallelConfig.sync_kv_cache_memory_size( + self.dp_group, self.available_gpu_memory_for_kv_cache) + # NOTE(yongji): newly joined workers require dummy_run even + # CUDA graph is not used + self.model_executor.collective_rpc("compile_or_warm_up_model") + if reconfig_request.new_data_parallel_rank == \ + ReconfigureRankType.SHUTDOWN_CURRENT_RANK: + self.shutdown() + logger.info("DPEngineCoreProc %s shutdown", self.dp_rank) + else: + logger.info("Distributed environment reinitialized for DP rank %s", + self.dp_rank) + class DPEngineCoreActor(DPEngineCoreProc): """ diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index dafaa15f777d..82fc1fa9937c 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -21,9 +21,11 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.utils import get_open_zmq_inproc_path, make_zmq_socket +from vllm.utils import get_open_port, get_open_zmq_inproc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, - EngineCoreRequestType, UtilityOutput) + EngineCoreRequestType, + ReconfigureDistributedRequest, ReconfigureRankType, + UtilityOutput) from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError @@ -162,6 +164,9 @@ def dp_engines_running(self) -> bool: running state.""" raise NotImplementedError + async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: + raise NotImplementedError + async def get_output_async(self) -> EngineCoreOutputs: raise NotImplementedError @@ -910,14 +915,30 @@ async def run_engine_stats_update_task(): events = await poller.poll() if not self.engines_running and len(events) == 2 or ( events[0][0] == first_req_rcv_socket): - # Send a message to notify the coordinator that + # Check if this is a regular request notification or + # scale up notification + buf = first_req_rcv_socket.recv( + flags=zmq.NOBLOCK).result() + + decoded = msgspec.msgpack.decode(buf) + if isinstance( + decoded, + (list, tuple)) and len(decoded) == 2 and decoded[ + 0] == "SCALE_ELASTIC_EP": + # Extract new engine count from the decoded message + new_engine_count = decoded[1] + # Send scale up notification to coordinator + scale_msg = msgspec.msgpack.encode( + ("SCALE_ELASTIC_EP", new_engine_count)) + await socket.send(scale_msg) + continue + # we're sending a request while the engines are # paused, so that it can wake the others up # (to run dummy EP loop). 
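+                        # (normal notifications are encoded as
+                        # ("FIRST_REQ", engine_index) tuples, see
+                        # add_request_async)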
+ assert decoded[0] == "FIRST_REQ" + target_eng_index = decoded[1] self.engines_running = True - buf = first_req_rcv_socket.recv( - flags=zmq.NOBLOCK).result() - target_eng_index = int.from_bytes(buf, "little") msg = msgspec.msgpack.encode( (target_eng_index, self.current_wave)) await socket.send(msg) @@ -953,7 +974,8 @@ async def add_request_async(self, request: EngineCoreRequest) -> None: chosen_engine) if not self.engines_running: # Notify coordinator that we're sending a request - await self.first_req_send_socket.send(chosen_engine) + req_msg = msgspec.msgpack.encode(("FIRST_REQ", chosen_engine)) + await self.first_req_send_socket.send(req_msg) await to_await @@ -1047,3 +1069,156 @@ async def _abort_requests(self, request_ids: list[str], engine: EngineIdentity) -> None: await self._send_input(EngineCoreRequestType.ABORT, request_ids, engine) + + async def _send_reconfig_message( + self, reconfig_request: ReconfigureDistributedRequest, + engine: EngineIdentity) -> asyncio.Future: + """Send reconfiguration message and return the result future without + waiting for completion.""" + call_id = uuid.uuid1().int >> 64 + future = asyncio.get_running_loop().create_future() + self.utility_results[call_id] = future + message = (EngineCoreRequestType.UTILITY.value, *self.encoder.encode( + (self.client_index, call_id, "reinitialize_distributed", + (reconfig_request, )))) + await self._send_input_message(message, engine, reconfig_request) + self._ensure_output_queue_task() + return future + + async def scale_elastic_ep(self, new_data_parallel_size: int) -> None: + """Scale elastic EP data parallel size""" + cur_data_parallel_size = len(self.core_engines) + + assert new_data_parallel_size != cur_data_parallel_size, ( + f"new_data_parallel_size {new_data_parallel_size} must be " + f"different from cur_data_parallel_size {cur_data_parallel_size}") + + assert self.vllm_config.parallel_config.data_parallel_backend == \ + "ray", ("Only ray DP backend supports scaling elastic EP") + + scale_up = new_data_parallel_size > cur_data_parallel_size + + if scale_up: + await self._scale_up_elastic_ep(cur_data_parallel_size, + new_data_parallel_size) + else: + await self._scale_down_elastic_ep(cur_data_parallel_size, + new_data_parallel_size) + + async def _scale_up_elastic_ep(self, cur_data_parallel_size: int, + new_data_parallel_size: int) -> None: + """Scale up the data parallel size by creating new engine cores + and reconfiguring existing ones.""" + cur_data_parallel_size = len(self.core_engines) + + # Phase 1: Send reconfigure messages to all existing engines and wait + # for them to be sent + reconfig_futures = [] + self.vllm_config.parallel_config.data_parallel_master_port = \ + get_open_port() + for engine in self.core_engines: + reconfig_request = ReconfigureDistributedRequest( + new_data_parallel_size=new_data_parallel_size, + new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK, + new_data_parallel_rank_local=\ + ReconfigureRankType.KEEP_CURRENT_RANK, + new_data_parallel_master_ip=self.vllm_config.parallel_config. + data_parallel_master_ip, + new_data_parallel_master_port=self.vllm_config.parallel_config. 
+ data_parallel_master_port) + future = await self._send_reconfig_message(reconfig_request, + engine) + reconfig_futures.append(future) + + logger.info("All reconfigure messages sent, starting engine creation") + + # Phase 2: Create new engines now that reconfig messages have been sent + # self.resources.engine_manager is guaranteed to be + # CoreEngineActorManager for RayDPClient + assert isinstance(self.resources.engine_manager, + CoreEngineActorManager) + self.resources.engine_manager.scale_up_elastic_ep( + self.vllm_config, new_data_parallel_size) + + # Create new CoreEngine objects for the new engines + new_engine_identities = set() + for i in range(cur_data_parallel_size, new_data_parallel_size): + new_engine = i.to_bytes(2, "little") + self.core_engines.append(new_engine) + new_engine_identities.add(new_engine) + + # Wait for ready messages from new engines on the input socket + sync_input_socket = zmq.Socket.shadow(self.input_socket) + while new_engine_identities: + if not sync_input_socket.poll(timeout=600_000): + raise TimeoutError( + "Timed out waiting for new engines to send initial " + "message on input socket.") + identity, _ = sync_input_socket.recv_multipart() + new_engine_identities.discard(identity) + + # Phase 3: Wait for all existing engines to complete reconfiguration + logger.info("Waiting for existing engines to complete reconfiguration") + await asyncio.gather(*reconfig_futures) + + # Notify coordinator about scale up through existing + # stats_update_task connection + self._ensure_stats_update_task() + scale_up_marker = msgspec.msgpack.encode( + ("SCALE_ELASTIC_EP", new_data_parallel_size)) + await self.first_req_send_socket.send(scale_up_marker) + + # Update the parallel config + self.vllm_config.parallel_config.data_parallel_size = \ + new_data_parallel_size + logger.info( + "[Elastic EP] Scale up completed, new data parallel size: %s", + new_data_parallel_size) + + async def _scale_down_elastic_ep(self, cur_data_parallel_size: int, + new_data_parallel_size: int) -> None: + """Scale down the data parallel size by shutting down and + reconfiguring existing engine cores.""" + cur_data_parallel_size = len(self.core_engines) + + self.vllm_config.parallel_config.data_parallel_master_port = \ + get_open_port() + + reconfig_futures = [] + for cur_dp_rank, engine in enumerate(self.core_engines): + reconfig_request = ReconfigureDistributedRequest( + new_data_parallel_size=new_data_parallel_size, + new_data_parallel_rank=ReconfigureRankType.KEEP_CURRENT_RANK, + new_data_parallel_rank_local=\ + ReconfigureRankType.KEEP_CURRENT_RANK, + new_data_parallel_master_ip=self.vllm_config.parallel_config. + data_parallel_master_ip, + new_data_parallel_master_port=self.vllm_config.parallel_config. 
+ data_parallel_master_port) + if cur_dp_rank >= new_data_parallel_size: + reconfig_request.new_data_parallel_rank = \ + ReconfigureRankType.SHUTDOWN_CURRENT_RANK + future = await self._send_reconfig_message(reconfig_request, + engine) + reconfig_futures.append(future) + + for _ in range(new_data_parallel_size, cur_data_parallel_size): + self.core_engines.pop() + + await asyncio.gather(*reconfig_futures) + + assert isinstance(self.resources.engine_manager, + CoreEngineActorManager) + self.resources.engine_manager.scale_down_elastic_ep( + cur_data_parallel_size, new_data_parallel_size) + + self._ensure_stats_update_task() + scale_down_marker = msgspec.msgpack.encode( + ("SCALE_ELASTIC_EP", new_data_parallel_size)) + await self.first_req_send_socket.send(scale_down_marker) + + self.vllm_config.parallel_config.data_parallel_size = \ + new_data_parallel_size + logger.info( + "[Elastic EP] Scale down completed, new data parallel size: %s", + new_data_parallel_size) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index ae104bd6eb96..6dde477576b8 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -174,16 +174,21 @@ def __init__( self.local_engine_actors: list[ray.ActorHandle] = [] self.remote_engine_actors: list[ray.ActorHandle] = [] + + env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor") + self.env_vars_dict = { + name: os.environ[name] + for name in env_vars_list if name in os.environ + } + runtime_env = RuntimeEnv(env_vars=self.env_vars_dict) + + self.addresses = addresses + self.executor_class = executor_class + self.log_stats = log_stats dp_size = vllm_config.parallel_config.data_parallel_size local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local world_size = vllm_config.parallel_config.world_size - env_vars_set = get_env_vars_to_copy(destination="DPEngineCoreActor") - env_vars_dict = { - name: os.environ[name] - for name in env_vars_set if name in os.environ - } - runtime_env = RuntimeEnv(env_vars=env_vars_dict) if ray.is_initialized(): logger.info( @@ -208,6 +213,7 @@ def __init__( assert len(placement_groups) == dp_size, ( "Number of placement groups must match data parallel size") + self.placement_group_is_local = [] refs = [] for index in range(dp_size): local_index = local_dp_ranks[index] @@ -231,6 +237,7 @@ def __init__( self.local_engine_actors.append(actor) else: self.remote_engine_actors.append(actor) + self.placement_group_is_local.append(local_client) refs.append(actor.wait_for_init.remote()) ray.get(refs) @@ -242,6 +249,9 @@ def __init__( def create_dp_placement_groups( vllm_config: VllmConfig ) -> tuple[list["PlacementGroup"], list[int]]: + """ + Create placement groups for data parallel. 
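+
+        Returns the created placement groups together with the local
+        data parallel rank assigned to each engine.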
+ """ import ray from ray._private.state import available_resources_per_node @@ -250,10 +260,11 @@ def create_dp_placement_groups( logger.info("Creating placement groups for data parallel") dp_master_ip = \ vllm_config.parallel_config.data_parallel_master_ip - dp_size = vllm_config.parallel_config.data_parallel_size + num_pg_to_create = vllm_config.parallel_config.data_parallel_size local_engine_count = \ vllm_config.parallel_config.data_parallel_size_local + nodes = list_nodes() nodes = sorted(list_nodes(), key=lambda node: node.node_ip != dp_master_ip) assert nodes[0].node_ip == dp_master_ip, ( @@ -293,7 +304,7 @@ def create_dp_placement_groups( local_dp_ranks.append(i) else: for i in range(available_engine_count): - if len(placement_groups) == dp_size: + if len(placement_groups) == num_pg_to_create: break bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] pg = ray.util.placement_group( @@ -305,6 +316,204 @@ def create_dp_placement_groups( local_dp_ranks.append(i) return placement_groups, local_dp_ranks + @staticmethod + def add_dp_placement_groups( + old_vllm_config: VllmConfig, new_data_parallel_size: int + ) -> tuple[list["PlacementGroup"], list[int]]: + """ + Add placement groups for new data parallel size. + """ + import ray + from ray._private.state import (available_resources_per_node, + total_resources_per_node) + from ray.util.state import list_nodes + + old_dp_size = old_vllm_config.parallel_config.data_parallel_size + num_pg_to_create = new_data_parallel_size - old_dp_size + + if num_pg_to_create <= 0: + return [], [] + + dp_master_ip = old_vllm_config.parallel_config.data_parallel_master_ip + world_size = old_vllm_config.parallel_config.world_size + + nodes = list_nodes() + nodes = sorted(nodes, key=lambda node: node.node_ip != dp_master_ip) + assert nodes[0].node_ip == dp_master_ip, ( + "The first node must be the head node") + assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, ( + "There can only be one head node") + + available_resources = available_resources_per_node() + total_resources = total_resources_per_node() + + placement_groups = [] + local_dp_ranks = [] + num_pg_created = 0 + + for node in nodes: + if num_pg_created >= num_pg_to_create: + break + + node_ip = node.node_ip + node_id = node.node_id + available_gpus = int(available_resources[node_id]["GPU"]) + + # Get total GPUs on this node from the node's resources + # Ray stores node resources with node ID as key + total_gpus = int(total_resources[node_id]["GPU"]) + + # Calculate used GPUs and used engines on this node + used_gpus = max(0, total_gpus - available_gpus) + used_engines_on_node = used_gpus // world_size + + # Calculate how many new engines this node can accommodate + available_engine_count = available_gpus // world_size + + # Create placement groups for new engines on this node + for i in range(available_engine_count): + if num_pg_created >= num_pg_to_create: + break + + rank = old_dp_size + num_pg_created + + # Create bundles with node constraint for master node + if node_ip == dp_master_ip: + bundles = [{ + "GPU": 1.0, + "node:" + dp_master_ip: 0.001 + }] * world_size + [{ + "CPU": 1.0 + }] + else: + bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] + + pg = ray.util.placement_group( + name=f"dp_rank_{rank}", + strategy="STRICT_PACK", + bundles=bundles, + ) + placement_groups.append(pg) + + # Local rank starts from the number of engines already used + # on this node + local_rank = used_engines_on_node + i + local_dp_ranks.append(local_rank) + num_pg_created += 1 + + return 
placement_groups, local_dp_ranks + + def scale_up_elastic_ep(self, cur_vllm_config: VllmConfig, + new_data_parallel_size: int) -> None: + import copy + + import ray + from ray.runtime_env import RuntimeEnv + from ray.util.scheduling_strategies import ( + PlacementGroupSchedulingStrategy) + + from vllm.v1.engine.core import DPEngineCoreActor + + cur_data_parallel_size = len(self.local_engine_actors) + \ + len(self.remote_engine_actors) + + assert new_data_parallel_size > cur_data_parallel_size, ( + f"New data parallel size {new_data_parallel_size} must be greater " + f"than current data parallel size {cur_data_parallel_size} " + "for scale up") + + placement_groups, local_dp_ranks = \ + self.add_dp_placement_groups( + cur_vllm_config, new_data_parallel_size) + + world_size = cur_vllm_config.parallel_config.world_size + dp_master_ip = cur_vllm_config.parallel_config.data_parallel_master_ip + new_local_engines = 0 + + runtime_env = RuntimeEnv(env_vars=self.env_vars_dict + | {"VLLM_ELASTIC_EP_SCALE_UP_LAUNCH": "1"}) + for i, (pg, + local_rank) in enumerate(zip(placement_groups, + local_dp_ranks)): + rank = cur_data_parallel_size + i + dp_vllm_config = copy.deepcopy(cur_vllm_config) + dp_vllm_config.parallel_config.data_parallel_size = \ + new_data_parallel_size + dp_vllm_config.parallel_config.placement_group = pg + + # Check if this placement group is on the head node + local_client = any( + bundle.get("node:" + dp_master_ip, 0) > 0 + for bundle in pg.bundle_specs) + + if local_client: + new_local_engines += 1 + # Update data_parallel_size_local + dp_vllm_config.parallel_config.data_parallel_size_local = ( + cur_vllm_config.parallel_config.data_parallel_size_local + + new_local_engines) + + actor = ray.remote(DPEngineCoreActor).options( + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + placement_group_bundle_index=world_size, + ), + runtime_env=runtime_env).remote( + vllm_config=dp_vllm_config, + executor_class=self.executor_class, + log_stats=self.log_stats, + local_client=local_client, + addresses=self.addresses, + dp_rank=rank, + local_dp_rank=local_rank) + + if local_client: + self.local_engine_actors.append(actor) + else: + self.remote_engine_actors.append(actor) + self.created_placement_groups.append(pg) + self.placement_group_is_local.append(local_client) + + ray.get([ + actor.wait_for_init.remote() + for actor in (self.local_engine_actors[-new_local_engines:] + if new_local_engines > 0 else []) + + self.remote_engine_actors[-(len(placement_groups) - + new_local_engines):] + ]) + + actors = (self.local_engine_actors[-new_local_engines:] + if new_local_engines > 0 else []) + \ + self.remote_engine_actors[-(len(placement_groups) - + new_local_engines):] + + for actor in actors: + self.run_refs.append(actor.run.remote()) + + cur_vllm_config.parallel_config.data_parallel_size = \ + new_data_parallel_size + # Update old_vllm_config with new data_parallel_size_local if any new + # local engines were added + if new_local_engines > 0: + cur_vllm_config.parallel_config.data_parallel_size_local += \ + new_local_engines + + def scale_down_elastic_ep(self, cur_data_parallel_size: int, + new_data_parallel_size: int) -> None: + import ray + assert cur_data_parallel_size > new_data_parallel_size, ( + f"cur_data_parallel_size {cur_data_parallel_size} must be greater " + f"than new_data_parallel_size {new_data_parallel_size} " + "for scale down") + for _ in range(cur_data_parallel_size - new_data_parallel_size): + pg = self.created_placement_groups.pop() + is_local = 
self.placement_group_is_local.pop() + if is_local: + self.local_engine_actors.pop() + else: + self.remote_engine_actors.pop() + ray.util.remove_placement_group(pg) + def get_run_refs(self): return self.run_refs diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index daca7c0faf66..eb659e4f9e47 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -6,6 +6,7 @@ from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput @@ -62,3 +63,11 @@ def execute_model( # When PP is used, we return a FutureWrapper immediately so that # the scheduler can yield to the next batch. return FutureWrapper(refs[0]) + + def reinitialize_distributed( + self, reconfig_request: ReconfigureDistributedRequest) -> None: + self._run_workers("reinitialize_distributed", reconfig_request) + if reconfig_request.new_data_parallel_rank == \ + ReconfigureRankType.SHUTDOWN_CURRENT_RANK: + self.shutdown() + return diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index c315dcb18325..136a9f08e829 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -49,7 +49,7 @@ def replace_tensor(obj: Any, cpu_attr_name: str, if k.endswith("_cpu") and isinstance(v, torch.Tensor): replace_tensor(self.input_batch.block_table, k, k[:-4]) - def load_model(self) -> None: + def load_model(self, eep_scale_up: bool = False) -> None: logger.info("Starting to load model %s...", self.model_config.model) self.model = get_model(vllm_config=self.vllm_config) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c3eeb6c2e390..06d0214c4d61 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1745,8 +1745,40 @@ def update_config(self, overrides: dict[str, Any]) -> None: new_config = update_config(config, config_overrides) setattr(self, config_name, new_config) - def load_model(self) -> None: + def load_model(self, eep_scale_up: bool = False) -> None: + """ + Args: + eep_scale_up: the model loading is for elastic EP scale up. 
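+                When True, the number of local physical experts and the
+                global expert load are received from the existing engine
+                cores before the weights are loaded.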
+ """ logger.info("Starting to load model %s...", self.model_config.model) + if eep_scale_up: + from vllm.distributed.parallel_state import get_ep_group + num_local_physical_experts = torch.empty(1, + dtype=torch.int32, + device="cpu") + torch.distributed.broadcast(num_local_physical_experts, + group=get_ep_group().cpu_group, + group_src=0) + num_local_physical_experts = int(num_local_physical_experts.item()) + new_ep_size = get_ep_group().world_size + global_expert_load, old_global_expert_indices = ( + EplbState.recv_state()) + num_logical_experts = global_expert_load.shape[1] + self.parallel_config.num_redundant_experts = ( + num_local_physical_experts * new_ep_size - num_logical_experts) + assert old_global_expert_indices.shape[ + 1] % num_local_physical_experts == 0 + old_ep_size = old_global_expert_indices.shape[ + 1] // num_local_physical_experts + rank_mapping = { + old_ep_rank: old_ep_rank + for old_ep_rank in range(old_ep_size) + } + else: + global_expert_load = None + old_global_expert_indices = None + rank_mapping = None + with DeviceMemoryProfiler() as m: # noqa: SIM117 time_before_load = time.perf_counter() model_loader = get_model_loader(self.load_config) @@ -1788,6 +1820,9 @@ def load_model(self) -> None: self.model, self.device, self.parallel_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, ) def save_tensorized_model( diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1610d0ecee2f..2201481fa5bf 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -26,6 +26,7 @@ from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors from vllm.utils import GiB_bytes, MemorySnapshot, memory_profiling +from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec from vllm.v1.outputs import EMPTY_MODEL_RUNNER_OUTPUT, ModelRunnerOutput from vllm.v1.utils import report_usage_stats @@ -191,8 +192,9 @@ def load_model(self) -> None: else: from contextlib import nullcontext context = nullcontext() + eep_scale_up = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1" with context: - self.model_runner.load_model() + self.model_runner.load_model(eep_scale_up=eep_scale_up) def update_config(self, overrides: dict[str, Any]) -> None: self.model_runner.update_config(overrides) @@ -384,6 +386,161 @@ def check_health(self) -> None: # worker will always be healthy as long as it's running. 
return + def _eplb_before_scale_down(self, old_ep_size: int, + new_ep_size: int) -> None: + from vllm.distributed.parallel_state import get_ep_group + if get_ep_group().rank == 0: + logger.info("[Elastic EP] Starting expert resharding " + "before scaling down...") + rank_mapping = { + old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1 + for old_ep_rank in range(old_ep_size) + } + assert self.model_runner.eplb_state is not None + self.model_runner.eplb_state.rearrange(self.model_runner.model, + execute_shuffle=True, + global_expert_load=None, + rank_mapping=rank_mapping) + torch.cuda.synchronize() + if get_ep_group().rank == 0: + logger.info("[Elastic EP] Expert resharding completed!") + + def _eplb_after_scale_up( + self, old_ep_size: int, new_ep_size: int, + global_expert_load: Optional[torch.Tensor]) -> None: + from vllm.distributed.parallel_state import get_ep_group + if get_ep_group().rank == 0: + logger.info("[Elastic EP] Starting expert resharding " + "after scaling up...") + rank_mapping = { + old_ep_rank: old_ep_rank + for old_ep_rank in range(old_ep_size) + } + assert self.model_runner.eplb_state is not None + self.model_runner.eplb_state.rearrange( + self.model_runner.model, + execute_shuffle=True, + global_expert_load=global_expert_load, + rank_mapping=rank_mapping) + if get_ep_group().rank == 0: + logger.info("[Elastic EP] Expert resharding completed!") + + def _reconfigure_parallel_config( + self, reconfig_request: ReconfigureDistributedRequest) -> None: + """ + Update parallel config with provided reconfig_request + """ + parallel_config = self.vllm_config.parallel_config + parallel_config.data_parallel_size = \ + reconfig_request.new_data_parallel_size + if reconfig_request.new_data_parallel_rank != \ + ReconfigureRankType.KEEP_CURRENT_RANK: + parallel_config.data_parallel_rank = \ + reconfig_request.new_data_parallel_rank + if reconfig_request.new_data_parallel_rank_local != \ + ReconfigureRankType.KEEP_CURRENT_RANK: + parallel_config.data_parallel_rank_local = \ + reconfig_request.new_data_parallel_rank_local + parallel_config.data_parallel_master_ip = \ + reconfig_request.new_data_parallel_master_ip + parallel_config.data_parallel_master_port = \ + reconfig_request.new_data_parallel_master_port + + def _reconfigure_moe(self, old_ep_size: int, + new_ep_size: int) -> Optional[torch.Tensor]: + """ + Reconfigure MoE modules with provided reconfig_request + + Return the global expert load if new_ep_size > old_ep_size, + otherwise None + """ + from vllm.distributed.parallel_state import ( + get_dp_group, get_ep_group, prepare_communication_buffer_for_model) + from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoEParallelConfig) + + parallel_config = self.vllm_config.parallel_config + moe_modules = [ + module for module in self.model_runner.model.modules() + if module.__class__.__name__ == "FusedMoE" + ] + num_local_experts = moe_modules[0].moe_config.num_local_experts + assert all(module.moe_config.num_local_experts == num_local_experts + for module in moe_modules), ( + "All MoE modules must have the same number of experts") + for module in moe_modules: + module.moe_config.num_experts = num_local_experts * new_ep_size + module.global_num_experts = module.moe_config.num_experts + module.moe_parallel_config = FusedMoEParallelConfig.make( + tp_size_=get_tp_group().world_size, + dp_size_=get_dp_group().world_size, + vllm_parallel_config=parallel_config, + ) + module.moe_config.moe_parallel_config = module.moe_parallel_config + if new_ep_size < old_ep_size: + 
num_local_physical_experts = num_local_experts + assert self.model_runner.eplb_state is not None + new_physical_experts = \ + self.model_runner.eplb_state.physical_to_logical_map.shape[1] + parallel_config.num_redundant_experts = ( + new_physical_experts - + self.model_runner.eplb_state.logical_replica_count.shape[1]) + global_expert_load = None + else: + num_local_physical_experts = torch.tensor([num_local_experts], + dtype=torch.int32, + device="cpu") + torch.distributed.broadcast(num_local_physical_experts, + group=get_ep_group().cpu_group, + group_src=0) + num_local_physical_experts = num_local_physical_experts.item() + new_physical_experts = num_local_physical_experts * new_ep_size + assert self.model_runner.eplb_state is not None + global_expert_load = self.model_runner.eplb_state.rearrange( + self.model_runner.model, execute_shuffle=False) + parallel_config.num_redundant_experts = ( + new_physical_experts - global_expert_load.shape[1]) + prepare_communication_buffer_for_model(self.model_runner.model) + self.model_runner.model.update_physical_experts_metadata( + num_physical_experts=new_physical_experts, + num_local_physical_experts=num_local_physical_experts) + return global_expert_load + + def reinitialize_distributed( + self, reconfig_request: ReconfigureDistributedRequest) -> None: + from vllm.config import set_current_vllm_config + from vllm.distributed.parallel_state import ( + cleanup_dist_env_and_memory, get_ep_group) + + old_ep_size = get_ep_group().world_size + old_ep_rank = get_ep_group().rank + new_ep_size = reconfig_request.new_data_parallel_size * get_tp_group( + ).world_size * get_pp_group().world_size + if new_ep_size < old_ep_size: + self._eplb_before_scale_down(old_ep_size, new_ep_size) + + cleanup_dist_env_and_memory() + + if reconfig_request.new_data_parallel_rank == \ + ReconfigureRankType.SHUTDOWN_CURRENT_RANK: + assert old_ep_rank >= new_ep_size + # shutdown + return + + self._reconfigure_parallel_config(reconfig_request) + + with set_current_vllm_config(self.vllm_config): + init_worker_distributed_environment(self.vllm_config, self.rank, + self.distributed_init_method, + self.local_rank) + + global_expert_load = self._reconfigure_moe(old_ep_size, new_ep_size) + + if new_ep_size > old_ep_size: + assert global_expert_load is not None + self._eplb_after_scale_up(old_ep_size, new_ep_size, + global_expert_load) + def save_sharded_state( self, path: str, From 466e878f2ad5e36cba4861db1cac7cd0d92055fb Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 19 Jul 2025 08:52:02 +0800 Subject: [PATCH 05/57] [Quantization] Enable BNB support for more MoE models (#21100) Signed-off-by: Jee Jee Li --- docs/models/supported_models.md | 8 +- vllm/model_executor/models/bailing_moe.py | 21 +- vllm/model_executor/models/ernie45_moe.py | 153 +++++++------- vllm/model_executor/models/grok1.py | 24 ++- vllm/model_executor/models/hunyuan_v1_moe.py | 198 ++++++++++--------- 5 files changed, 223 insertions(+), 181 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 8fd8b8220cf7..cfd525ab9314 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -316,7 +316,7 @@ Specified using `--task generate`. | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `ArcticForCausalLM` | Arctic | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. 
| | ✅︎ | ✅︎ | | `BaiChuanForCausalLM` | Baichuan2, Baichuan | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | | ✅︎ | ✅︎ | +| `BailingMoeForCausalLM` | Ling | `inclusionAI/Ling-lite-1.5`, `inclusionAI/Ling-plus`, etc. | ✅︎ | ✅︎ | ✅︎ | | `BambaForCausalLM` | Bamba | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B` | ✅︎ | ✅︎ | ✅︎ | | `BloomForCausalLM` | BLOOM, BLOOMZ, BLOOMChat | `bigscience/bloom`, `bigscience/bloomz`, etc. | | ✅︎ | | | `BartForConditionalGeneration` | BART | `facebook/bart-base`, `facebook/bart-large-cnn`, etc. | | | | @@ -328,8 +328,8 @@ Specified using `--task generate`. | `DeepseekV2ForCausalLM` | DeepSeek-V2 | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc. | | ✅︎ | ✅︎ | | `DeepseekV3ForCausalLM` | DeepSeek-V3 | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc. | | ✅︎ | ✅︎ | | `Dots1ForCausalLM` | dots.llm1 | `rednote-hilab/dots.llm1.base`, `rednote-hilab/dots.llm1.inst`, etc. | | ✅︎ | ✅︎ | -| `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | | ✅︎ | ✅︎ | -| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. | | ✅︎ | ✅︎ | +| `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | @@ -351,7 +351,7 @@ Specified using `--task generate`. | `GraniteMoeSharedForCausalLM` | Granite MoE Shared | `ibm-research/moe-7b-1b-active-shared-experts` (test model) | ✅︎ | ✅︎ | ✅︎ | | `GritLM` | GritLM | `parasail-ai/GritLM-7B-vllm`. | ✅︎ | ✅︎ | | | `Grok1ModelForCausalLM` | Grok1 | `hpcai-tech/grok-1`. | ✅︎ | ✅︎ | ✅︎ | -| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | | | ✅︎ | +| `HunYuanMoEV1ForCausalLM` | Hunyuan-80B-A13B | `tencent/Hunyuan-A13B-Instruct`, `tencent/Hunyuan-A13B-Pretrain`, `tencent/Hunyuan-A13B-Instruct-FP8`, etc. | ✅︎ | | ✅︎ | | `InternLMForCausalLM` | InternLM | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM2ForCausalLM` | InternLM2 | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. | ✅︎ | ✅︎ | ✅︎ | | `InternLM3ForCausalLM` | InternLM3 | `internlm/internlm3-8b-instruct`, etc. 
| ✅︎ | ✅︎ | ✅︎ | diff --git a/vllm/model_executor/models/bailing_moe.py b/vllm/model_executor/models/bailing_moe.py index ccfc3997e45c..853c13b135ea 100644 --- a/vllm/model_executor/models/bailing_moe.py +++ b/vllm/model_executor/models/bailing_moe.py @@ -53,7 +53,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -374,6 +374,14 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ @@ -381,14 +389,10 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts) params_dict = dict(self.named_parameters(remove_duplicate=False)) loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: if self.config.norm_head and "lm_head.weight" in name: loaded_weight = F.normalize(loaded_weight, @@ -449,7 +453,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class BailingMoeForCausalLM(nn.Module, SupportsPP): +class BailingMoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): packed_modules_mapping = { "query_key_value": ["query_key_value"], @@ -518,3 +522,6 @@ def load_weights(self, weights: Iterable[tuple[str, if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/ernie45_moe.py b/vllm/model_executor/models/ernie45_moe.py index e7a50ff7a1c9..984003e62d11 100644 --- a/vllm/model_executor/models/ernie45_moe.py +++ b/vllm/model_executor/models/ernie45_moe.py @@ -51,8 +51,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsPP -from .utils import (PPMissingLayer, extract_layer_index, +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) @@ -427,66 +427,15 @@ def forward( return hidden_states + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: -class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - fall_back_to_pt_during_load = False - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - 
self.model = Ernie4_5_MoeModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - if get_pp_group().is_last_rank: - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, - quant_config=quant_config) - else: - self.lm_head = PPMissingLayer() - - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - hidden_states = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.moe_num_experts) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: @@ -499,16 +448,9 @@ def load_weights(self, weights: Iterable[tuple[str, ("gate_up_proj", "up_proj", 1), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.moe_num_experts) - params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: if self.config.tie_word_embeddings and name.endswith( "lm_head.weight"): @@ -581,3 +523,76 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class Ernie4_5_MoeForCausalLM(nn.Module, SupportsPP, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Ernie4_5_MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return 
self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 2d930527b2be..3659249cd8bd 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -360,6 +360,16 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + # Map Grok1's unique expert parameter names to standard names + # Grok1 uses "num_experts" in its config + num_experts = getattr(self.config, "num_experts", 8) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="linear", # Grok1 specific + ckpt_down_proj_name="linear_1", # Grok1 specific + ckpt_up_proj_name="linear_v", # Grok1 specific + num_experts=num_experts) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: stacked_params_mapping = [ @@ -369,18 +379,9 @@ def load_weights(self, weights: Iterable[tuple[str, ("qkv_proj", "v_proj", "v"), ] - # Map Grok1's unique expert parameter names to standard names - # Grok1 uses "num_experts" in its config - num_experts = getattr(self.config, "num_experts", 8) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="linear", # Grok1 specific - ckpt_down_proj_name="linear_1", # Grok1 specific - ckpt_up_proj_name="linear_v", # Grok1 specific - num_experts=num_experts) - params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() - + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -544,3 +545,6 @@ def load_weights(self, weights: Iterable[tuple[str, skip_prefixes=skip_prefixes, ) return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() diff --git a/vllm/model_executor/models/hunyuan_v1_moe.py b/vllm/model_executor/models/hunyuan_v1_moe.py index 43ffba00721e..b3baec98b0fc 100644 --- a/vllm/model_executor/models/hunyuan_v1_moe.py +++ b/vllm/model_executor/models/hunyuan_v1_moe.py @@ -56,7 +56,9 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from .interfaces import SupportsLoRA +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, + make_layers) def _get_cla_factor(config: PretrainedConfig) -> int: @@ -617,86 +619,6 @@ def forward( 
hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - -class HunYuanMoEV1ForCausalLM(nn.Module): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": [ - "gate_proj", - "up_proj", - ], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config - self.config = config - self.quant_config = quant_config - self.lora_config = lora_config - - self.model = HunYuanModel(vllm_config=vllm_config, prefix="model") - if get_pp_group().is_last_rank: - self.unpadded_vocab_size = config.vocab_size - if lora_config: - self.unpadded_vocab_size += lora_config.lora_extra_vocab_size - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - quant_config=quant_config, - ) - if config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, - logit_scale) - else: - self.lm_head = PPMissingLayer() - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - model_output = self.model(input_ids, positions, intermediate_tensors, - inputs_embeds) - return model_output - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - return logits - - def make_empty_intermediate_tensors( - self, batch_size: int, dtype: torch.dtype, - device: torch.device) -> IntermediateTensors: - return IntermediateTensors({ - "hidden_states": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - "residual": - torch.zeros((batch_size, self.config.hidden_size), - dtype=dtype, - device=device), - }) - def _split_qkv_weight(self, qkv: torch.Tensor): num_attention_heads = self.config.num_attention_heads num_kv_heads = getattr(self.config, "num_key_value_heads", @@ -719,6 +641,17 @@ def _split_qkv_weight(self, qkv: torch.Tensor): v = v.reshape(-1, hidden_size) return torch.concat((q, k, v)) + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + return FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.num_experts, + ) + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): cla_factor = _get_cla_factor(self.config) stacked_params_mapping = [ @@ -745,16 +678,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): ), ] - # Params for weights, fp8 weight scales, fp8 activation scales - # (param_name, weight_name, expert_id, shard_id) - expert_params_mapping = FusedMoE.make_expert_params_mapping( - ckpt_gate_proj_name="gate_proj", - ckpt_down_proj_name="down_proj", - ckpt_up_proj_name="up_proj", - num_experts=self.config.num_experts, - ) - params_dict = dict(self.named_parameters()) + loaded_params: set[str] = 
set() + expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -806,7 +732,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) - + loaded_params.add(name) is_found = True break if is_found: @@ -885,3 +811,93 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class HunYuanMoEV1ForCausalLM(nn.Module, SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + + self.model = HunYuanModel(vllm_config=vllm_config, prefix="model") + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + else: + self.lm_head = PPMissingLayer() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() From 9a9fda1423c96aa8ea62a56e8f1ad88fc080ae2c Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Sat, 19 Jul 2025 11:48:38 +0800 Subject: [PATCH 06/57] [Core] Support Local Chunked Attention for Hybrid KV Cache (#19351) Signed-off-by: Lucia Fang Signed-off-by: Lu Fang Signed-off-by: Lu Fang Co-authored-by: Lu Fang --- tests/v1/core/test_specialized_manager.py | 157 ++++++++++++++++++- vllm/attention/layer.py | 1 + vllm/config.py | 7 + 
vllm/v1/attention/backends/flash_attn.py | 3 +- vllm/v1/attention/backends/utils.py | 1 + vllm/v1/core/kv_cache_utils.py | 19 ++- vllm/v1/core/single_type_kv_cache_manager.py | 125 ++++++++++++++- vllm/v1/kv_cache_interface.py | 49 ++++-- vllm/v1/worker/gpu_model_runner.py | 8 + 9 files changed, 351 insertions(+), 19 deletions(-) diff --git a/tests/v1/core/test_specialized_manager.py b/tests/v1/core/test_specialized_manager.py index a9e1898df934..b67c05bd7ac1 100644 --- a/tests/v1/core/test_specialized_manager.py +++ b/tests/v1/core/test_specialized_manager.py @@ -1,13 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import random + import torch from vllm.v1.core.block_pool import BlockPool from vllm.v1.core.kv_cache_utils import (BlockHash, BlockHashWithGroupId, KVCacheBlock) -from vllm.v1.core.single_type_kv_cache_manager import SlidingWindowManager -from vllm.v1.kv_cache_interface import SlidingWindowSpec +from vllm.v1.core.single_type_kv_cache_manager import ( + ChunkedLocalAttentionManager, SlidingWindowManager) +from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, + SlidingWindowSpec) def get_sliding_window_manager(sliding_window_spec, block_pool): @@ -17,6 +21,80 @@ def get_sliding_window_manager(sliding_window_spec, block_pool): kv_cache_group_id=0) +def get_chunked_local_attention_manager(chunked_local_attention_spec, + block_pool): + return ChunkedLocalAttentionManager(chunked_local_attention_spec, + block_pool, + caching_hash_fn=lambda x: x, + kv_cache_group_id=0) + + +def test_chunked_local_attention_possible_cached_prefix(): + block_size = 2 + chunked_local_attention_spec = ChunkedLocalAttentionSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + attention_chunk_size=4, + use_mla=False, + ) + + block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + manager = get_chunked_local_attention_manager(chunked_local_attention_spec, + block_pool) + + def run_one_case(block_is_cached, tail_token, expect_length): + block_hash_list = [ + BlockHash(i, ()) for i in range(len(block_is_cached)) + ] + + block_pool.cached_block_hash_to_block.clear() + + # Mock the block pool with the cached blocks + for i, (block_hash, + is_cached) in enumerate(zip(block_hash_list, block_is_cached)): + if is_cached: + block_pool.cached_block_hash_to_block[BlockHashWithGroupId( + block_hash, 0)] = { + i: block_pool.blocks[i + 10], + } + + computed_blocks = manager.find_longest_cache_hit( + block_hashes=block_hash_list, + max_length=len(block_hash_list) * block_size + tail_token, + kv_cache_group_ids=[0], + block_pool=block_pool, + kv_cache_spec=chunked_local_attention_spec, + use_eagle=False)[0] + assert len(computed_blocks) == expect_length + + assert all(block == block_pool.null_block + for block in computed_blocks[:(expect_length - 1) // 2]) + + run_one_case([True], 0, 1) + run_one_case([True], 1, 1) + run_one_case([True, False], 0, 2) + run_one_case([True, False], 1, 2) + run_one_case([True, True], 0, 2) + run_one_case([True, True], 1, 2) + run_one_case([True, True, False], 0, 2) + run_one_case([True, True, False], 1, 2) + run_one_case([True, True, True], 0, 3) + run_one_case([True, True, True], 1, 3) + run_one_case([True, True, True, False], 0, 4) + run_one_case([True, True, True, False], 1, 4) + run_one_case([random.choice([True, False])] * 8 + [True], 1, 9) + run_one_case([random.choice([True, False])] * 8 + [False], 1, 8) + run_one_case([random.choice([True, False])] * 8 + 
[True, True], 1, 10) + run_one_case([random.choice([True, False])] * 8 + [True, False], 0, 10) + run_one_case([random.choice([True, False])] * 8 + [True, False], 1, 10) + run_one_case([random.choice([True, False])] * 8 + [False, True], 0, 10) + run_one_case([random.choice([True, False])] * 8 + [False, True], 1, 10) + run_one_case([random.choice([True, False])] * 8 + [False, False], 0, 10) + run_one_case([random.choice([True, False])] * 8 + [False, False], 1, 10) + + def test_sliding_window_possible_cached_prefix(): block_size = 2 sliding_window_spec = SlidingWindowSpec( @@ -84,6 +162,58 @@ def run_one_case(block_is_cached, expect_length): ], 8) +def test_chunked_local_attention_remove_skipped_blocks(): + attention_spec = ChunkedLocalAttentionSpec( + block_size=2, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + attention_chunk_size=4, + use_mla=False, + ) + + block_pool = BlockPool(num_gpu_blocks=2000, enable_caching=True) + + manager = get_chunked_local_attention_manager(attention_spec, block_pool) + + null_block_id = block_pool.null_block.block_id + + def id_to_block_table(ids) -> list[KVCacheBlock]: + return [ + KVCacheBlock(id_) + if id_ != null_block_id else block_pool.null_block for id_ in ids + ] + + def assert_block_id(block_table: list[KVCacheBlock], ids: list[int]): + for block, id_ in zip(block_table, ids): + if id_ == null_block_id: + assert block == block_pool.null_block + else: + assert block.block_id == id_ + + original_block_ids = [ + 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010 + ] + block_table = id_to_block_table(original_block_ids) + manager.req_to_blocks["test"] = block_table + + manager.remove_skipped_blocks("test", 0) + assert_block_id(block_table, original_block_ids) + + # For 4th token (0-indexed), token 0-3 is out of the local attention window. + manager.remove_skipped_blocks("test", 4) + assert_block_id(block_table, [null_block_id] * 2) + + # For 6th token (0-indexed), token 4 - 6 are in local attention window, + # token 0 - 3 are out, 2 blocks can be removed. + manager.remove_skipped_blocks("test", 6) + assert_block_id(block_table, [null_block_id] * 2 + original_block_ids[2:]) + # For 12th token (0-indexed), + # token 0-11 are out, 6 block can be removed. 
+ manager.remove_skipped_blocks("test", 12) + assert_block_id(block_table, [null_block_id] * 6) + + def test_sliding_window_remove_skipped_blocks(): sliding_window_spec = SlidingWindowSpec( block_size=2, @@ -172,3 +302,26 @@ def test_get_num_blocks_to_allocate(): cached_blocks_1) == 20 assert manager.get_num_blocks_to_allocate("2", 20 * block_size, cached_blocks_2) == 15 + + +def test_chunked_local_attention_get_num_blocks_to_allocate(): + block_size = 2 + attention_spec = ChunkedLocalAttentionSpec( + block_size=block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + attention_chunk_size=4, # Placeholder value, not related to test result + use_mla=False, + ) + + block_pool = BlockPool(num_gpu_blocks=100, enable_caching=True) + manager = get_chunked_local_attention_manager(attention_spec, block_pool) + cached_blocks_1 = [KVCacheBlock(i + 1) for i in range(10)] + cached_blocks_2 = [block_pool.null_block for _ in range(5) + ] + [KVCacheBlock(i + 1) for i in range(5)] + + assert manager.get_num_blocks_to_allocate("1", 20 * block_size, + cached_blocks_1) == 20 + assert manager.get_num_blocks_to_allocate("2", 20 * block_size, + cached_blocks_2) == 15 diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index b6b93ff4a0ac..d0677525d310 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -172,6 +172,7 @@ def __init__( kv_sharing_target_layer_name, **extra_impl_args) self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype + self.use_irope = extra_impl_args.get("use_irope", False) # For cuda-alike (CUDA and ROCM) and cpu platforms, we control how # torch.compile works by registering the attention as one giant diff --git a/vllm/config.py b/vllm/config.py index a415683f4e79..7ae9b1b7fd02 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4722,6 +4722,13 @@ def __post_init__(self): if self.kv_events_config is not None: # Hybrid KV cache manager is not compatible with KV events. self.scheduler_config.disable_hybrid_kv_cache_manager = True + if self.model_config is not None and \ + self.model_config.attention_chunk_size is not None and \ + self.speculative_config is not None and \ + self.speculative_config.use_eagle(): + # Hybrid KV cache manager is not yet supported with chunked + # local attention + eagle. + self.scheduler_config.disable_hybrid_kv_cache_manager = True def update_sizes_for_sequence_parallelism(self, possible_sizes: list) -> list: diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d5b30ac685ac..a37bf2a7115b 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -538,6 +538,7 @@ def use_cascade_attention( num_kv_heads: int, use_alibi: bool, use_sliding_window: bool, + use_local_attention: bool, num_sms: int, ) -> bool: """Decide whether to use cascade attention. @@ -553,7 +554,7 @@ def use_cascade_attention( if common_prefix_len < 256: return False # Cascade attention is currently not supported with these variants. - if use_alibi or use_sliding_window: + if use_alibi or use_sliding_window or use_local_attention: return False # Too few queries. Probably not worth using cascade attention. # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. 
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index b6a06b17bca2..65c3baa6784f 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -120,6 +120,7 @@ def use_cascade_attention( num_kv_heads: int, use_alibi: bool, use_sliding_window: bool, + use_local_attention: bool, num_sms: int, ) -> bool: return False diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index b1fab0d34de4..457d95cc738b 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -11,7 +11,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.utils import GiB_bytes, cdiv, sha256_cbor_64bit -from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, +from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec, + FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec, KVCacheSpec, KVCacheTensor, SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats @@ -976,7 +977,11 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: isinstance(spec, FullAttentionSpec) for spec in kv_cache_spec.values()) has_sliding_window = any( isinstance(spec, SlidingWindowSpec) for spec in kv_cache_spec.values()) - if has_full_attention and has_sliding_window: + has_chunked_local_attention = any( + isinstance(spec, ChunkedLocalAttentionSpec) + for spec in kv_cache_spec.values()) + if has_full_attention and (has_sliding_window + or has_chunked_local_attention): for layer_name, spec in kv_cache_spec.items(): if isinstance(spec, SlidingWindowSpec): kv_cache_spec[layer_name] = FullAttentionSpec( @@ -987,6 +992,15 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: use_mla=spec.use_mla, sliding_window=spec.sliding_window, ) + elif isinstance(spec, ChunkedLocalAttentionSpec): + kv_cache_spec[layer_name] = FullAttentionSpec( + block_size=spec.block_size, + num_kv_heads=spec.num_kv_heads, + head_size=spec.head_size, + dtype=spec.dtype, + use_mla=spec.use_mla, + attention_chunk_size=spec.attention_chunk_size, + ) if is_hybrid(kv_cache_spec): raise ValueError("Hybrid KV cache manager is disabled but failed to " @@ -1010,7 +1024,6 @@ def get_kv_cache_config( The generated KVCacheConfigs """ check_enough_kv_cache_memory(vllm_config, kv_cache_spec, available_memory) - if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager: unify_hybrid_kv_cache_specs(kv_cache_spec) diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 1560406c9004..65a196e044ab 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -394,6 +394,129 @@ def get_num_common_prefix_blocks(self, request_id: str, return 0 +class ChunkedLocalAttentionManager(SingleTypeKVCacheManager): + + def __init__(self, kv_cache_spec: ChunkedLocalAttentionSpec, + block_pool: BlockPool, **kwargs) -> None: + super().__init__(kv_cache_spec, block_pool, **kwargs) + self.attention_chunk_size = kv_cache_spec.attention_chunk_size + self._null_block = block_pool.null_block + + @classmethod + def find_longest_cache_hit( + cls, + block_hashes: list[BlockHash], + max_length: int, + kv_cache_group_ids: list[int], + block_pool: BlockPool, + kv_cache_spec: KVCacheSpec, + use_eagle: bool, + ) -> tuple[list[KVCacheBlock], ...]: + """ + For chunked local attention, we need to find the longest cache hit + prefix of the blocks that is not longer than `max_length`. 
The prefix + should be a common prefix hit for all the kv cache groups in + `kv_cache_group_ids`. If no cache hit is found, return an empty list. + note we mark as computed if the whole block is outside of the local + window, and set the block as null. Examples: + + 1. Attention chunk size of 8, block size of 4, max length of 15 + for next token at 15th (zero-indexed), 8th - 14th tokens are in + the window(needs lookup), 0th - 7th are not in the window, + so they are already marked as computed. We check the complete + block3 (8th - 11th tokens), Assume block 3 is hit, we will return + [null, null, block 3], otherwise, we return [null, null] + + 2. Attention chunk size of 8, block size of 4, max length of 16 + for next token at 16th (zero-indexed), 0th - 15th tokens are not + in the window, so they are already marked as computed. + we return 4 blocks[null, null, null, null] + + Args: + block_hashes: The block hashes of the request. + max_length: The maximum length of the cache hit prefix. + kv_cache_group_ids: The ids of the kv cache groups. + block_pool: The block pool. + kv_cache_spec: The kv cache spec. + use_eagle: Whether to use eagle. + + Returns: + A list of cached blocks + """ + assert isinstance(kv_cache_spec, ChunkedLocalAttentionSpec), ( + "ChunkedLocalAttentionManager can only be used for " + + "chunked local attention groups") + assert use_eagle is False, ("Hybrid KV cache is not supported for " + + "eagle + chunked local attention.") + max_num_blocks = max_length // kv_cache_spec.block_size + if max_length > 0: + local_attention_start_idx = (max_length // + kv_cache_spec.attention_chunk_size * + kv_cache_spec.attention_chunk_size) + else: + local_attention_start_idx = 0 + # we marked blocks out of window as computed + # with null blocks, and blocks inside window based on cache lookup + # result [null] [null] ... [null] [hit block 1 (1st block contain + # last window)] [hit block 2] ... [hit block x] + local_attention_start_block_idx = (local_attention_start_idx // + kv_cache_spec.block_size) + computed_blocks: tuple[list[KVCacheBlock], ...] = tuple( + [block_pool.null_block] * local_attention_start_block_idx + for _ in range(len(kv_cache_group_ids))) + for i in range(local_attention_start_block_idx, max_num_blocks): + block_hash = block_hashes[i] + if cached_block := block_pool.get_cached_block( + block_hash, kv_cache_group_ids): + for computed, cached in zip(computed_blocks, cached_block): + computed.append(cached) + else: + break + return computed_blocks + + def remove_skipped_blocks(self, request_id: str, + num_computed_tokens: int) -> None: + # Remove the blocks that are no longer be in the chunked attention + # window and skipped during the attention computation. + + # [chunk 0][chunk 1]local_attention_start_idx ... current + # we computed previous number of chunks to get the idx of + # current chunk window starting offset, + # e.g. for computed 1024 tokens, the 1024th token (0 indexed) + # is in the second chunk, there are 1 prev chunk, the start idx + # is 1024. for 1023, it will be 0. 
+ num_cached_block = self.num_cached_block.get(request_id, 0) + local_attention_start_idx = ( + num_computed_tokens + ) // self.attention_chunk_size * self.attention_chunk_size + first_useful_block_idx = local_attention_start_idx // self.block_size + if num_cached_block > 0: + # Make sure we don't delete the last cached block + first_useful_block_idx = min(first_useful_block_idx, + num_cached_block - 1) + # if block size = 128, 0 -> block 0, 1024 (= 128 * 8) -> + # block 8, 372 (= 128 * 2 + 116) -> block 2 + blocks = self.req_to_blocks[request_id] + removed_blocks: list[KVCacheBlock] = [] + # we need to keep the last block to get the previous hash key + for i in range(first_useful_block_idx - 1, -1, -1): + if blocks[i] == self._null_block: + # If the block is already a null block, the blocks before it + # should also have been set to null blocks by the previous calls + # to this function. + break + removed_blocks.append(blocks[i]) + blocks[i] = self._null_block + self.block_pool.free_blocks(removed_blocks) + + def get_num_common_prefix_blocks(self, request_id: str, + num_running_requests: int) -> int: + """ + cascade attention is not supported by chunked local attention. + """ + return 0 + + class MambaManager(SingleTypeKVCacheManager): @classmethod @@ -435,8 +558,8 @@ def allocate_new_blocks(self, request_id: str, spec_manager_map: dict[type[KVCacheSpec], type[SingleTypeKVCacheManager]] = { FullAttentionSpec: FullAttentionManager, - ChunkedLocalAttentionSpec: FullAttentionManager, SlidingWindowSpec: SlidingWindowManager, + ChunkedLocalAttentionSpec: ChunkedLocalAttentionManager, MambaSpec: MambaManager, } diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 6726709955f7..bec31a7a058d 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -87,6 +87,7 @@ def page_size_bytes(self) -> int: @dataclass class FullAttentionSpec(AttentionSpec): sliding_window: Optional[int] = None + attention_chunk_size: Optional[int] = None """ When hybrid allocator is disabled and the model contains both full attention layers and sliding window attention layers, sliding @@ -105,6 +106,17 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes + @classmethod + def merge_window_sizes(cls, window_sizes: set[int]) -> Optional[int]: + if len(window_sizes) == 0: + return None + elif len(window_sizes) == 1: + return window_sizes.pop() + else: + raise ValueError( + "All attention layers in the same KV cache group must have the " + "same window size.") + @classmethod def merge(cls, specs: list[Self]) -> Self: """ @@ -114,14 +126,17 @@ def merge(cls, specs: list[Self]) -> Self: merged_spec = super().merge(specs) sliding_window = set(spec.sliding_window for spec in specs if spec.sliding_window is not None) - if len(sliding_window) == 0: - merged_spec.sliding_window = None - elif len(sliding_window) == 1: - merged_spec.sliding_window = sliding_window.pop() - else: - raise ValueError( - "All sliding window layers in the same KV cache group " - "must have the same window size.") + attention_chunk_size = set(spec.attention_chunk_size for spec in specs + if spec.attention_chunk_size is not None) + + merged_spec.sliding_window = cls.merge_window_sizes(sliding_window) + merged_spec.attention_chunk_size = ( + cls.merge_window_sizes(attention_chunk_size)) + assert ( + (merged_spec.sliding_window is not None) + + (merged_spec.attention_chunk_size 
is not None) <= 1 + ), ("Model with both sliding window layers and chunked local attention " + "layers is not supported.") return merged_spec @@ -129,16 +144,26 @@ def merge(cls, specs: list[Self]) -> Self: class ChunkedLocalAttentionSpec(AttentionSpec): attention_chunk_size: int - def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: - max_model_len = vllm_config.model_config.max_model_len - return cdiv(max_model_len, self.block_size) * self.page_size_bytes - @property def type_id(self) -> str: return ( f"local_attention_{self.attention_chunk_size}_{self.block_size}_{self.page_size_bytes}" ) # noqa + def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: + max_model_len = vllm_config.model_config.max_model_len + max_num_batched_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens) + + # During chunked prefill, we allocate KV cache for at most + # `self.attention_chunk_size` computed tokens plus the newly scheduled + # tokens. And we won't allocate KV cache for more than `max_model_len` + # tokens. + num_tokens = min(self.attention_chunk_size + max_num_batched_tokens, + max_model_len) + + return cdiv(num_tokens, self.block_size) * self.page_size_bytes + @dataclass class SlidingWindowSpec(AttentionSpec): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 06d0214c4d61..9620bf6a7957 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -862,6 +862,10 @@ def _compute_cascade_attn_prefix_len( use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or (isinstance(kv_cache_spec, FullAttentionSpec) and kv_cache_spec.sliding_window is not None)) + use_local_attention = ( + isinstance(kv_cache_spec, ChunkedLocalAttentionSpec) + or (isinstance(kv_cache_spec, FullAttentionSpec) + and kv_cache_spec.attention_chunk_size is not None)) assert isinstance(kv_cache_spec, AttentionSpec) use_cascade = attn_metadata_builder.use_cascade_attention( common_prefix_len=common_prefix_len, @@ -870,6 +874,7 @@ def _compute_cascade_attn_prefix_len( num_kv_heads=kv_cache_spec.num_kv_heads, use_alibi=self.use_alibi, use_sliding_window=use_sliding_window, + use_local_attention=use_local_attention, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 @@ -2672,6 +2677,9 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, use_mla=use_mla) + assert not use_local_attention, ( + "attention module can not be with ", + "both local attention and sliding window") elif use_local_attention: kv_cache_spec[layer_name] = (ChunkedLocalAttentionSpec( block_size=block_size, From 9ffe905a4154d3ac373b5254fab72c995562137f Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 19 Jul 2025 09:45:03 +0530 Subject: [PATCH 07/57] [Bugfix][Model] Fix LoRA for Mistral-Small-3.1-24B-Instruct-2503 (#21183) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- vllm/lora/models.py | 19 +++++++++++++++++-- vllm/lora/utils.py | 16 ++++++++++------ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 521bb079da41..633674d5fb29 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -498,6 +498,14 @@ def remove_all_adapters(self): self._active_adapters.clear() def _create_lora_modules(self): + + def _parent_module(module_name: str) -> str: + # module name is a dot separated name. 
+ # for example: + # - given an input 'x.y.z' return 'x.y' + # - given an input 'x' return '' + return module_name.rpartition('.')[0] + for module_name, module in self.model.named_modules( remove_duplicate=False): if isinstance(module, PPMissingLayer): @@ -529,10 +537,17 @@ def _create_lora_modules(self): new_module.scaling_factor_to_offset # (yard1): TODO make this more robust if "lm_head" in module_name: + logits_processor_module_name = 'logits_processor' + parent_module = _parent_module(module_name) + if parent_module: + logits_processor_module_name = ( + f"{parent_module}.{logits_processor_module_name}") + logits_processor_module = self.model.get_submodule( - "logits_processor") + logits_processor_module_name) + new_module = replace_submodule( - self.model, "logits_processor", + self.model, logits_processor_module_name, from_layer_logits_processor(logits_processor_module, module, self.lora_slots, self.lora_config, diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 6b3291e9c92f..7148ffe14948 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -188,16 +188,20 @@ def get_supported_lora_modules(model: nn.Module) -> list[str]: """ In vLLM, all linear layers support LoRA. """ + supported_lora_modules: set[str] = set() - # step1: traverse the model to get all the linear subfixes. for name, module in model.named_modules(): + # get the embedding modules if the module's embedding_modules + # is not empty. + embedding_modules = getattr(module, "embedding_modules", None) + if embedding_modules is not None: + for name in embedding_modules: + supported_lora_modules.add(name) + + # get all the linear subfixes. if isinstance(module, (LinearBase, )): supported_lora_modules.add(name.split(".")[-1]) - # step 2: get the embedding modules if the model's mbedding_modules - # is not empty. 
- if model.embedding_modules: - for name in model.embedding_modules: - supported_lora_modules.add(name) + return list(supported_lora_modules) From dd572c0ab3effa539b74f9a1288bb61ce83ada76 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 18 Jul 2025 21:47:50 -0700 Subject: [PATCH 08/57] [V0 Deprecation] Remove V0 Spec Decode workers (#21152) Signed-off-by: Woosuk Kwon --- .buildkite/test-pipeline.yaml | 14 - .github/CODEOWNERS | 1 - .github/mergify.yml | 3 - pyproject.toml | 1 - tests/core/test_serialization.py | 2 +- tests/core/utils.py | 134 +- tests/metrics/test_metrics.py | 146 -- tests/models/registry.py | 8 +- tests/models/test_registry.py | 14 +- tests/samplers/test_rejection_sampler.py | 577 ------- .../test_typical_acceptance_sampler.py | 480 ------ tests/spec_decode/__init__.py | 0 tests/spec_decode/conftest.py | 12 - tests/spec_decode/e2e/__init__.py | 0 tests/spec_decode/e2e/conftest.py | 307 ---- tests/spec_decode/e2e/test_compatibility.py | 66 - .../spec_decode/e2e/test_eagle_correctness.py | 480 ------ tests/spec_decode/e2e/test_integration.py | 161 -- .../e2e/test_integration_dist_tp2.py | 247 --- .../e2e/test_integration_dist_tp4.py | 123 -- tests/spec_decode/e2e/test_logprobs.py | 315 ---- .../e2e/test_medusa_correctness.py | 417 ------ tests/spec_decode/e2e/test_mlp_correctness.py | 533 ------- tests/spec_decode/e2e/test_mtp_correctness.py | 333 ----- .../e2e/test_multistep_correctness.py | 842 ----------- .../spec_decode/e2e/test_ngram_correctness.py | 392 ----- tests/spec_decode/e2e/test_seed.py | 70 - tests/spec_decode/test_batch_expansion.py | 110 -- tests/spec_decode/test_dynamic_spec_decode.py | 90 -- tests/spec_decode/test_memory_usage.py | 91 -- tests/spec_decode/test_metrics.py | 205 --- tests/spec_decode/test_multi_step_worker.py | 838 ----------- tests/spec_decode/test_ngram_worker.py | 221 --- tests/spec_decode/test_scorer.py | 116 -- tests/spec_decode/test_spec_decode_worker.py | 945 ------------ tests/spec_decode/test_utils.py | 150 -- tests/spec_decode/utils.py | 290 ---- tests/test_sequence.py | 1 - tests/v1/test_oracle.py | 6 - tools/mypy.sh | 1 - vllm/config.py | 61 +- vllm/engine/arg_utils.py | 28 +- vllm/engine/llm_engine.py | 8 - vllm/engine/metrics.py | 66 - vllm/engine/metrics_types.py | 12 +- vllm/engine/output_processor/multi_step.py | 5 - .../layers/rejection_sampler.py | 406 ----- vllm/model_executor/layers/sampler.py | 12 +- .../layers/spec_decode_base_sampler.py | 259 ---- .../layers/typical_acceptance_sampler.py | 166 --- vllm/model_executor/models/eagle.py | 261 ---- vllm/model_executor/models/registry.py | 5 +- vllm/platforms/cuda.py | 12 +- vllm/platforms/rocm.py | 11 +- vllm/sequence.py | 14 +- vllm/spec_decode/__init__.py | 0 vllm/spec_decode/batch_expansion.py | 506 ------- vllm/spec_decode/draft_model_runner.py | 349 ----- vllm/spec_decode/interfaces.py | 99 -- vllm/spec_decode/medusa_worker.py | 138 -- vllm/spec_decode/metrics.py | 213 --- vllm/spec_decode/mlp_speculator_worker.py | 94 -- vllm/spec_decode/mqa_scorer.py | 160 -- vllm/spec_decode/multi_step_worker.py | 423 ------ vllm/spec_decode/ngram_worker.py | 196 --- vllm/spec_decode/proposer_worker_base.py | 59 - .../spec_decode/smaller_tp_proposer_worker.py | 196 --- vllm/spec_decode/spec_decode_worker.py | 1326 ----------------- vllm/spec_decode/target_model_runner.py | 45 - vllm/spec_decode/top1_proposer.py | 275 ---- vllm/spec_decode/util.py | 277 ---- vllm/transformers_utils/configs/eagle.py | 40 +- vllm/worker/worker_base.py | 2 - 73 files changed, 191 insertions(+), 
14275 deletions(-) delete mode 100644 tests/samplers/test_rejection_sampler.py delete mode 100644 tests/samplers/test_typical_acceptance_sampler.py delete mode 100644 tests/spec_decode/__init__.py delete mode 100644 tests/spec_decode/conftest.py delete mode 100644 tests/spec_decode/e2e/__init__.py delete mode 100644 tests/spec_decode/e2e/conftest.py delete mode 100644 tests/spec_decode/e2e/test_compatibility.py delete mode 100644 tests/spec_decode/e2e/test_eagle_correctness.py delete mode 100644 tests/spec_decode/e2e/test_integration.py delete mode 100644 tests/spec_decode/e2e/test_integration_dist_tp2.py delete mode 100644 tests/spec_decode/e2e/test_integration_dist_tp4.py delete mode 100644 tests/spec_decode/e2e/test_logprobs.py delete mode 100644 tests/spec_decode/e2e/test_medusa_correctness.py delete mode 100644 tests/spec_decode/e2e/test_mlp_correctness.py delete mode 100644 tests/spec_decode/e2e/test_mtp_correctness.py delete mode 100644 tests/spec_decode/e2e/test_multistep_correctness.py delete mode 100644 tests/spec_decode/e2e/test_ngram_correctness.py delete mode 100644 tests/spec_decode/e2e/test_seed.py delete mode 100644 tests/spec_decode/test_batch_expansion.py delete mode 100644 tests/spec_decode/test_dynamic_spec_decode.py delete mode 100644 tests/spec_decode/test_memory_usage.py delete mode 100644 tests/spec_decode/test_metrics.py delete mode 100644 tests/spec_decode/test_multi_step_worker.py delete mode 100644 tests/spec_decode/test_ngram_worker.py delete mode 100644 tests/spec_decode/test_scorer.py delete mode 100644 tests/spec_decode/test_spec_decode_worker.py delete mode 100644 tests/spec_decode/test_utils.py delete mode 100644 tests/spec_decode/utils.py delete mode 100644 vllm/model_executor/layers/rejection_sampler.py delete mode 100644 vllm/model_executor/layers/spec_decode_base_sampler.py delete mode 100644 vllm/model_executor/layers/typical_acceptance_sampler.py delete mode 100644 vllm/model_executor/models/eagle.py delete mode 100644 vllm/spec_decode/__init__.py delete mode 100644 vllm/spec_decode/batch_expansion.py delete mode 100644 vllm/spec_decode/draft_model_runner.py delete mode 100644 vllm/spec_decode/interfaces.py delete mode 100644 vllm/spec_decode/medusa_worker.py delete mode 100644 vllm/spec_decode/metrics.py delete mode 100644 vllm/spec_decode/mlp_speculator_worker.py delete mode 100644 vllm/spec_decode/mqa_scorer.py delete mode 100644 vllm/spec_decode/multi_step_worker.py delete mode 100644 vllm/spec_decode/ngram_worker.py delete mode 100644 vllm/spec_decode/proposer_worker_base.py delete mode 100644 vllm/spec_decode/smaller_tp_proposer_worker.py delete mode 100644 vllm/spec_decode/spec_decode_worker.py delete mode 100644 vllm/spec_decode/target_model_runner.py delete mode 100644 vllm/spec_decode/top1_proposer.py delete mode 100644 vllm/spec_decode/util.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bbbcfb745d57..7f1848b4bfbc 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -159,7 +159,6 @@ steps: - tests/distributed/test_utils - tests/distributed/test_pynccl - tests/distributed/test_events - - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile/test_basic_correctness - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py @@ -182,7 +181,6 @@ steps: - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_events.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py # 
TODO: create a dedicated test section for multi-GPU example tests # when we have multiple distributed example tests - pushd ../examples/offline_inference @@ -330,17 +328,6 @@ steps: - pytest -v -s samplers - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers -- label: Speculative decoding tests # 40min - mirror_hardwares: [amdexperimental] - source_file_dependencies: - - vllm/spec_decode - - tests/spec_decode - - vllm/model_executor/models/eagle.py - commands: - - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py - - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - - label: LoRA Test %N # 15min each mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: @@ -726,7 +713,6 @@ steps: - pytest -v -s distributed/test_sequence_parallel.py # this test fails consistently. # TODO: investigate and fix - # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97f9e7dc1578..8c68bc8f02b6 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -43,7 +43,6 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/multimodal @DarkLight1337 @ywang96 /tests/prefix_caching @comaniac @KuntaiDu /tests/quantization @mgoin @robertgshaw2-redhat -/tests/spec_decode @njhill @LiuXiaoxuanPKU /tests/test_inputs.py @DarkLight1337 @ywang96 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm diff --git a/.github/mergify.yml b/.github/mergify.yml index fccce82d50d1..5c878ac02069 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -164,10 +164,7 @@ pull_request_rules: description: Automatically apply speculative-decoding label conditions: - or: - - files~=^vllm/spec_decode/ - files~=^vllm/v1/spec_decode/ - - files=vllm/model_executor/layers/spec_decode_base_sampler.py - - files~=^tests/spec_decode/ - files~=^tests/v1/spec_decode/ - files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py - files~=^vllm/model_executor/models/.*eagle.*\.py diff --git a/pyproject.toml b/pyproject.toml index 85a112ff51cf..0c8d2f82d1d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,6 @@ line-length = 80 "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] -"vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] # Python 3.8 typing - skip utils for ROCm "vllm/utils/__init__.py" = ["UP006", "UP035"] diff --git a/tests/core/test_serialization.py b/tests/core/test_serialization.py index 8281298d6634..ee9ac2129f2d 100644 --- a/tests/core/test_serialization.py +++ b/tests/core/test_serialization.py @@ -6,7 +6,7 @@ from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.sequence import ExecuteModelRequest -from ..spec_decode.utils import create_batch +from .utils import create_batch def test_msgspec_serialization(): diff --git a/tests/core/utils.py b/tests/core/utils.py index b746c1786464..033fffd2c4e2 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,15 +4,16 @@ import time from collections import 
defaultdict from collections.abc import Sequence as GenericSequence -from typing import Any, Optional +from itertools import count +from typing import Any, Optional, Union import torch -from vllm import SamplingParams from vllm.core.scheduler import Scheduler, SchedulerOutputs from vllm.inputs import EncoderDecoderInputs, embeds_inputs, token_inputs from vllm.lora.request import LoRARequest -from vllm.sequence import (Logprob, Sequence, SequenceGroup, +from vllm.sampling_params import SamplingParams +from vllm.sequence import (Logprob, Sequence, SequenceData, SequenceGroup, SequenceGroupMetadata) @@ -262,3 +263,130 @@ def last_schedule_ret( self, ) -> tuple[list[SequenceGroupMetadata], SchedulerOutputs, Any]: _, _, ret = self.call_history["schedule"][-1] return ret + + +def create_seq_group_metadata_from_prompts( + prompts: list[list[int]], + num_gpu_blocks: int, + block_size: int, + final_prompt_lens: list[int], + continuations: Optional[list[list[int]]] = None, + seq_ids: Optional[list[int]] = None, +) -> list[SequenceGroupMetadata]: + + if continuations is None: + continuations = [[] for _ in prompts] + + if seq_ids is None: + seq_ids = list(i for i, _ in enumerate(prompts)) + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = { + i: [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(final_len, block_size)) + ] + for i, final_len in enumerate(final_prompt_lens) + } + + seq_grou_metadata_list = [] + for i, (prompt_token_ids, + cont_token_ids) in enumerate(zip(prompts, continuations)): + data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) + data.update_num_computed_tokens( + len(prompt_token_ids) + len(cont_token_ids) - 1) + seq_data = {i: data} + seq_grou_metadata_list.append( + SequenceGroupMetadata( + request_id=str(i), + is_prompt=len(cont_token_ids) == 0, + seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations[i][:]}, + )) + return seq_grou_metadata_list + + +def create_chunked_seq_group_metadata_from_prompt( + prompt: list[int], + num_gpu_blocks: int, + chunk_size: int, + block_size: int, + seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: + + if seq_id is None: + seq_id = 0 + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(len(prompt), block_size)) + ] + + seq_group_metadata_list = [] + for i, idx in enumerate(range(0, len(prompt), chunk_size)): + chunk_ids = prompt[idx:idx + chunk_size] + data = SequenceData.from_seqs(prompt) + data.update_num_computed_tokens(idx) + seq_data = {i: data} + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + do_sample=idx + chunk_size >= len(prompt), # terminal chunk + seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations}, + token_chunk_size=len(chunk_ids))) + return seq_group_metadata_list + + +def create_batch(batch_size, + k, + prompt_len: Union[int, list[int]] = 10, + prev_output_token_len: int = 10, + seq_ids: Optional[list[int]] = None, + num_gpu_blocks: Optional[int] = None, + block_size: Optional[int] = None, + prefill_chunk_size: Optional[int] = None): + if block_size is None: + block_size = 8 + + if num_gpu_blocks is None: + num_gpu_blocks = 2048 // block_size + + iterator = count() + + if isinstance(prompt_len, int): + prompt_lens = [prompt_len for _ in range(batch_size)] + else: + prompt_lens = prompt_len + + prompts = [[next(iterator) 
for _ in range(p_len)] for p_len in prompt_lens] + + if prefill_chunk_size: + # Create a batch of chunked prompts. + if not seq_ids: + seq_ids = list(range(len(prompts))) + seq_group_metadata_list = [] + for p, sid in zip(prompts, seq_ids): + seq_group_metadata_list += \ + create_chunked_seq_group_metadata_from_prompt( + p, num_gpu_blocks, prefill_chunk_size, block_size, sid) + seq_group_metadata_list = seq_group_metadata_list[:batch_size] + prev_output_tokens = [] + else: + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_prompt_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, num_gpu_blocks, block_size, final_prompt_lens, + prev_output_tokens, seq_ids) + return seq_group_metadata_list, prompts, prev_output_tokens diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7bb5d8980d61..54dbb747de09 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -1,15 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import time - import pytest import ray from prometheus_client import REGISTRY import vllm.envs as envs from vllm import EngineArgs, LLMEngine -from vllm.distributed import cleanup_dist_env_and_memory from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.metrics import RayPrometheusStatLogger @@ -232,149 +229,6 @@ def test_engine_log_metrics_regression( assert_metrics(model, engine, disable_log_stats, len(example_prompts)) -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [10]) -def test_metric_spec_decode( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, -) -> None: - k = 5 - - with vllm_runner( - model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_config={ - "model": model, - "num_speculative_tokens": k, - }, - ) as vllm_model: - - # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] - stat_logger.local_interval = 0 - - # Note that the purpose of this test is to verify spec decode - # metrics instead of functional correctness, so the expected values - # are intended to be loose. - metric_name_to_expected_fn = { - "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, - "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, - "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, - "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, - } - - # Use one request to better inspect the metrics. 
- prompts = example_prompts[:1] - - _ = vllm_model.generate_greedy(prompts, max_tokens) - for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() - assert is_expected(metric_val), ( - f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [10]) -@pytest.mark.parametrize("log_interval", [1, 3, 5, 7]) -def test_metric_spec_decode_interval( - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - log_interval: int, -) -> None: - k = 5 - - engine_args = EngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - gpu_memory_utilization=0.4, - speculative_config={ - "model": model, - "num_speculative_tokens": k, - }, - enforce_eager=True, - ) - - engine = LLMEngine.from_engine_args(engine_args) - - try: - - engine.add_request( - "request-id-0", - example_prompts[0], - SamplingParams(max_tokens=max_tokens), - ) - - # set log internal - stat_logger = engine.stat_loggers['prometheus'] - stat_logger.local_interval = log_interval - - # prefill - engine.step() - - # wait for 5 seconds to ensure that spec decode metrics - # get triggered in first decode step - time.sleep(5) - - # first decode step should trigger async collection of metrics - engine.step() - - # wait one second to allow H2D transfer to finish - time.sleep(1) - - # second decode step should now be able to collect the spec - # decode stats and the request should also be finished - engine.step() - - # must have finisehd now - assert not engine.has_unfinished_requests() - - # wait to ensure logging occurs - time.sleep(log_interval) - - # force logging - engine.step() - - # Note that the purpose of this test is to verify spec decode - # metrics instead of functional correctness, so the expected values - # are intended to be loose. 
- metric_name_to_expected_fn = { - "gauge_spec_decode_draft_acceptance_rate": lambda v: 0 <= v <= 1, - "gauge_spec_decode_efficiency": lambda v: 0 <= v <= 1, - "counter_spec_decode_num_accepted_tokens": lambda v: 0 <= v <= k, - "counter_spec_decode_num_draft_tokens": lambda v: v == k, - "counter_spec_decode_num_emitted_tokens": - lambda v: 0 <= v <= k + 1, - } - - for metric_name, is_expected in metric_name_to_expected_fn.items(): - metric_val = getattr( - stat_logger.metrics, - metric_name).labels(**stat_logger.labels)._value.get() - assert is_expected(metric_val), ( - f"the value of metric {metric_name} ({metric_val}) " - "does not meet expectation") - - finally: - del engine - cleanup_dist_env_and_memory() - - def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool, num_requests: int) -> None: if disable_log_stats: diff --git a/tests/models/registry.py b/tests/models/registry.py index 56ae501021f4..3ffa7f81a1ad 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -457,12 +457,12 @@ def check_available_online( _SPECULATIVE_DECODING_EXAMPLE_MODELS = { - "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m", - speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501 "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 - "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", - speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 + # Temporarily disabled. + # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. + # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", + # speculative_model="ibm-ai-platform/llama-160m-accelerator"), # noqa: E501 "DeepSeekMTPModel": _HfExamplesInfo("luccafong/deepseek_mtp_main_random", speculative_model="luccafong/deepseek_mtp_draft_random", # noqa: E501 trust_remote_code=True), diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 01b2260abe8c..1ce90070c5c8 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -72,11 +72,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce): @create_new_process_for_each_test() -@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [ - ("MLPSpeculatorPreTrainedModel", False, False), - ("DeepseekV2ForCausalLM", True, False), - ("Qwen2VLForConditionalGeneration", True, True), -]) +@pytest.mark.parametrize( + "model_arch,is_pp,init_cuda", + [ + # TODO(woosuk): Re-enable this once the MLP Speculator is supported + # in V1. 
+ # ("MLPSpeculatorPreTrainedModel", False, False), + ("DeepseekV2ForCausalLM", True, False), + ("Qwen2VLForConditionalGeneration", True, True), + ]) def test_registry_is_pp(model_arch, is_pp, init_cuda): assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py deleted file mode 100644 index 3b93c64113da..000000000000 --- a/tests/samplers/test_rejection_sampler.py +++ /dev/null @@ -1,577 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for rejection sampling.""" - -import pytest -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.utils import set_random_seed - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -CUDA_DEVICES = [ - f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) -] - - -def mock_causal_accepted_tensor( - k: int, last_accepted_indices: torch.Tensor) -> torch.Tensor: - """Generate an "accepted" tensor which should yield causally-accepted tokens - up to last accepted indices. - - Tokens after last_accepted_indices+1 may also be accepted, although they - will not be causally accepted. - """ - batch_size = last_accepted_indices.shape[0] - - accepted = (torch.arange(k).expand(batch_size, k) - <= last_accepted_indices.unsqueeze(-1).broadcast_to( - batch_size, k)) - - # Sprinkle accepted values after the contiguous initial accepted values. - # This replicates the behavior of rejection sampling, which may "accept" - # a token that cannot be accepted because of causality. - sprinkle_candidates = (torch.arange(k).expand( - batch_size, - k) > last_accepted_indices.unsqueeze(-1).broadcast_to(batch_size, k) + - 1) - sprinkle = torch.rand(batch_size, k) > 0.5 - accepted[sprinkle_candidates] = sprinkle[sprinkle_candidates] - return accepted - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize( - "which_tokens_accepted", - ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_correct_output_format(which_tokens_accepted: str, seed: int, - device: str, use_flashinfer: bool): - """Verify the output has correct format given predetermined accepted matrix. 
- """ - set_random_seed(seed) - torch.set_default_device(device) - - batch_size = 10 - k = 5 - vocab_size = 3000 - - if which_tokens_accepted == "all_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -1 + k * torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "no_tokens_accepted": - accepted = mock_causal_accepted_tensor( - k, -torch.ones((batch_size, ), dtype=torch.long)) - elif which_tokens_accepted == "some_tokens_accepted": - last_accepted_indices = torch.randint(low=-1, - high=k, - size=(batch_size, )) - accepted = mock_causal_accepted_tensor(k, last_accepted_indices) - else: - raise AssertionError() - - recovered_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - output_token_ids = rejection_sampler._create_output( # pylint: disable=protected-access - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - - expected_bonus_token_ids = bonus_token_ids.clone() - - if which_tokens_accepted == "all_tokens_accepted": - # Expect all tokens to be equal to draft tokens. - assert torch.equal(output_token_ids[:, :-1], draft_token_ids) - - # Expect all bonus tokens to be included. - assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids) - elif which_tokens_accepted == "no_tokens_accepted": - # Expect first token to be equal to recovered tokens. - assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0]) - - # Expect everything else to be -1. - assert torch.equal(output_token_ids[:, 1:], - torch.ones_like(output_token_ids[:, 1:]) * -1) - elif which_tokens_accepted == "some_tokens_accepted": - recovered_plus_bonus = torch.cat( - (recovered_token_ids, expected_bonus_token_ids), dim=-1) - # Assert first rejected token is a recovered token or bonus token. - assert torch.equal( - recovered_plus_bonus[torch.arange(0, batch_size), - last_accepted_indices + 1], - output_token_ids[torch.arange(0, batch_size), - last_accepted_indices + 1]) - - # Assert every subsequent token is -1. 
- subsequent_mask = torch.arange(0, k + 1).expand( - batch_size, k + 1) >= (last_accepted_indices + 2).unsqueeze(-1) - assert torch.all(output_token_ids[subsequent_mask] == -1) - - -@pytest.mark.parametrize("k", list(range(1, 6))) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, - device: str, use_flashinfer: bool): - torch.set_default_device(device) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("frac_seeded", [0.0, 0.25, 0.5, 1.0]) -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) -@pytest.mark.parametrize("n_rep", [100]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -# @pytest.mark.parametrize("use_flashinfer", [True, False]) -# Not testing FlashInfer now, since 0.2.3 API removed the ability -# to pass in uniform samples. -@pytest.mark.parametrize("use_flashinfer", [False]) -@torch.inference_mode() -def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, - frac_seeded: float, n_rep: int, device: str, - use_flashinfer: bool): - torch.set_default_device(device) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - seeded_mask = torch.rand(batch_size, dtype=torch.float32) <= frac_seeded - - results = [] - for _ in range(n_rep): - seeded_seqs = { - i: torch.Generator(device=device).manual_seed(i) - for i in range(batch_size) if seeded_mask[i] - } - results.append( - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, seeded_seqs)) - - for i in range(batch_size): - if seeded_mask[i]: - for j in range(1, n_rep): - assert torch.equal(results[j][i], results[0][i]) - - -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [3, 8, 32, 128]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -# @pytest.mark.parametrize("use_flashinfer", [True, False]) -# Not testing FlashInfer now, since 0.2.3 API removed the ability -# to pass in uniform samples. 
-@pytest.mark.parametrize("use_flashinfer", [False]) -@torch.inference_mode() -def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int, - device: str, use_flashinfer: bool): - torch.set_default_device(device) - set_random_seed(0) - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - single_batches = [] - for i in range(batch_size): - single_batches.append((draft_probs[i].clone().unsqueeze(0), - draft_token_ids[i].clone().unsqueeze(0), - target_probs[i].clone().unsqueeze(0), - bonus_token_ids[i].clone().unsqueeze(0), - draft_token_ids[i].clone().unsqueeze(0))) - - set_random_seed(0) - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - - results = [] - seeded_seqs = { - i: torch.Generator(device=device).manual_seed(i) - for i in range(1, batch_size) # 0 is seed None - } - batch_result = rejection_sampler(target_probs.clone(), - bonus_token_ids.clone(), - draft_probs.clone(), - draft_token_ids.clone(), seeded_seqs) - - set_random_seed(0) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - for i in range(batch_size): - request_seeded_seqs = { - 0: torch.Generator(device=device).manual_seed(i) - } if seeded_seqs.get(i) is not None else None - (draft_probs, draft_token_ids, target_probs, bonus_token_ids, - draft_token_ids) = single_batches[i] - results.append( - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, request_seeded_seqs)) - for i in range(batch_size): - assert torch.equal(batch_result[i], results[i].squeeze(0)) - - -@pytest.mark.parametrize("k", [1, 3, 6]) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_compare_nonflashinfer_backend(k: int, vocab_size: int, - batch_size: int, device: str): - """ - Test the flashinfer and nonflashinfer backend generate - the same output metrics. - """ - - pytest.skip("Not testing FlashInfer now, since 0.2.3 API removed " - "the ability to pass in uniform samples.") - - torch.set_default_device(device) - torch.manual_seed(0) - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - num_accepted_tokens = [] - num_emitted_tokens = [] - num_draft_tokens = [] - - def get_seeded_seqs(): - return { - i: torch.Generator(device=device).manual_seed(i) - for i in range(batch_size) - } - - for use_flashinfer in [True, False]: - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) - rejection_sampler.init_gpu_tensors(device=device) - # We use seeded sequences to ensure the same tokens are accepted - # for both flashinfer and nonflashinfer backends. 
- seeded_seqs = get_seeded_seqs() - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids, seeded_seqs) - num_accepted_tokens.append(rejection_sampler.num_accepted_tokens) - num_emitted_tokens.append(rejection_sampler.num_emitted_tokens) - num_draft_tokens.append(rejection_sampler.num_draft_tokens) - - assert num_accepted_tokens[0] == num_accepted_tokens[1] - assert num_emitted_tokens[0] == num_emitted_tokens[1] - assert num_draft_tokens[0] == num_draft_tokens[1] - - -@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) -@pytest.mark.parametrize("which_token_ids", - ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str, device: str, - use_flashinfer: bool): - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - - rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer, - strict_mode=True) - rejection_sampler.init_gpu_tensors(device=device) - - draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - target_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - - oob_token_ids = None - if which_token_ids == "bonus_token_ids": - oob_token_ids = bonus_token_ids - elif which_token_ids == "draft_token_ids": - oob_token_ids = draft_token_ids - else: - raise AssertionError() - - if above_or_below_vocab_range == "above": - rogue_token_id = vocab_size + 1 - elif above_or_below_vocab_range == "below": - rogue_token_id = -1 - else: - raise AssertionError() - - oob_token_ids[0][0] = rogue_token_id - - with pytest.raises(AssertionError): - rejection_sampler(target_probs, bonus_token_ids, draft_probs, - draft_token_ids) - - -@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) -@pytest.mark.parametrize("seed", list(range(5))) -@pytest.mark.parametrize("use_flashinfer", [True, False]) -@torch.inference_mode() -def test_rejection_sampling_approximates_target_distribution( - seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool): - """Verify rejection sampling approximates target distribution, - despite sampling from a potentially distinct draft distribution. - - This is done by first creating a random target probability - distribution and a random draft probability distribution. We then - sample token ids from the rejection sampler using these draft - and target distributions. The samples are used to estimate - the output probability distribution, which we expect to approximate - the target distribution. - - A basic distance metric is used to determine similarity between - distributions. - - We expect that as we increase the number of samples, - the distance between the observed distribution and the target - distribution decreases. To measure this, we compare the distance - of the observed distribution against both the target distribution - and a uniform random distribution. We expect the distance between - the observed distribution and the target distribution to improve - much more than the distance improvement between the observed - distribution and the random distribution. 
- - When draft_and_target_probs_equal=True, the draft and target - probabilities are exactly equal. Rejection sampling should - still work without any NaNs or exceptions. - """ - torch.set_default_device("cpu") - set_random_seed(seed) - helper = _CorrectnessTestHelper( - vocab_size=10, - rejection_sampler=RejectionSampler(use_flashinfer=use_flashinfer), - ) - - draft_probs, target_probs, reference_probs = helper.generate_probs_for_test( - draft_and_target_probs_equal) - - sample_sizes = [10, 100, 1_000, 10_000, 100_000] - distance_wrt_reference: list[float] = [] - distance_wrt_target: list[float] = [] - - for num_samples in sample_sizes: - (reference_vs_rejsample_dist, - target_vs_rejsample_dist) = helper.run_and_compare_distributions( - draft_probs, - target_probs, - reference_probs, - num_samples, - ) - - distance_wrt_reference.append(reference_vs_rejsample_dist) - distance_wrt_target.append(target_vs_rejsample_dist) - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - print(f"{num_samples=} {target_vs_rejsample_dist=:.05f} " - f"{reference_vs_rejsample_dist=:.05f}") - print(f"{num_samples=} {relative_change_in_distance_wrt_target=:.02f} " - f"{relative_change_in_distance_wrt_reference=:.02f}") - - relative_change_in_distance_wrt_target = get_ratio_first_to_last( - distance_wrt_target) - relative_change_in_distance_wrt_reference = get_ratio_first_to_last( - distance_wrt_reference) - - expected_improvement_multiplier = 20 - assert (relative_change_in_distance_wrt_target - > relative_change_in_distance_wrt_reference * - expected_improvement_multiplier) - - -def get_ratio_first_to_last(elements: list[float]) -> float: - return elements[0] / elements[-1] - - -class _CorrectnessTestHelper: - """Class that packages together logic required for the unit-level - rejection sampling correctness test. - """ - - def __init__(self, vocab_size: int, rejection_sampler: RejectionSampler): - self.rejection_sampler = rejection_sampler - self.vocab_size = vocab_size - self.vocab_range = (0, vocab_size) - - self.rejection_sampler.init_gpu_tensors(device=0) - - # Keep test simple, use k=1 - self.k = 1 - - # Bonus tokens not used, but rejection sampler requires - # correct shape. - self.num_bonus_tokens = 1 - - def generate_probs_for_test( - self, draft_and_target_probs_equal: bool - ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - draft_probs, target_probs = (F.softmax( - torch.rand(self.vocab_size, dtype=torch.float32), - dim=-1, - ) for _ in range(2)) - - num_reference_probs = 100 - reference_probs = F.softmax( - torch.rand(num_reference_probs, - self.vocab_size, - dtype=torch.float32), - dim=-1, - ) - - if draft_and_target_probs_equal: - target_probs = draft_probs.clone() - - return draft_probs, target_probs, reference_probs - - def run_and_compare_distributions(self, draft_probs: torch.Tensor, - target_probs: torch.Tensor, - reference_probs: torch.Tensor, - num_samples: int) -> tuple[float, float]: - # Sample using rejection sampling. - rej_sample_probs = self._estimate_rejection_sampling_pdf( - draft_probs, target_probs, num_samples) - - # Average distance from reference probs. 
- reference_vs_rejsample_dist = torch.dist( - reference_probs, - rej_sample_probs).item() / reference_probs.shape[0] - target_vs_rejsample_dist = torch.dist(target_probs, - rej_sample_probs).item() - - return reference_vs_rejsample_dist, target_vs_rejsample_dist - - def _estimate_rejection_sampling_pdf( - self, - draft_probs: torch.Tensor, - target_probs: torch.Tensor, - num_samples: int, - ) -> torch.Tensor: - # Repeat draft probs num_samples times. - draft_probs = draft_probs.reshape(1, self.k, self.vocab_size).repeat( - num_samples, 1, 1) - - # Repeat target probs num_samples * (k + 1) times. - # Rejection sampler requires bonus token probs, but they aren't used. - target_probs = target_probs.reshape(1, 1, self.vocab_size).repeat( - num_samples, self.k + 1, 1) - - # Randomly sample draft token ids from draft probs. - draft_token_ids = torch.multinomial(draft_probs[:, 0, :], - num_samples=1, - replacement=True).reshape( - num_samples, self.k) - - # Bonus tokens not used but required. - bonus_token_ids = torch.zeros((1, self.num_bonus_tokens), - dtype=torch.int64, - device="cuda").repeat(num_samples, 1) - - # Get output tokens via rejection sampling. - output_token_ids = self.rejection_sampler(target_probs.to("cuda"), - bonus_token_ids.to("cuda"), - draft_probs.to("cuda"), - draft_token_ids.to("cuda")) - - # Remove bonus tokens - output_token_ids = output_token_ids[:, :-1].flatten() - - # Estimate probability density function - hist = torch.histogram(output_token_ids.to(dtype=torch.float, - device="cpu"), - bins=self.vocab_size, - range=self.vocab_range, - density=True) - - return hist.hist diff --git a/tests/samplers/test_typical_acceptance_sampler.py b/tests/samplers/test_typical_acceptance_sampler.py deleted file mode 100644 index 119841470bfb..000000000000 --- a/tests/samplers/test_typical_acceptance_sampler.py +++ /dev/null @@ -1,480 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests for rejection sampling.""" - -import pytest -import torch - -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.model_executor.utils import set_random_seed - -CUDA_DEVICES = [f"cuda:{i}" for i in range(1)] - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - This file tests V0 internals, so set VLLM_USE_V1=0. - """ - monkeypatch.setenv('VLLM_USE_V1', '0') - - -def get_zero_temperature_prob_dist(batch_size, k, vocab_size): - """ - Generates a fake temperature zero probability distribution. - Returns: - 1. A fake temperature zero probability distribution of shape - [batch_size, k, vocab_size] - 2. Tensor of shape [batch_size, k] containing the token ids - of the probability 1.0 tokens at each position. - """ - # Simulate temperature 0 probability distribution for target probabilities - # and create target probabilities such that only 1 token id has - # probability 1.0 - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - probs = torch.rand(batch_size, k, vocab_size) - _, zero_temperature_token_ids = torch.max(probs, dim=-1) - # set the probability of the tokens with ids in zero_temperature_token_ids - # to 1 and the rest to 0. 
- target_probs = torch.zeros_like(probs).scatter_( - -1, zero_temperature_token_ids.unsqueeze(-1), 1.0) - return target_probs, zero_temperature_token_ids - - -def get_draft_token_ids(batch_size: int, k: int, vocab_size: int, - token_ids_to_exclude: torch.Tensor): - """ - Returns a tensor of shape [batch_size, k] of fake draft token ids - drawn randomly from a vocab of size vocab_size. We however ensure - that token_ids from token_ids_to_exclude are excluded at the - corresponding positions. - """ - draft_token_ids = torch.empty(batch_size, k, dtype=torch.long) - for i in range(batch_size): - for j in range(k): - # Generate a random token ID excluding token_ids_to_exclude[i, j] - while True: - token_id = torch.randint(0, vocab_size, (1, )).item() - if token_id != token_ids_to_exclude[i, j]: - draft_token_ids[i, j] = token_id - break - return draft_token_ids - - -def get_acceptance_sampler( - posterior_threshold: float = 0.03, - posterior_alpha: float = 0.9, - strict_mode: bool = False, -) -> TypicalAcceptanceSampler: - """ - Initializes and returns a TypicalAcceptanceSampler. - """ - return TypicalAcceptanceSampler(posterior_threshold, posterior_alpha, - strict_mode) - - -@pytest.mark.parametrize("k", list(range(1, 6))) -@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) -@pytest.mark.parametrize("batch_size", list(range(1, 32))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, - device: str): - """ - Tests that the TypicalAcceptancSampler forward succeeds for - different combinations of k, vocab_size, batch_size and num devices. - """ - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler() - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - # Verify that sampling succeeds for all cases. - typical_acceptance_sampler(target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - - -@pytest.mark.parametrize("above_or_below_vocab_range", ["above", "below"]) -@pytest.mark.parametrize("which_token_ids", - ["bonus_token_ids", "draft_token_ids"]) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_raises_when_vocab_oob(above_or_below_vocab_range: str, - which_token_ids: str, device: str): - """ - Tests that we throw an exception of the token ids fall outside - the bound of the provided vocabulary. - """ - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - # Verify that appropriate exceptions are thrown for out - # of bound vocabs. 
- oob_token_ids = None - if which_token_ids == "bonus_token_ids": - oob_token_ids = bonus_token_ids - elif which_token_ids == "draft_token_ids": - oob_token_ids = draft_token_ids - else: - raise AssertionError() - - if above_or_below_vocab_range == "above": - rogue_token_id = vocab_size + 1 - elif above_or_below_vocab_range == "below": - rogue_token_id = -1 - else: - raise AssertionError() - - oob_token_ids[0][0] = rogue_token_id - - with pytest.raises(AssertionError): - typical_acceptance_sampler(target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_uniform_target_distribution_accepts_all_tokens( - seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a uniform target probability - distribution. - - This test verifies that when provided with a uniform target probability - distribution, the TypicalAcceptanceSampler accepts all draft tokens. The - entropy of the uniform target distribution being high should lead to all - draft tokens being accepted. - """ - set_random_seed(seed) - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_with_bonus_probs = torch.rand(batch_size, - k + 1, - vocab_size, - dtype=torch.float32) - draft_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - # We are using a uniform target probability distribution. - # For a uniform distribution the entropy is very high and it - # should lead to all draft tokens being accepted. Verify that. - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids.squeeze()) - - assert torch.all(output_token_ids[:, :k] == draft_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_temperature_zero_target_distribution(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a zero-temperature target - probability distribution. - - This test verifies that when using a zero-temperature target probability - distribution, where only one token has a probability of 1.0, the - TypicalAcceptanceSampler correctly rejects all draft tokens that do not - match this probability. Additionally, it ensures that when all draft - tokens are rejected, the sampler falls back to greedy sampling to select a - single token from the target distribution. 
- """ - set_random_seed(seed) - k = 3 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # Simulate temperature 0 probability distribution for target probabilities - # and create target probabilities such that only 1 token id has - # probability 1.0 - target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - # Populate draft_token_ids such that they exclude the token_ids - # with probability = 1.0 - draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, - zero_temperature_token_ids) - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - # The target probaility distribution is a temperature zero distribution - # with zero entropy. Since our draft token ids don't match the probability - # 1.0 tokens in the target distribution we will reject all of them and - # fallback to the greedy sampling for selecting 1 token for each sequence. - # Verify the same. - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, -1] == -1) - assert torch.all(output_token_ids[:, 0] == zero_temperature_token_ids[:, - 0]) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_mixed_target_distribution(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler with a mixed target probability - distribution. - - This test ensures that the TypicalAcceptanceSampler handles a mixed - target probability distribution correctly. Specifically, it uses a - zero-temperature distribution for some sequences and a uniform - distribution for others. The test verifies that: - - - For sequences with a zero-temperature distribution, only the token - with a probability of 1.0 is accepted, and all other tokens are rejected. - - For sequences with a uniform distribution, all draft tokens are - accepted. - """ - set_random_seed(seed) - k = 3 - batch_size = 4 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # For sequences 0 and 2 set the distribution to a temperature - # zero distribution. For sequences 1 and 3 set it to a uniform - # distribution. 
- target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - target_probs = target_with_bonus_probs[:, :-1] - draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size, - zero_temperature_token_ids) - uniform_probs = torch.rand(2, k, vocab_size, dtype=torch.float32) - target_probs[[1, 3]] = uniform_probs - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - # verify the shape of output_token_ids - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - # For sequences 0 and 2 verify that only 1 token is accepted - # which is the token with probability 1.0 in the target distribution - # at position 0. - assert torch.all(output_token_ids[[0, 2], 1:] == -1) - assert (torch.all(output_token_ids[[0, 2], - 0] == zero_temperature_token_ids[[0, 2], - 0])) - # For sequences 1 and 3 verify that all tokens are accepted since the - # target probability distribution is uniform. In addition verify that - # we also accept the bonus tokens. - assert torch.all( - output_token_ids[[1, 3], :-1] == draft_token_ids[[1, 3], :]) - assert torch.all(output_token_ids[[1, 3], -1] != -1) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_accept_tokens_partially(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler's behavior when only a subset of draft - tokens should be accepted. - - This test verifies that the TypicalAcceptanceSampler correctly accepts or - rejects draft tokens based on a zero-temperature target probability - distribution. Specifically, it ensures that: - - - When all draft tokens match tokens with a probability of 1.0 in the - target distribution, all draft tokens are accepted. - - When only some draft tokens match tokens with a probability of 1.0 in - the target distribution, only those matching tokens are accepted, and the - rest are rejected. - """ - set_random_seed(seed) - k = 5 - batch_size = 1 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - # Create a temperature zero target probability distribution and ensure - # all draft token ids correspond to the tokens with 1.0 probability. - # Verify that all of them are accepted. - target_with_bonus_probs, zero_temperature_token_ids = \ - get_zero_temperature_prob_dist(batch_size, k + 1, vocab_size) - zero_temperature_token_ids = zero_temperature_token_ids[:, :-1] - draft_token_ids = zero_temperature_token_ids - bonus_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, 1), - dtype=torch.int64) - output_token_ids = typical_acceptance_sampler( - target_with_bonus_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids) - # Next only keep the first 2 draft tokens same as the zero temperature - # tokens. For the remaining 3 choose some other tokens. 
- # In the response we will expect the first 2 tokens to be the same as the
- # draft tokens and the recovered token and rest as -1
- draft_token_ids_to_replace = get_draft_token_ids(
- batch_size, k, vocab_size, zero_temperature_token_ids)
- draft_token_ids = torch.cat(
- (draft_token_ids[:, :2], draft_token_ids_to_replace[:, -3:]), dim=1)
- output_token_ids = typical_acceptance_sampler(
- target_with_bonus_probs,
- bonus_token_ids,
- draft_probs=None,
- draft_token_ids=draft_token_ids)
- assert output_token_ids.shape[0] == batch_size
- assert output_token_ids.shape[1] == (k + 1)
- assert torch.all(output_token_ids[:, :2] == draft_token_ids[:, :2])
- assert torch.all(
- output_token_ids[:, 2] == target_with_bonus_probs.argmax(-1)[:, 2])
- assert torch.all(output_token_ids[:, -3:] == -1)
-
-
-@pytest.mark.parametrize("seed", list(range(1)))
-@pytest.mark.parametrize("device", CUDA_DEVICES)
-@torch.inference_mode()
-def test_accept_tokens_set_non_default_posteriors(seed: int, device: str):
- """
- Test the TypicalAcceptanceSampler with custom posterior thresholds and
- alpha values. This test verifies that by modifying the posterior
- thresholds and alpha values we can change the acceptance behavior of the
- sampler.
- """
- set_random_seed(seed)
- k = 5
- batch_size = 1
- vocab_size = 30_000
- torch.set_default_device(device)
- typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True)
- typical_acceptance_sampler.init_gpu_tensors(device=device)
- # Simulate temperature 0 probability distribution for target
- # probabilities and create target probabilities such that only 1 token
- # id has probability 1.0 and others have a very low probability of
- # 0.00001. Populate draft_token_ids such that they exclude the token_ids
- # with probability = 1.0. Without any changes to the posterior thresholds
- # none of the draft tokens are accepted.
- target_probs, zero_temperature_token_ids = get_zero_temperature_prob_dist(
- batch_size, k + 1, vocab_size)
- zero_temperature_token_ids = zero_temperature_token_ids[:, :-1]
- target_probs[target_probs == 0] = 0.00001
- draft_token_ids = get_draft_token_ids(batch_size, k, vocab_size,
- zero_temperature_token_ids)
- bonus_token_ids = torch.randint(low=0,
- high=vocab_size,
- size=(batch_size, 1),
- dtype=torch.int64)
- output_token_ids = typical_acceptance_sampler(
- target_probs,
- bonus_token_ids,
- draft_probs=None,
- draft_token_ids=draft_token_ids)
- assert output_token_ids.shape[0] == batch_size
- assert output_token_ids.shape[1] == (k + 1)
- assert torch.all(output_token_ids[:, 1:-1] == -1)
-
- # Change the posterior threshold values to 0.0 so that we will
- # now accept even draft tokens with very low probability in the
- # target distribution. Simulate and verify the same.
- typical_acceptance_sampler = TypicalAcceptanceSampler( - strict_mode=True, posterior_threshold=0.0, posterior_alpha=0.0) - typical_acceptance_sampler.init_gpu_tensors(device=device) - output_token_ids = typical_acceptance_sampler( - target_probs, - bonus_token_ids, - draft_probs=None, - draft_token_ids=draft_token_ids) - assert output_token_ids.shape[0] == batch_size - assert output_token_ids.shape[1] == (k + 1) - assert torch.all(output_token_ids[:, 0:-1] == draft_token_ids) - assert torch.all(output_token_ids[:, -1] == bonus_token_ids) - - -@pytest.mark.parametrize("seed", list(range(10))) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_get_recovered_token_ids(seed: int, device: str): - """ - Test the TypicalAcceptanceSampler's method for generating - replacement token IDs. - - This test verifies that the `_get_recovered_token_ids` method of the - TypicalAcceptanceSampler correctly identifies the token IDs to be used - as recovered token IDs based on the target probability distribution. - Specifically, it ensures that the method correctly identifies the - tokens with the highest probability for each sequence in the batch. - """ - set_random_seed(seed) - k = 10 - batch_size = 5 - vocab_size = 30_000 - torch.set_default_device(device) - typical_acceptance_sampler = get_acceptance_sampler(strict_mode=True) - typical_acceptance_sampler.init_gpu_tensors(device=device) - target_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32) - expected_replacement_tokens = torch.argmax(target_probs, dim=-1) - actual_replacement_tokens = ( - typical_acceptance_sampler._get_recovered_token_ids(target_probs)) - assert torch.all(expected_replacement_tokens == actual_replacement_tokens) diff --git a/tests/spec_decode/__init__.py b/tests/spec_decode/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/spec_decode/conftest.py b/tests/spec_decode/conftest.py deleted file mode 100644 index 375b248ebeda..000000000000 --- a/tests/spec_decode/conftest.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import pytest - - -@pytest.fixture(scope="function", autouse=True) -def use_v0_only(monkeypatch): - """ - Since this module is V0 only, set VLLM_USE_V1=0 for - all tests in the module. 
- """ - monkeypatch.setenv('VLLM_USE_V1', '0') diff --git a/tests/spec_decode/e2e/__init__.py b/tests/spec_decode/e2e/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py deleted file mode 100644 index f3fe9db3f79e..000000000000 --- a/tests/spec_decode/e2e/conftest.py +++ /dev/null @@ -1,307 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Sequence -from itertools import cycle -from typing import Optional, Union - -import pytest -import torch - -from vllm import LLM, SamplingParams -from vllm.distributed import cleanup_dist_env_and_memory -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import PromptLogprobs, SampleLogprobs - -from ...models.utils import (TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs, - check_logprobs_close, check_outputs_equal) -from ...utils import RemoteOpenAIServer - -PROMPTS = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - "San Francisco is know for its", - "Facebook was created in 2004 by", - "Curious George is a", - "Python 3.11 brings improvements to its", -] - - -@pytest.fixture -def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, - test_llm_kwargs, seed): - - def generate(): - kwargs = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - llm = LLM(**kwargs) - - if seed is not None: - set_random_seed(seed) - - yield llm - - del llm - cleanup_dist_env_and_memory() - - return generate - - -def maybe_assert_ngram_worker(llm): - # Verify the proposer worker is ngram if ngram is specified. - if (llm.llm_engine.speculative_config is not None - and llm.llm_engine.speculative_config.method == "ngram"): - from vllm.spec_decode.ngram_worker import NGramWorker - assert isinstance( - llm.llm_engine.model_executor.driver_worker.proposer_worker, - NGramWorker) - - -def get_output_from_llm_generator( - llm_generator, prompts, - sampling_params) -> tuple[list[str], list[list[int]], float]: - tokens: list[str] = [] - token_ids: list[list[int]] = [] - acceptance_rate: float = -1.0 - for llm in llm_generator(): - maybe_assert_ngram_worker(llm) - - outputs = llm.generate(prompts, sampling_params, use_tqdm=True) - - token_ids = [output.outputs[0].token_ids for output in outputs] - tokens = [output.outputs[0].text for output in outputs] - - # Fetch acceptance rate if logging is enabled. - if stat_loggers := getattr(llm.llm_engine, "stat_loggers", None): - stat_logger = stat_loggers["prometheus"] - acceptance_rate = (stat_logger.metrics. - gauge_spec_decode_draft_acceptance_rate.labels( - **stat_logger.labels)._value.get()) - del llm - - return tokens, token_ids, acceptance_rate - - -def check_logprobs_correctness( - spec_outputs: Sequence[Union[TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs]], - baseline_outputs: Sequence[Union[TokensTextLogprobs, - TokensTextLogprobsPromptLogprobs]], - disable_logprobs: bool = False, -): - """Compare sampled and prompt logprobs between baseline and spec decoding - """ - if not disable_logprobs: - return check_logprobs_close( - outputs_0_lst=baseline_outputs, - outputs_1_lst=spec_outputs, - name_0="org", - name_1="sd", - ) - - # Check correctness when disable_logprobs == True - for spec_output, baseline_output in zip(spec_outputs, baseline_outputs): - # Check generated token logprobs. 
- spec_logprobs = spec_output[2] - baseline_logprobs = baseline_output[2] - _check_logprobs_when_output_disabled(spec_logprobs, - baseline_logprobs, - is_prompt_logprobs=False) - - # Check prompt logprobs too, if they exist - if len(baseline_output) == 4: - assert len(spec_output) == 4 - spec_prompt_logprobs = spec_output[3] - baseline_prompt_logprobs = baseline_output[3] - _check_logprobs_when_output_disabled(spec_prompt_logprobs, - baseline_prompt_logprobs, - is_prompt_logprobs=True) - - -def _check_logprobs_when_output_disabled( - spec_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs], - baseline_logprobs: Union[Optional[PromptLogprobs], SampleLogprobs], - is_prompt_logprobs: bool = False, -): - # Prompt logprobs are optional - if is_prompt_logprobs and baseline_logprobs is None: - assert spec_logprobs is None - return - - assert spec_logprobs is not None - assert baseline_logprobs is not None - assert len(spec_logprobs) == len(baseline_logprobs) - - # For each generated position of the sequence. - for pos, (spec_pos_logprobs, baseline_pos_logprobs) in enumerate( - zip(spec_logprobs, baseline_logprobs)): - - # First prompt logprob is expected to be None - if is_prompt_logprobs and baseline_pos_logprobs is None: - assert spec_pos_logprobs is None - assert pos == 0 - continue - - assert spec_pos_logprobs is not None - assert baseline_pos_logprobs is not None - - # When disabled, the 1 logprob is returned with dummy values for the - # score and rank, but the token id should match the baseline model - assert len(spec_pos_logprobs) == 1 - (spec_pos_logprob_token_id, - spec_pos_logprob) = next(iter(spec_pos_logprobs.items())) - assert spec_pos_logprob.rank == -1 - assert spec_pos_logprob.logprob == 0.0 - if isinstance(spec_pos_logprob_token_id, torch.Tensor): - spec_pos_logprob_token_id = spec_pos_logprob_token_id.item() - assert spec_pos_logprob_token_id in baseline_pos_logprobs - - -def run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - max_output_len: int, - seed: Optional[int] = 0, - temperature: float = 0.0, - disable_seed: bool = False, - ignore_eos: bool = True, - ensure_all_accepted: bool = False, - expected_acceptance_rate: Optional[float] = None, - logprobs: Optional[int] = None, - prompt_logprobs: Optional[int] = None, - disable_logprobs: bool = False): - - org_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **baseline_llm_kwargs, - } - - sd_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - - if disable_seed: - seed = None - - sampling_params = SamplingParams(temperature=temperature, - max_tokens=max_output_len, - seed=seed, - ignore_eos=ignore_eos, - logprobs=logprobs, - prompt_logprobs=prompt_logprobs) - - with vllm_runner(**org_args) as vllm_model: - org_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - with vllm_runner(**sd_args) as vllm_model: - if ensure_all_accepted or expected_acceptance_rate is not None: - # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers[ - 'prometheus'] - stat_logger.local_interval = -100 - - sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - if ensure_all_accepted or expected_acceptance_rate is not None: - acceptance_rate = (stat_logger.metrics. 
- gauge_spec_decode_draft_acceptance_rate.labels( - **stat_logger.labels)._value.get()) - - if ensure_all_accepted: - assert True - # FIXME: ci fails to log acceptance rate. - # It works locally. - # assert acceptance_rate == 1.0 - - if expected_acceptance_rate is not None: - assert acceptance_rate >= expected_acceptance_rate - 1e-2 - - # Only pass token entries, not the logprobs - check_outputs_equal(outputs_0_lst=[out[0:2] for out in org_outputs], - outputs_1_lst=[out[0:2] for out in sd_outputs], - name_0="org", - name_1="sd") - - # Check logprobs if requested - if logprobs is not None or prompt_logprobs is not None: - check_logprobs_correctness(sd_outputs, org_outputs, disable_logprobs) - - -def run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - max_output_len: int, - seed: int = 0, - temperature: float = 0.0, - logprobs: Optional[int] = None): - """Helper method that compares the outputs of both the baseline LLM and - the test LLM. It asserts greedy equality, e.g. that the outputs are exactly - the same when temperature is zero. - """ - arg1 = common_llm_kwargs + per_test_common_llm_kwargs + baseline_llm_kwargs - arg2 = common_llm_kwargs + per_test_common_llm_kwargs + test_llm_kwargs - env1 = env2 = None - - max_wait_seconds = 240 - results = [] - - prompts = [prompt for prompt, _ in zip(cycle(PROMPTS), range(batch_size))] - for args, env in ((arg1, env1), (arg2, env2)): - with RemoteOpenAIServer(model, - args, - env_dict=env, - max_wait_seconds=max_wait_seconds) as server: - client = server.get_client() - - completion = client.completions.create(model=model, - prompt=prompts, - max_tokens=max_output_len, - seed=seed, - temperature=temperature, - logprobs=logprobs) - - results.append({ - "test": - "seeded_sampling", - "text": [choice.text for choice in completion.choices], - "logprobs": [choice.logprobs for choice in completion.choices], - "finish_reason": - [choice.finish_reason for choice in completion.choices], - "usage": - completion.usage, - }) - - n = len(results) // 2 - arg1_results = results[:n] - arg2_results = results[n:] - # Separate logprobs to avoid asserting exact equality. - arg1_logprobs = [r.pop("logprobs") for r in arg1_results] - arg2_logprobs = [r.pop("logprobs") for r in arg2_results] - - for arg1_result, arg2_result in zip(arg1_results, arg2_results): - assert arg1_result == arg2_result, ( - f"Results for {model=} are not the same with {arg1=} and {arg2=}. " - f"{arg1_result=} != {arg2_result=}") - if logprobs: - for logs1, logs2 in zip(arg1_logprobs, arg2_logprobs): - for l1, l2 in zip(logs1, logs2): - assert l1.tokens == l2.tokens diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py deleted file mode 100644 index 6c453879a6a6..000000000000 --- a/tests/spec_decode/e2e/test_compatibility.py +++ /dev/null @@ -1,66 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from vllm import SamplingParams - -from .conftest import get_output_from_llm_generator - - -@pytest.mark.parametrize("common_llm_kwargs", - [{ - "model": "meta-llama/Llama-3.2-1B-Instruct", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - # Speculative max model len > overridden max model len should raise. 
- "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 129, - }, - "max_model_len": 128, - }, - { - # Speculative max model len > draft max model len should raise. - # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12 - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 2048 + 1, - }, - }, - { - # Speculative max model len > target max model len should raise. - # https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/9213176726f574b556790deb65791e0c5aa438b6/config.json#L18 - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 131072 + 1, - }, - }, - ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): - """Verify that speculative decoding validates speculative_max_model_len. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, match="cannot be larger than"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py deleted file mode 100644 index 7c369feec415..000000000000 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ /dev/null @@ -1,480 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, EAGLE would not break the -correctness for the target model outputs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-eagle-llama-68m-random" - -# max. number of speculative tokens: this corresponds to -# num_heads in the config.json of the speculator model. -MAX_SPEC_TOKENS = 4 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_eagle_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness_cuda_graph( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 8, - # 2 for small prompt, 256//8 for generated. 
- "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that eagle speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_eagle_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that eagle speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. 
- """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "meta-llama/Llama-2-7b-chat-hf", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-llama2-chat-7B", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # 2 for small prompt, 256//16 for generated. - "num_gpu_blocks_override": 2 + 256 // 16, - "max_model_len": (2 + 256 // 16) * 16, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "meta-llama/Meta-Llama-3-8B-Instruct", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # 2 for small prompt, 256//16 for generated. - "num_gpu_blocks_override": 2 + 256 // 16, - "max_model_len": (2 + 256 // 16) * 16, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": "float16", - - # Main model - "model_name": "Qwen/Qwen2-7B-Instruct", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "yuhuili/EAGLE-Qwen2-7B-Instruct", - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 32, - ]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize("seed", [1]) -def test_qwen2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_integration.py b/tests/spec_decode/e2e/test_integration.py deleted file mode 100644 index f15a9224c003..000000000000 --- a/tests/spec_decode/e2e/test_integration.py +++ /dev/null @@ -1,161 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -other features, e.g. cuda graphs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -MAIN_MODEL = "JackFram/llama-68m" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Verify equality when cuda graphs allowed. - "enforce_eager": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - # Identical models. - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("output_len", [32]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify spec decode equality when cuda graphs are enabled. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", []) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - # Explicitly specify draft model quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": "gptq", - }, - }, - # Explicitly specify GPTQ-based draft model to use marlin quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": "marlin", - }, - }, - # Not explicitly specify draft model quantization - { - "speculative_config": { - "model": "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", - "num_speculative_tokens": 5, - "quantization": None, - }, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, seed: int): - """Verify spec decode works well with draft model quantization configs. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": MAIN_MODEL, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py deleted file mode 100644 index a18be80c50dd..000000000000 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ /dev/null @@ -1,247 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -tensor parallelism. -""" - -import json -from typing import Optional - -import pytest -import torch - -from vllm.platforms import current_platform - -from .conftest import run_equality_correctness_test_tp - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. 
- "--enforce-eager", - "--tensor-parallel-size", - "2" - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("test_llm_kwargs", [ - [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }), - ], - [ - "--speculative_config", - json.dumps({ - "model": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - }), - ], -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify greedy equality when tensor parallelism is used. - """ - if current_platform.is_rocm(): - pytest.skip("hip is not well-supported yet") - run_equality_correctness_test_tp("JackFram/llama-68m", - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ]), - ("ibm-granite/granite-3b-code-instruct", [ - "--speculative_config", - json.dumps({ - "model": "ibm-granite/granite-3b-code-instruct", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ])]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - seed: int): - """Verify spec decode works well with smaller tp for draft models. - """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. 
- "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [["--enable-chunked-prefill", "False"], - [ - "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", - "--max-num-seqs", "4" - ]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - }), - ]), - ("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "draft_tensor_parallel_size": 1, - }), - ])]) -@pytest.mark.parametrize("logprobs", [None]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - logprobs: Optional[int], - batch_size: int, seed: int): - """Verify spec decode works well with same and different TP size for - the draft model with chunked prefill. - """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0, - logprobs=logprobs) - - -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Need at least 2 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor_parallel_size", - "2", - - # precision - "--dtype", - "bfloat16", - ]]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [["--enable-chunked-prefill", "False"], - [ - "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4", - "--max-num-seqs", "4" - ]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize("model, test_llm_kwargs", - [("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }), - ]), - ("JackFram/llama-68m", [ - "--speculative_config", - json.dumps({ - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "draft_tensor_parallel_size": 1, - "disable_logprobs": False, - }), - ])]) -@pytest.mark.parametrize("logprobs", [2]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_chunked_prefill_tp2_with_logprobs( - model, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int], - batch_size: int, seed: int): - """Verify spec decode works well with same and different TP size for - the draft model with chunked prefill. 
- """ - run_equality_correctness_test_tp(model, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0, - logprobs=logprobs) diff --git a/tests/spec_decode/e2e/test_integration_dist_tp4.py b/tests/spec_decode/e2e/test_integration_dist_tp4.py deleted file mode 100644 index 039eec8fd2cc..000000000000 --- a/tests/spec_decode/e2e/test_integration_dist_tp4.py +++ /dev/null @@ -1,123 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests which cover integration of the speculative decoding framework with -tensor parallelism. -""" - -import json - -import openai -import pytest -import torch - -from .conftest import run_equality_correctness_test_tp - -MAIN_MODEL = "JackFram/llama-68m" -SPEC_MODEL = "JackFram/llama-68m" - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - # Skip cuda graph recording for fast test. - "--enforce_eager", - "--tensor-parallel-size", - "4", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - [], -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - #TODO(wooyeon): add spec_draft_dp=2 case - [ - "--speculative_config", - json.dumps({ - "model": f"{SPEC_MODEL}", - "num_speculative_tokens": 5, - "draft_tensor_parallel_size": 1, - }), - ], - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize("seed", [1]) -def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - seed: int): - """Verify spec decode works well with smaller tp for draft models. - """ - run_equality_correctness_test_tp(MAIN_MODEL, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=32, - seed=seed, - temperature=0.0) - - -@pytest.mark.skipif(torch.cuda.device_count() < 4, - reason="Need at least 4 GPUs to run the test.") -@pytest.mark.parametrize( - "common_llm_kwargs", - [[ - - # Skip cuda graph recording for fast test. - "--enforce-eager", - "--tensor-parallel-size", - "4", - ]]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) -@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - [ - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. - "--speculative_config", - json.dumps({ - "model": f"{SPEC_MODEL}", - "num_speculative_tokens": 5, - "max_model_len": 32, - }), - ], - ]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_skip_speculation(common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int): - """Verify job failure with RuntimeError when all sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. 
- - TODO: fix it to pass without raising Error. (#5814) - """ - with pytest.raises( - (openai.APIConnectionError, openai.InternalServerError)): - run_equality_correctness_test_tp(MAIN_MODEL, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_logprobs.py b/tests/spec_decode/e2e/test_logprobs.py deleted file mode 100644 index 4de7ee05605a..000000000000 --- a/tests/spec_decode/e2e/test_logprobs.py +++ /dev/null @@ -1,315 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from itertools import cycle - -import pytest - -from vllm import SamplingParams - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 7, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4, 12]) -def test_logprobs_equality(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int, prefill_chunk_size: int): - """Verify output logprobs are equal with and without speculative decoding, - as well as with and without chunked prefill. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}, { - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 6, - "disable_logprobs": False, - }, -}]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_logprobs_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int, logprobs: int): - """Veriy logprob greedy equality with different speculation lens. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - # Artificially limit the draft model max model len; this forces - # vLLM to skip speculation once the sequences grow beyond 32-k - # tokens. - "max_model_len": 32, - }, - }]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1]) -def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify logprobs greedy equality when some sequences skip speculation. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, -}]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [6]) -def test_logprobs_temp_1(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Verify at least one logprob result has num_logprobs+1, which tests the - case where the sampled token is not in top-k logprobs. - - Ideally, this test should validate equality with non-spec by getting - logprobs. This is left as future improvement. 
- """ - temperature = 1.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - "San Francisco is know for its", - "Facebook was created in 2004 by", - "Curious George is a", - "Python 3.11 brings improvements to its", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - logprobs=logprobs, - ) - - sd_args = { - **common_llm_kwargs, - **per_test_common_llm_kwargs, - **test_llm_kwargs, - } - - with vllm_runner(**sd_args) as vllm_model: - sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) - - num_returned_logprobs = [ - len(seq_logprobs) for seq_logprobs in sd_outputs[-1] - ] - - # Assert one of the returned logprobs has > num_logprobs (indicating the - # sampled token is not in top-k). - assert any( - [num_returned > logprobs for num_returned in num_returned_logprobs]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": True, - }, -}]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("logprobs", [0]) -def test_logprobs_disabled(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, logprobs: int): - """Check the behavior when logprobs are disabled. - Token choices should match with the base model. - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - temperature=0.0, - logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) diff --git a/tests/spec_decode/e2e/test_medusa_correctness.py b/tests/spec_decode/e2e/test_medusa_correctness.py deleted file mode 100644 index bc9501bd5737..000000000000 --- a/tests/spec_decode/e2e/test_medusa_correctness.py +++ /dev/null @@ -1,417 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. 
- -With those tests, we can say at least, Medusa would not break the -correctness for the target model outputs. -""" - -import pytest - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - -# main model -# lmsys/vicuna-7b-v1.3 was to be used but it's causing -# OOM in CI pipeline, so using a smaller model. -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random" - -# max number of speculative tokens: this corresponds to -# num_heads in the config.json of the speculator model. -MAX_SPEC_TOKENS = 5 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 8, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, logprobs: int, - prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness_cuda_graph( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. 
- 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify that medusa speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_medusa_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - output_len: int, seed: int, - prefill_chunk_size: int): - """Verify that medusa speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. 
- """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int, prefill_chunk_size: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_mlp_correctness.py b/tests/spec_decode/e2e/test_mlp_correctness.py deleted file mode 100644 index 0e41d93eaa19..000000000000 --- a/tests/spec_decode/e2e/test_mlp_correctness.py +++ /dev/null @@ -1,533 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, MLPSpeculator would not break the -correctness for the target model outputs. -""" - -from unittest.mock import patch - -import pytest - -from vllm.model_executor.layers.vocab_parallel_embedding import pad_vocab_size - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-160m" - -# speculative model -SPEC_MODEL = "ibm-ai-platform/llama-160m-accelerator" - -# max. number of speculative tokens: this corresponds to -# n_predict in the config.json of the speculator model. 
-MAX_SPEC_TOKENS = 3 - -# precision -PRECISION = "float32" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [4, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 32]) -def test_mlp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "model": SPEC_MODEL, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [8]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int, prefill_chunk_size: int): - """Verify greedy equality with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - # NOTE Test is sensitive enough st if we don't enable chunked prefill - # scheduling on baseline too, we get slightly different logprobs, ending - # up sampling different tokens at the tail (ie top tokens don't change). - # TL;DR: sd+cp == org+cp but sd+cp != org..is this expected? - maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize("output_len", [2048]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_acceptance_rate(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify acceptance rate with different batch size and large output - length.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=0.0, - seed=seed, - expected_acceptance_rate=0.48) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # Speculative config - "speculative_config": { - "model": SPEC_MODEL, - }, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) -@pytest.mark.parametrize("output_len", [64]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("temperature", [1.0]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - temperature: float, - prefill_chunk_size: int, seed: int): - """Verify seeded runs produce the same output.""" - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - maybe_enable_chunked_prefill(prefill_chunk_size, baseline_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - seed=seed) - - # Ensure this same test does fail if we _don't_ include per-request seeds - with pytest.raises(AssertionError): - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - seed=seed, - disable_seed=True) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. 
- "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": SPEC_MODEL, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -def test_mlp_e2e_greedy_correctness_with_padding( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality when the vocab dimension is padded - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - - # Default pad_to is 64, test model has vocab_size of 32000 - def patched_pad_vocab_size(vocab_size, pad_to=None): - return pad_vocab_size(vocab_size, pad_to=32064) - - with patch( - "vllm.model_executor.layers.vocab_parallel_embedding.pad_vocab_size", - patched_pad_vocab_size): - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": SPEC_MODEL, - "num_speculative_tokens": k, - }, - } - # Try a range of num. 
speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - prefill_chunk_size: int, seed: int, output_len: int): - """Verify that mlp speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "disable_by_batch_size": 4, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -# Speculative decoding is disabled when sequences reach decoding and the batch -# consists of single-token requests. Hence we set `max_num_seqs` -# >= `speculative_disable_by_batch_size` to test feature interaction. -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mlp_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - prefill_chunk_size: int, seed: int, - output_len: int): - """Verify that mlp speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. - """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": MAIN_MODEL, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": SPEC_MODEL, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_mqa_scorer(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, prefill_chunk_size: int, seed: int): - """Verify that speculative decoding generates the same output - with batch expansion scorer and mqa scorer. 
- """ - maybe_enable_chunked_prefill(prefill_chunk_size, test_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_mtp_correctness.py b/tests/spec_decode/e2e/test_mtp_correctness.py deleted file mode 100644 index d9c7be8ffe71..000000000000 --- a/tests/spec_decode/e2e/test_mtp_correctness.py +++ /dev/null @@ -1,333 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. - -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various number of speculative tokens. - -With those tests, we can say at least, mtp would not break the -correctness for the target model outputs. -""" - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "luccafong/deepseek_mtp_main_random" - -# max. number of speculative tokens: this corresponds to -# num_nextn_predict_layers in the config.json of the speculator model. -MAX_SPEC_TOKENS = 1 - -# precision -PRECISION = "bfloat16" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_mtp_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - output_len, - seed, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "enforce_eager": False, - - # Print spec metrics. - "disable_log_stats": False, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - "gpu_memory_utilization": 0.85 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 128, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness_cuda_graph(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size: int, - output_len: int, seed: int): - """Verify greedy equality with cuda graph enabled and different - batch sizes.""" - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 8, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - }, - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 128, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. 
- """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "num_speculative_tokens": k, - }, - } - # Try a range of num. speculative tokens - for k in range(1, 1 + MAX_SPEC_TOKENS) - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that mtp speculative decoding produces exact equality - to without spec decode with different values of num_speculative_tokens. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Precision - "dtype": PRECISION, - - # Main model - "model_name": MAIN_MODEL, - - # GPU memory utilization - "gpu_memory_utilization": 0.9 - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "num_speculative_tokens": MAX_SPEC_TOKENS, - "disable_by_batch_size": 4 - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_mtp_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that mtp speculative decoding produces exact equality - to without spec decode when speculation is disabled for large - batch sizes. - """ - run_equality_correctness_test(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size, output_len, seed) - - -if __name__ == "__main__": - import pytest - pytest.main([__file__]) diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py deleted file mode 100644 index ccc8e745ab37..000000000000 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ /dev/null @@ -1,842 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""The tests in this file verify end-to-end speculative decoding correctness. - -This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. 
- -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. This gives us good coverage of temp=0. - -At temp=0, the TypicalAcceptanceSampler ensures that only the tokens with the -highest probability in the target distribution are accepted. Therefore, we can -expect greedy equality for the TypicalAcceptanceSampler at temp=0. - -For temp>0, we rely on unit tests on the rejection sampler to verify that the -output distribution is the same with spec decode vs. no spec decode (this would -be prohibitively expensive to run with a real model). Similarly, for the -TypicalAcceptance sampler also, we rely on unit tests to validate temp>0 -test cases. - -NOTE: Speculative decoding's distribution equality requires that the measured -distributions of the target model and proposal model be deterministic given the -same input. vLLM largely guarantees this. - -@cadedaniel has seen cases where the output probabilities of a draft/target -model change slightly with certain batch sizes or prompts, even with Torch -determinism flags set. It is unclear if this is a bug in vLLM, due to non- -determinism in on-device batched operations, a bug in vLLM's spec decode -implementation, or the "hardware numerics" limitations. Either way, rejection -sampling ensures the output distribution matches the target model, but it breaks -greedy-equality tests for those batch sizes/prompts. -""" - -from itertools import cycle - -import pytest -from transformers import AutoTokenizer - -from vllm import SamplingParams - -from ...utils import create_new_process_for_each_test -from .conftest import (get_output_from_llm_generator, - run_equality_correctness_test) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Use a small model for a fast test. - # Note this is repeated in the test body; to initialize a tokenizer. - "model": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - # Chunked prefill enabled with small value - # to make sure we get mixed batches. - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, - { - # Verify the detokenizer assertions in the test work when spec - # decode is disabled. - }, - ]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_with_detokenization(test_llm_generator, - batch_size: int): - """Run generation with speculative decoding on a batch. Verify the engine - generates the correct number of tokens (via ignore_eos=True), and that the - detokenization matches HF transformers. 
- """ - output_len = 32 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - batch_tokens, batch_token_ids, _ = get_output_from_llm_generator( - test_llm_generator, prompts, sampling_params) - - # Expect a generation for each prompt in the batch. - assert len(batch_token_ids) == len(prompts) - - # Expect each generation to have expected number of tokens (note ignore_eos - # is True). - assert [len(token_ids) - for token_ids in batch_token_ids] == ([output_len] * batch_size) - - # Expect detokenized string to match. - tok = AutoTokenizer.from_pretrained("JackFram/llama-68m") - for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids): - expected_tokens = tok.decode(actual_token_ids) - print(f"{actual_token_ids=}") - assert actual_tokens.strip() == expected_tokens.strip() - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_logprobs": False, - }, - "enable_chunked_prefill": False, -}, { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 3, - "disable_logprobs": False, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, -}]) -@pytest.mark.parametrize( - "output_len", - [ - # Use long output len for the small model test. - 10, - ]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a tiny model with batch size of one. - - Since this test is cheaper than other e2e correctness tests, we generate - with a higher output_len. - - When the draft model is the same as the target model, we further check - whether all speculative tokens are accepted. - """ - ensure_all_accepted = per_test_common_llm_kwargs.get( - "model_name") == test_llm_kwargs.get("speculative_config")["model"] - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - prompt_logprobs=2, - logprobs=2, - disable_logprobs=False, - temperature=0.0, - ensure_all_accepted=ensure_all_accepted) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [64]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a tiny model and large batch size. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # Try two different tiny base models. - # Note that one is equal to the draft model, another isn't. - { - "model_name": "JackFram/llama-68m", - }, - { - "model_name": "JackFram/llama-160m", - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("max_output_len", [ - 256, -]) -@pytest.mark.parametrize("batch_size", [32]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - max_output_len: int, seed: int): - """Verify greedy equality on a tiny model, with a large batch size, and when - sampling respects the EOS token. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len, - seed=seed, - temperature=0.0, - ignore_eos=False) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # A "real" model (not tiny). - "model_name": "meta-llama/Llama-2-7b-chat-hf", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. 
- "disable_log_stats": False, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [1]) -@pytest.mark.parametrize( - "output_len", - [ - # Use decently long output len for a high quality test. - 256, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_real_model_bs1( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality on a "real" model and batch size of 1. This is - separate from large BS tests to make identifying the source of bugs easier. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # A "real" model (not tiny). - "model_name": "meta-llama/Llama-2-7b-chat-hf", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [32]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality with a "real" model on a nontrivial batch size. - This is the closest test to a real production workload. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize( - "per_test_common_llm_kwargs", - [ - # https://github.com/triton-lang/triton/issues/2266 tl.dot - # doesn't support embedding < 16 - { - "block_size": 16, - }, - { - "block_size": 32, - }, - ]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - """Verify greedy equality over different block sizes. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - - # Artificially limit the draft model max model len; this forces vLLM - # to skip speculation once the sequences grow beyond 32-k tokens. 
- "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 32, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "max_model_len": 32, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, - ]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize( - "output_len", - [ - # This must be a good bit larger than speculative_max_model_len so that - # we can test the case where all seqs are skipped, but still small to - # ensure fast test. - 64, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_skip_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality when some (or all) sequences skip speculation. - We do this by setting the max model len of the draft model to an - artificially low value, such that when the sequences grow beyond it, they - are skipped in speculative decoding. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_by_batch_size": 2, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": 5, - "disable_by_batch_size": 2, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - }, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("output_len", [10]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_disable_speculation(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality when all sequences disable speculation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - }, - "enable_chunked_prefill": False, - } - # Try a range of common k, as well as large speculation. 
- for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] - ] + [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4, - } for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, - output_len: int, seed: int): - """Verify that speculative decoding produces exact equality to without spec - decode with many different values of k. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-160m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - "acceptance_method": "typical_acceptance_sampler", - }, - "enable_chunked_prefill": False - } - # Try a range of common k. - for k in [1, 2, 3] - ] + [{ - "speculative_config": { - "model": "JackFram/llama-68m", - "num_speculative_tokens": k, - "acceptance_method": "typical_acceptance_sampler", - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - } for k in [1, 2, 3]]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -@create_new_process_for_each_test() -def test_typical_acceptance_sampling(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - seed: int): - """Verify that speculative decoding produces exact equality to without spec - decode with TypicalAcceptanceSampler as the draft token acceptance - sampling method. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py deleted file mode 100644 index 58d1a6ca7add..000000000000 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ /dev/null @@ -1,392 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -Most of the tests rely on "greedy equality", where we expect the output of -speculative decoding on a sequence to exactly match the output of normal non- -speculative decoding. 
- -Since speculative decoding with rejection sampling guarantees that the output -distribution matches the target model's output distribution (up to hardware -numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy -equality. - -For ngram lookup, its idea comes from https://github.com/apoorvumang/prompt-lookup-decoding, -and is merged into transform code base: https://github.com/huggingface/transformers/pull/27775. -Since there is no model is needed for generate the proposal, we could make -the testcase much simpler than drafter multi-step one. - -However, we still need to verify below scenario could be passed: - * Batch size 1 greedy equality - * Batch size >1 greedy equality - * Test greedy equality under preemption - * Test greedy equality under various ngram sizes / speculative sizes - -With those tests, we can say at least, ngram spec would not break the -correctness for the target model outputs. -""" - -import pytest - -from ..utils import maybe_enable_chunked_prefill -from .conftest import run_equality_correctness_test - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-68m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": False, - }, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 256, -]) -@pytest.mark.parametrize("batch_size", [1, 32]) -@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, - prefill_chunk_size: int, seed: int): - """Verify greedy equality on a tiny model with different batch size.""" - maybe_enable_chunked_prefill(prefill_chunk_size, common_llm_kwargs) - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # Print spec metrics. - "disable_log_stats": False, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-68m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_logprobs": False, - }, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_logprobs": True, - }, - }, -]) -@pytest.mark.parametrize("output_len", [ - 8, -]) -@pytest.mark.parametrize("batch_size", [8]) -@pytest.mark.parametrize("seed", [1]) -@pytest.mark.parametrize("logprobs", [1, 6]) -def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, - batch_size: int, output_len: int, seed: int, - logprobs: int): - """Verify greedy equality on a tiny model with different batch size.""" - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0, - logprobs=logprobs, - prompt_logprobs=logprobs, - disable_logprobs=test_llm_kwargs["speculative_config"] - ["disable_logprobs"]) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "block_size": 16, - # 2 for small prompt, 256//8 for generated. - "num_gpu_blocks_override": 2 + 256 // 8, - "max_model_len": (2 + 256 // 8) * 8, - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "model_name": "JackFram/llama-160m", - }, -]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - }, - "enable_chunked_prefill": False, - }, - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 - }, -]) -@pytest.mark.parametrize( - "output_len", - [ - # Use small output len for fast test. - 256, - ]) -@pytest.mark.parametrize("batch_size", [4]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_e2e_greedy_correctness_with_preemption( - vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, - baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify greedy equality, even when some sequences are preempted mid- - generation. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=0, - seed=seed) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize( - "test_llm_kwargs", - [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": k, - "prompt_lookup_max": 3, - }, - } - # Try a range of common k, as well as large speculation. - for k in [1, 3, 5] - ] + [ - { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": k, - "prompt_lookup_max": 1, - }, - } - # Try a range of common k, as well as large speculation. - for k in [1, 3, 5] - ]) -@pytest.mark.parametrize("batch_size", [2]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_different_k(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding produces exact equality - to without spec decode with many different values of k and - different ngram prompt_lookup_max. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. - "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_by_batch_size": 4 - }, -}, { - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_by_batch_size": 4, - "disable_mqa_scorer": True, - }, - "enable_chunked_prefill": True, - "max_num_batched_tokens": 4, - "max_num_seqs": 4 -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_disable_queue(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding produces exact equality - to without spec decode with many different values of k and - different ngram prompt_lookup_max. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # The original model is float32, keep it for numerical stability. 
- "dtype": "float32", - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) -@pytest.mark.parametrize("test_llm_kwargs", [{ - "speculative_config": { - "method": "ngram", - "num_speculative_tokens": 5, - "prompt_lookup_max": 3, - "disable_mqa_scorer": True, - }, -}]) -@pytest.mark.parametrize("batch_size", [1, 5]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. - 32, - ]) -@pytest.mark.parametrize("seed", [1]) -def test_ngram_scorer(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, output_len: int, - seed: int): - """Verify that ngram speculative decoding generates the same output - with batch expansion scorer and mqa scorer. - """ - run_equality_correctness_test(vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - seed=seed, - temperature=0.0) diff --git a/tests/spec_decode/e2e/test_seed.py b/tests/spec_decode/e2e/test_seed.py deleted file mode 100644 index 4cf373809dba..000000000000 --- a/tests/spec_decode/e2e/test_seed.py +++ /dev/null @@ -1,70 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - -from .conftest import run_equality_correctness_test - -# main model -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "JackFram/llama-160m" - - -@pytest.mark.parametrize( - "common_llm_kwargs", - [{ - "model_name": "JackFram/llama-68m", - - # Skip cuda graph recording for fast test. - "enforce_eager": True, - - # speculative config - "speculative_config": { - "model": "JackFram/llama-160m", - "num_speculative_tokens": 3, - }, - }]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) -@pytest.mark.parametrize("baseline_llm_kwargs", [{"seed": 1}]) -@pytest.mark.parametrize("test_llm_kwargs", [{"seed": 5}]) -@pytest.mark.parametrize("batch_size", [1, 8, 32]) -@pytest.mark.parametrize("temperature", [0.1, 1.0]) -@pytest.mark.parametrize( - "output_len", - [ - # Use smaller output len for fast test. 
- 20, - ]) -def test_seeded_consistency(vllm_runner, common_llm_kwargs, - per_test_common_llm_kwargs, baseline_llm_kwargs, - test_llm_kwargs, batch_size: int, - temperature: float, output_len: int): - """Verify outputs are consistent across multiple runs with same seed - """ - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - disable_seed=False, - ) - - # Ensure this same test does fail if we _don't_ include per-request seeds - with pytest.raises(AssertionError): - run_equality_correctness_test( - vllm_runner, - common_llm_kwargs, - per_test_common_llm_kwargs, - baseline_llm_kwargs, - test_llm_kwargs, - batch_size, - max_output_len=output_len, - temperature=temperature, - disable_seed=True, - ) diff --git a/tests/spec_decode/test_batch_expansion.py b/tests/spec_decode/test_batch_expansion.py deleted file mode 100644 index d20c549b0905..000000000000 --- a/tests/spec_decode/test_batch_expansion.py +++ /dev/null @@ -1,110 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest -import torch - -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer - -from .utils import create_seq_group_metadata_from_prompts, mock_worker - - -@pytest.mark.parametrize('num_target_seq_ids', [100]) -@pytest.mark.skip_global_cleanup -def test_create_target_seq_id_iterator(num_target_seq_ids: int): - """Verify all new sequence ids are greater than all input - seq ids. - """ - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - - all_seq_ids = [ - [1, 3, 5, 7], - list(range(100)) + [0], - [100], - ] - - for seq_ids in all_seq_ids: - max_seq_id = max(seq_ids) - iterator = scorer._create_target_seq_id_iterator(seq_ids) # pylint: disable=protected-access - for _ in range(num_target_seq_ids): - assert next(iterator) > max_seq_id - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup -def test_get_token_ids_to_score(k: int): - """Verify correct tokens are selected for scoring. - """ - proposal_token_ids = torch.tensor( - list(range(k)), - dtype=torch.int64, - device='cuda', - ) - - expected_output: list[list[int]] = [ - [], - ] - for i in range(proposal_token_ids.shape[0]): - expected_output.append(proposal_token_ids[:i + 1].tolist()) - - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - actual_output = scorer._get_token_ids_to_score(proposal_token_ids.tolist()) # pylint: disable=protected-access - - actual_output = [ - x.tolist() if isinstance(x, torch.Tensor) else x for x in actual_output - ] - - assert actual_output == expected_output - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.skip_global_cleanup -def test_create_single_target_seq_group_metadata(k: int): - """Verify correct creation of a batch-expanded seq group metadata. 
- """ - - prompt_tokens = [1, 2, 3] - prev_output_tokens = [4, 5, 6] - - token_ids = list(range(k)) - - num_tokens_processed = len(prompt_tokens) + len(prev_output_tokens) - 1 - - final_seq_len = len(prompt_tokens) + len(prev_output_tokens) + len( - token_ids) - - block_size = 32 - input_seq_group_metadata = create_seq_group_metadata_from_prompts( - [prompt_tokens], 2048 // block_size, block_size, [final_seq_len], - [prev_output_tokens], [num_tokens_processed])[0] - - input_seq_id = list(input_seq_group_metadata.seq_data.keys())[0] - target_seq_id = 100 - - scorer = BatchExpansionTop1Scorer(mock_worker(), 'cuda:0', 32_000) - output = scorer._create_single_target_seq_group_metadata( # pylint: disable=protected-access - input_seq_group_metadata, - input_seq_id, - target_seq_id, - token_ids, - input_seq_group_metadata.sampling_params, - ) - - assert output.request_id == input_seq_group_metadata.request_id - assert output.sampling_params.repetition_penalty == \ - input_seq_group_metadata.sampling_params.repetition_penalty - assert output.sampling_params.temperature == \ - input_seq_group_metadata.sampling_params.temperature - assert output.sampling_params.top_p == \ - input_seq_group_metadata.sampling_params.top_p - assert output.sampling_params.top_k == \ - input_seq_group_metadata.sampling_params.top_k - assert len(output.seq_data) == 1 - assert output.seq_data[target_seq_id].get_prompt_token_ids() == tuple( - prompt_tokens) - assert output.seq_data[target_seq_id].get_output_token_ids() == tuple( - prev_output_tokens + token_ids) - - assert len(output.block_tables) == 1 - assert output.block_tables[ - target_seq_id] == input_seq_group_metadata.block_tables[input_seq_id] diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py deleted file mode 100644 index 407786ad3c64..000000000000 --- a/tests/spec_decode/test_dynamic_spec_decode.py +++ /dev/null @@ -1,90 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock, patch - -import pytest -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker -from vllm.spec_decode.top1_proposer import Top1Proposer - -from .test_utils import mock_spec_decode_sampler -from .utils import create_batch, mock_worker - - -@pytest.mark.parametrize('queue_size', [4]) -@pytest.mark.parametrize('batch_size', [1]) -@pytest.mark.parametrize('k', [1]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int, - acceptance_sampler_method: str): - """Verify that speculative tokens are disabled when the batch size - exceeds the threshold. 
- """ - disable_by_batch_size = 3 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - disable_by_batch_size=disable_by_batch_size) - - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - running_queue_size=queue_size) - - if queue_size > disable_by_batch_size: - with patch.object(worker, - '_run_no_spec', - side_effect=ValueError(exception_secret)), \ - pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - - # When the batch size is larger than the threshold, - # we expect no speculative tokens (0). - expected_num_spec_tokens = None if queue_size < disable_by_batch_size else 0 - assert seq_group_metadata_list[ - 0].num_speculative_tokens == expected_num_spec_tokens - - draft_worker.sampler_output.side_effect = ValueError(exception_secret) - - proposer = Top1Proposer( - worker=draft_worker, - device='cpu', # not used - vocab_size=100, # not used - # Must be long enough to avoid being skipped due to length. - max_proposal_len=1024, - ) - - if queue_size < disable_by_batch_size: - # Should raise exception when executing the mocked draft model. - with pytest.raises(ValueError, match=exception_secret): - proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - else: - # Should not execute the draft model because spec decode is disabled - # for all requests. Accordingly, the proposal length should be 0. - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - assert proposals.proposal_lens.tolist() == [0] * batch_size diff --git a/tests/spec_decode/test_memory_usage.py b/tests/spec_decode/test_memory_usage.py deleted file mode 100644 index 5d9dd3f72a78..000000000000 --- a/tests/spec_decode/test_memory_usage.py +++ /dev/null @@ -1,91 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""This docstring details important information on the testing methodology. - -This test verifies that memory usage remains constant (or never grows) when -we enable / disable speculation via --speculative-disable-by-batch-size. - -There are a lot of things we try to keep track of between batches of requests -and if certain tensors are not freed from memory, can result in CUDA ooms. - -This is particularly relevant for production situations where speculation might -be enabled during off hours, but disabled once traffic peaks during the workday. -Since traffic will stay high for a long period of time, verifying we do not -increase our memory usage over time is essential to prevent possible CUDA ooms. 
-""" - -import torch - -import vllm -from tests.core.utils import create_dummy_prompt -from vllm.sequence import SequenceGroup - -ITERATIONS = 100 -MAIN_MODEL = "JackFram/llama-68m" - -# speculative model -SPEC_MODEL = "abhigoyal/vllm-medusa-llama-68m-random" - -BATCH_SIZE = 5 -SPEC_DISABLE_BATCH_SIZE = 2 - - -def add_seq_group_to_engine(engine: vllm.LLMEngine, seq_group: SequenceGroup): - scheduler = engine.scheduler[0] - scheduler.add_seq_group(seq_group) - - -""" -Since we are using a batch size greater than the disabled batch size, -we can ensure we go through the _no_spec codepath for most of our engine steps. -""" - - -def test_memory_usage_no_spec(): - previous_memory_allocated = None - llm = vllm.LLM(model=MAIN_MODEL, - speculative_config={ - "model": SPEC_MODEL, - "num_speculative_tokens": 3, - "disable_by_batch_size": SPEC_DISABLE_BATCH_SIZE, - }) - - batch_sequences = set() - engine = llm.llm_engine - - for i in range(ITERATIONS): - seq, seq_group = create_dummy_prompt(request_id=str(i), - prompt_length=10, - min_tokens=10, - max_tokens=10) - - add_seq_group_to_engine(engine, seq_group) - - batch_sequences.add(seq) - engine.step() - for seq in list(batch_sequences): - if seq.is_finished(): - batch_sequences.remove(seq) - - # If we aren't at our batch size yet, continue - if len(batch_sequences) <= BATCH_SIZE: - continue - - # Otherwise, loop until at least one request is done - while not any(seq.is_finished() for seq in batch_sequences): - engine.step() - - # Remove it from the set - for seq in list(batch_sequences): - if seq.is_finished(): - batch_sequences.remove(seq) - - # At this point, we are always at the case where we have finished - # processing some number of requests from the batch after running - # several _no_spec executions. The memory should not have - # increased between the previous time this was recorded and the - # current time. - if previous_memory_allocated is None: - previous_memory_allocated = torch.cuda.memory_allocated() - else: - assert previous_memory_allocated == torch.cuda.memory_allocated() diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py deleted file mode 100644 index e8de410f8a94..000000000000 --- a/tests/spec_decode/test_metrics.py +++ /dev/null @@ -1,205 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.spec_decode.metrics import AsyncMetricsCollector - - -def test_initial_call_returns_none(): - """Expect first call to get metrics to return None. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=0) - maybe_metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert maybe_metrics is None - - -def test_second_call_returns_metrics(): - """Expect second call to not return None. 
- """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -@pytest.mark.parametrize("rank", [1, 2, 3, 4]) -def test_nonzero_rank_noop(rank): - """Verify nonzero ranks don't collect metrics. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collector = AsyncMetricsCollector(spec_decode_sampler) - collector.init_gpu_tensors(rank=rank) - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - -def test_noop_until_time(): - """Verify metrics aren't collected until enough time passes. - """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s - 0.1, collect_interval_s - 0.1, - collect_interval_s + 0.1, collect_interval_s + 0.1 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -def test_timer_is_reset(): - """Verify that the internal timer inside AsyncMetricsCollector - is reset after collection. 
- """ - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = 0 - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, - collect_interval_s + 0.1, - collect_interval_s + 0.1, - collect_interval_s + 0.2, - collect_interval_s + 0.2, - 2 * collect_interval_s + 0.1, - 2 * collect_interval_s + 0.1, - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is None - - _ = collector.maybe_collect_rejsample_metrics(k=5) - metrics = collector.maybe_collect_rejsample_metrics(k=5) - assert metrics is not None - - -@pytest.mark.parametrize("has_data", [True, False]) -def test_initial_metrics_has_correct_values(has_data: bool): - """Test correctness of metrics data. - """ - if has_data: - num_accepted_tokens = 103 - num_emitted_tokens = 104 - num_draft_tokens = 105 - else: - num_accepted_tokens = 0 - num_emitted_tokens = 0 - num_draft_tokens = 0 - k = 5 - - max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens( - num_draft_tokens, k) - - spec_decode_sampler = MagicMock() - spec_decode_sampler.num_accepted_tokens = torch.tensor(num_accepted_tokens, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_emitted_tokens = torch.tensor(num_emitted_tokens, - dtype=torch.long, - device='cuda') - spec_decode_sampler.num_draft_tokens = num_draft_tokens - - collect_interval_s = 5.0 - timer = MagicMock() - timer.side_effect = [ - 0.0, collect_interval_s + 0.1, collect_interval_s + 0.2 - ] - - collector = AsyncMetricsCollector(spec_decode_sampler=spec_decode_sampler, - timer=timer, - collect_interval_s=collect_interval_s) - collector.init_gpu_tensors(rank=0) - _ = collector.maybe_collect_rejsample_metrics(k) - metrics = collector.maybe_collect_rejsample_metrics(k) - - assert metrics.num_spec_tokens == k - assert metrics.accepted_tokens == num_accepted_tokens - assert metrics.draft_tokens == num_draft_tokens - assert metrics.emitted_tokens == num_emitted_tokens - - if has_data: - assert (metrics.draft_acceptance_rate == num_accepted_tokens / - num_draft_tokens) - assert (metrics.system_efficiency == num_emitted_tokens / - max_num_emitted_tokens) - else: - assert math.isnan(metrics.draft_acceptance_rate) - assert math.isnan(metrics.system_efficiency) diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py deleted file mode 100644 index f2d93203b8e1..000000000000 --- a/tests/spec_decode/test_multi_step_worker.py +++ /dev/null @@ -1,838 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.attention.selector import (_Backend, - global_force_attn_backend_context_manager) -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import (ExecuteModelRequest, HiddenStates, Logprob, - 
get_all_seq_ids) -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker import Worker - -from .utils import (assert_logprobs_dict_allclose, create_batch, - create_seq_group_metadata_from_prompts, create_worker, - patch_execute_model_with_seeds, zero_kv_cache) - - -@pytest.mark.parametrize('num_steps', list(range(1, 17))) -def test_assert_enough_kv_space(num_steps: int): - """Test that the multi step worker checks for sufficient space in the KV - cache. It should throw if it cannot run all the steps. - """ - block_size = 16 - num_gpu_blocks = 2048 // block_size - - prompts = [ - list(range(block_size * 3)), - list(range(block_size * 2)), - ] - - prev_output_tokens = [ - list(range(block_size * 1)), - list(range(block_size * 2)), - ] - - final_prompt_lens = [ - len(prompt + output) + num_steps - for prompt, output in zip(prompts, prev_output_tokens) - ] - - inputs = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens, - continuations=prev_output_tokens) - - assert_enough_kv_space = MultiStepWorker._assert_enough_kv_space # pylint: disable=protected-access - worker = MagicMock() - worker.model_runner.block_size = block_size - - for seq_group_metadata in inputs: - original_block_tables = seq_group_metadata.block_tables - - # No exception. - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = { - seq_id: [] - for seq_id, physical_blocks in original_block_tables.items() - } - - # Expect exception. - with pytest.raises(ValueError, - match='times but found insufficient KV space for'): - assert_enough_kv_space(worker, inputs, num_steps) - - seq_group_metadata.block_tables = original_block_tables - - -@torch.inference_mode() -def test_same_output_for_single_step(): - """Verify the multi step worker produces the same output as the normal - worker for num_steps=1. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 32 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - # multi_step_worker.model_runner = worker.model_runner - # multi_step_worker.cache_engine = worker.cache_engine - - num_steps = 1 - - prompts = [ - [1, 2, 3, 4, 5], - [6, 7, 8, 9, 10], - ] - - final_prompt_lens = [len(prompt) + num_steps for prompt in prompts] - - multi_step_seq_group = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - actual_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=multi_step_seq_group), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step=set()) - assert len(actual_output) == num_steps - actual_output = actual_output[0] - - single_step_seq_group = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - zero_kv_cache(worker.cache_engine) - set_random_seed(seed) - expected_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=single_step_seq_group))[0] - - actual_token_ids = [ - output.samples[0].output_token for output in actual_output - ] - actual_logprobs = [output.samples[0].logprobs for output in actual_output] - - expected_token_ids = [ - output.samples[0].output_token for output in expected_output - ] - expected_logprobs = [ - output.samples[0].logprobs for output in expected_output - ] - - assert actual_token_ids == expected_token_ids - - print(f'{actual_logprobs=}') - print(f'{expected_logprobs=}') - assert_logprobs_dict_allclose(actual_logprobs, expected_logprobs) - - -@torch.inference_mode() -def test_same_output_for_multi_step(): - """Verify the multi-step worker produces the same output as the normal - worker when num_steps > 1. This test runs the multi-step worker once, and - then runs the worker num_steps times, and compares the output. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - # Make sure we go over the block boundary. - num_steps = block_size + 1 - - random.seed(seed) - prompts = [[ - random.randint(0, 1000) for _ in range(random.randint(10, 20)) - ] for _ in range(10)] - - final_prompt_lens = [len(prompt) + num_steps for prompt in prompts] - - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - - continuations = [[1] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. 
- zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step=set()) - - # Run single-step repeatedly. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - continuations = [[1] for _ in prompts] - set_random_seed(seed) - - for _ in multi_step_output: - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Get token ids and logprobs for comparison. - multi_step_output_logprobs: list[list[dict[int, - Logprob]]] = [[] - for _ in prompts] - single_step_output_logprobs: list[list[dict[int, - Logprob]]] = [[] - for _ in prompts] - - multi_step_output_token_ids: list[list[int]] = [[] for _ in prompts] - single_step_output_token_ids: list[list[int]] = [[] for _ in prompts] - for i, _ in enumerate(prompts): - for multi_step, single_step in zip(multi_step_output, - single_step_output): - multi_step_output_token_ids[i].append( - multi_step[i].samples[0].output_token) - single_step_output_token_ids[i].append( - single_step[i].samples[0].output_token) - - multi_step_output_logprobs[i].append( - multi_step[i].samples[0].logprobs) - single_step_output_logprobs[i].append( - single_step[i].samples[0].logprobs) - - # Print per-sequence token ids - for i, (multi_step_tokens, single_step_tokens) in enumerate( - zip(multi_step_output_token_ids, single_step_output_token_ids)): - print(f'{i=} {multi_step_tokens=}') - print(f'{i=} {single_step_tokens=}') - print(f'{i=} equal {multi_step_tokens == single_step_tokens}') - - # Assert token ids are equal. - for multi_step_tokens, single_step_tokens in zip( - multi_step_output_token_ids, single_step_output_token_ids): - assert multi_step_tokens == single_step_tokens - - # Assert logprobs are equal. - for multi_step_logprobs, single_step_logprobs in zip( - multi_step_output_logprobs, single_step_output_logprobs): - assert_logprobs_dict_allclose(multi_step_logprobs, - single_step_logprobs) - - -@torch.inference_mode() -def test_multi_step_with_batch_expansion_correct_output(): - """ - In this test we verify that the MultiStepWorker is able to handle bonus - tokens correctly. The test verifies that if a sequence has a - bonus token then the MultiStepWorker is able to expand the batch by adding - new sequences corresponding to the sequences with bonus tokens. The - expanded batch is then used for predicting the next tokens. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 128 - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - random.seed(seed) - prompts = [[0] for _ in range(batch_size)] - num_steps = 2 - final_prompt_lens = [(num_steps + 1) for prompt in prompts] - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - # Create the test continuations - continuations = [[random.randint(0, 1000)] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - - # Run single-step twice to generate 2 tokens. This - # will simulate the bonus token case with the second token - # being the bonus token. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - set_random_seed(seed) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Create continuations for the MultiStepWorker. The continuations have - # 2 tokens in order to simulate the bonus token case. - multi_step_continuations = [] - for continuation in continuations: - multi_step_continuations.append(continuation[:2]) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step and verify that the third token prediction is accurate - # for all sequences. - zero_kv_cache(multi_step_worker.cache_engine) - all_seq_ids = {i for i in range(batch_size)} - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=1, - seq_ids_with_bonus_token_in_last_step=all_seq_ids) - for index, output in enumerate(multi_step_output[-1].outputs): - assert (continuations[index][-1] == output.samples[0].output_token) - - -@torch.inference_mode() -def test_multi_step_with_batch_expansion_incorrect_output(): - """ - Tests the MultiStepWorker's ability to handle batch expansion with bonus - tokens in a negative case scenario. This test provides the MultiStepWorker - with a batch containing sequences with bonus tokens but specifies the - sequence IDs with bonus tokens incorrectly. The test verifies that the - MultiStepWorker generates correct tokens for the sequences where the - sequence ID is specified correctly and incorrect tokens for those where - the sequence ID is specified incorrectly. 
- """ - seed = 100 - model_name = 'JackFram/llama-68m' - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 128 - multi_step_worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker( - Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - random.seed(seed) - prompts = [[0] for _ in range(batch_size)] - num_steps = 2 - final_prompt_lens = [(num_steps + 1) for prompt in prompts] - rand_seeds = list(random.randint(0, 100) for _ in range(num_steps)) - multi_step_worker.execute_model = patch_execute_model_with_seeds( - multi_step_worker, rand_seeds) - worker.execute_model = patch_execute_model_with_seeds(worker, rand_seeds) - # Create the test continuations - continuations = [[random.randint(0, 1000)] for _ in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - # Run single-step twice to generate 2 tokens. This - # will simulate the bonus token case with the second token - # being the bonus token. - zero_kv_cache(worker.cache_engine) - single_step_output: list[SamplerOutput] = [] - set_random_seed(seed) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=continuations, - final_prompt_lens=final_prompt_lens) - single_step_output.extend( - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list))) - # Append output tokens to new sequence data. - for i, seq_group_output in enumerate(single_step_output[-1]): - continuations[i].append(seq_group_output.samples[0].output_token) - - # Create continuations for the MultiStepWorker. The continuations have - # 2 tokens in order to simulate the bonus token case. - multi_step_continuations = [] - for continuation in continuations: - multi_step_continuations.append(continuation[:2]) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. In this run INCORRECTLY specify that only the odd number - # sequences have bonus tokens. Verify that with this setting the third token - # prediction is accurate only for the odd numbered sequences. Also verify - # that the prediction might be wrong for some of the even numbered - # sequences. - zero_kv_cache(multi_step_worker.cache_engine) - set_random_seed(seed) - odd_seq_ids = {i for i in range(batch_size) if i % 2 != 0} - multi_step_output, _ = multi_step_worker.sampler_output( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=1, - seq_ids_with_bonus_token_in_last_step=odd_seq_ids) - num_mismatch = 0 - for index, output in enumerate(multi_step_output[-1].outputs): - if (index % 2) != 0: - assert (continuations[index][-1] == output.samples[0].output_token) - elif (continuations[index][-1] != output.samples[0].output_token): - num_mismatch += 1 - # The prediction is accurate for some of the sequences even without proper - # handling of the bonus tokens. Hence verify that the number of sequences - # for which there is a mismatch is > 0. 
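# Editor's illustrative aside (toy helper, not a vLLM API): a "bonus token" is
# the extra token the target model emits when all k draft tokens are accepted.
# The draft model never ran a forward pass on that token, so its KV cache is
# missing that position and must be backfilled before the next proposal round.
# The test above deliberately mislabels which sequences carry a bonus token,
# so the backfill is skipped for the even-numbered ids and their next
# prediction is allowed to diverge.
def needs_kv_backfill(num_accepted: int, k: int) -> bool:
    # All k draft tokens accepted -> the target appended one bonus token that
    # the draft KV cache has not seen yet.
    return num_accepted == k


assert needs_kv_backfill(num_accepted=3, k=3) is True
assert needs_kv_backfill(num_accepted=2, k=3) is False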
- assert (num_mismatch > 0) - - -@torch.inference_mode() -@pytest.mark.parametrize('num_steps', [1, 2, 3, 4]) -# The choice of backends forces the multi_step_worker to choose between -# the vanilla model_runner and TP1DraftModelRunner and that we can test -# both code paths. -@pytest.mark.parametrize('attn_backend', - [_Backend.XFORMERS, _Backend.FLASH_ATTN]) -def test_multi_step_correct_kvcache(num_steps, attn_backend): - """Verify that the KV cache of the draft model - is correctly updated for sequences with bonus token. - """ - seed = 100 - model_name = "JackFram/llama-68m" - - block_size = 16 - num_gpu_blocks = 2048 // block_size - batch_size = 1 - - with global_force_attn_backend_context_manager(attn_backend): - dtype = 'float16' if attn_backend == _Backend.FLASH_ATTN else 'float32' - multi_step_worker = create_worker(MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - dtype=dtype) - multi_step_worker.set_include_gpu_probs_tensor() - worker = create_worker(Worker, - model_name, - block_size, - num_gpu_blocks, - seed, - dtype=dtype) - - prompts = [[0] for _ in range(batch_size)] - # Already generate two tokens for the sequence - # so that we can simulate the bonus token case - multi_step_continuations = [[ - random.randint(0, 1000), - random.randint(0, 1000) - ] for _ in prompts] - final_prompt_lens = [len(prompt) + 2 + num_steps for prompt in prompts] - - seq_ids_with_bonus_token_in_last_step = set(range(batch_size)) - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - # Run multi-step. - zero_kv_cache(multi_step_worker.cache_engine) - multi_step_worker.sampler_output(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list), - sample_len=num_steps, - seq_ids_with_bonus_token_in_last_step= - seq_ids_with_bonus_token_in_last_step) - - # Run single-step repeatedly. - zero_kv_cache(worker.cache_engine) - # Generate the kv cache for the bonus token first - single_step_continuations = [c[:1] for c in multi_step_continuations] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=single_step_continuations, - final_prompt_lens=final_prompt_lens) - single_step_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list)) - for _ in range(num_steps): - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - continuations=multi_step_continuations, - final_prompt_lens=final_prompt_lens) - - single_step_output = worker.execute_model( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list)) - - for i, seq_group_output in enumerate(single_step_output[-1]): - multi_step_continuations[i].append( - seq_group_output.samples[0].output_token) - - # Verify that the KV cache of the single-step and - # multi-step workers are the same. 
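# Editor's illustrative sketch (toy shapes and helper, not vLLM's CacheEngine
# API): the check below treats gpu_cache as a list over layers, with the key
# tensor at index 0 and the value tensor at index 1 of each entry, and compares
# within a loose tolerance since the fp16 path is typically not bitwise
# deterministic between the two execution orders.
import torch


def caches_match(cache_a, cache_b, rtol=1e-2, atol=1e-2) -> bool:
    # Compare key and value tensors layer by layer within the given tolerance.
    return all(
        torch.allclose(a[0], b[0], rtol=rtol, atol=atol)
        and torch.allclose(a[1], b[1], rtol=rtol, atol=atol)
        for a, b in zip(cache_a, cache_b))


def make_layer():
    # One toy "layer": a (key, value) pair of small CPU tensors.
    return torch.ones(2, 3), torch.zeros(2, 3)


assert caches_match([make_layer(), make_layer()],
                    [make_layer(), make_layer()])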
- single_step_gpu_cache = worker.cache_engine[0].gpu_cache - multi_step_gpu_cache = multi_step_worker.cache_engine[0].gpu_cache - num_layers = len(single_step_gpu_cache) - allclose = lambda a, b: torch.allclose( - a.cuda(), b.cuda(), rtol=1e-2, atol=1e-2) - for i in range(num_layers): - assert allclose(single_step_gpu_cache[i][0], - multi_step_gpu_cache[i][0]) - assert allclose(single_step_gpu_cache[i][1], - multi_step_gpu_cache[i][1]) - - -@torch.inference_mode() -def test_draft_proposals_full_speculation_len(): - """Verify Top1Proposer correctly handles case where all sequences - can speculate. - """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=2048, - ) - draft_worker.sampler_output.return_value = [ - SamplerOutput( - outputs=[], - sampled_token_probs=torch.rand(batch_size, - vocab_size, - device=device, - dtype=torch.float32), - logprobs=torch.rand(batch_size, - vocab_size, - device=device, - dtype=torch.float32), - sampled_token_ids=torch.randint(low=0, - high=vocab_size, - size=(batch_size, ), - device=device, - dtype=torch.long), - ) for _ in range(k) - ], True - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [k for _ in range(batch_size)] - - -@torch.inference_mode() -def test_draft_proposals_no_speculations(): - """Verify Top1Proposer correctly handles case where no sequences - can speculate. - """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - prompt_len = 10 - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=prompt_len + k - 1, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prompt_len=prompt_len) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)] - - -@torch.inference_mode() -def test_draft_proposals_mixed_k(): - """Verify Top1Proposer correctly handles case some sequences can - speculate and some can't. 
- """ - k = 10 - batch_size = 32 - vocab_size = 32_000 - device = 'cuda:0' - - small_prompt_len = 5 - long_prompt_len = 10 - prev_output_token_len = 20 - - expected_num_proposal_seqs = 6 - expected_num_no_proposal_seqs = batch_size - expected_num_proposal_seqs - - prompt_len = [ - small_prompt_len for _ in range(expected_num_proposal_seqs - 1) - ] + [long_prompt_len - for _ in range(expected_num_no_proposal_seqs)] + [small_prompt_len] - - draft_worker = MagicMock() - proposer = Top1Proposer( - worker=draft_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=long_prompt_len + prev_output_token_len + k - 1, - ) - - draft_worker.sampler_output.return_value = [ - SamplerOutput( - outputs=[], - sampled_token_probs=torch.rand(expected_num_proposal_seqs, - vocab_size, - device=device, - dtype=torch.float32), - logprobs=torch.rand(expected_num_proposal_seqs, - vocab_size, - device=device, - dtype=torch.float32), - sampled_token_ids=torch.randint( - low=0, - high=vocab_size, - size=(expected_num_proposal_seqs, ), - device=device, - dtype=torch.long), - ) for _ in range(k) - ], True - - seq_group_metadata_list, _, _ = create_batch( - batch_size, - k, - prompt_len=prompt_len, - prev_output_token_len=prev_output_token_len, - ) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k), - seq_ids_with_bonus_token_in_last_step=set()) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k]) - - assert proposals.proposal_lens.shape == torch.Size([batch_size]) - assert proposals.proposal_lens.tolist() == [ - k for _ in range(expected_num_proposal_seqs - 1) - ] + [0 for _ in range(expected_num_no_proposal_seqs)] + [k] - - -@torch.inference_mode() -def test_use_draft_model_runner_advance_step(): - """Verify that draft model runner triggers advance step - when applicable. - """ - seed = 100 - model_name = 'JackFram/llama-68m' - - k = 5 - batch_size = 32 - block_size = 32 - num_gpu_blocks = 2048 // block_size - worker = create_worker( - MultiStepWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - - # Mock "_gpu_advance_step" to raise an exception when called. - exception_secret = "artificial stop" - worker.model_runner._gpu_advance_step = MagicMock() - worker.model_runner._gpu_advance_step.side_effect = ValueError( - exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks) - - # Fallback (should not call) when num_steps=1. - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - num_steps=1) - worker.execute_model(execute_model_req=execute_model_req) - - # Expect exception if _gpu_advance_step is called. 
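# Editor's illustrative sketch of the "tripwire" pattern used above: patching
# an internal method with a MagicMock whose side_effect raises lets the test
# detect whether a code path was reached without needing its real behavior.
# The names below are hypothetical, not vLLM internals.
from unittest.mock import MagicMock

import pytest


class Runner:
    def advance(self):
        return "advanced"

    def run(self, num_steps: int):
        # Only multi-step execution goes through the patched helper.
        return self.advance() if num_steps > 1 else "single step"


runner = Runner()
runner.advance = MagicMock(side_effect=ValueError("artificial stop"))

assert runner.run(num_steps=1) == "single step"   # helper not called
with pytest.raises(ValueError, match="artificial stop"):
    runner.run(num_steps=5)                        # helper called -> tripwire
assert runner.advance.call_count == 1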
- execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - num_steps=k) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - call_args_list = worker.model_runner._gpu_advance_step.call_args_list - assert len(call_args_list) == 1 - - -@torch.inference_mode() -def test_expand_execute_model_request_sync_with_expand_hidden_states(): - """ - In this test we verify that the logic for expanding the - seq_group_metadata_list remains in sync with the expansion logic of - the HiddenStates in _expand_execute_model_request. - """ - k = 5 - batch_size = 16 - seq_with_bonus_token_in_last_step = [1, 3, 8, 10, 13, 15] - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - execute_model_request = ExecuteModelRequest( - seq_group_metadata_list, - previous_hidden_states=HiddenStates( - torch.arange(batch_size), seq_group_metadata_list, - torch.arange(batch_size, 2 * batch_size))) - - expanded_execute_model_request, orig_seq_group_ids = MultiStepWorker.\ - _expand_execute_model_request(execute_model_request, - seq_with_bonus_token_in_last_step) - - all_seq_ids = torch.tensor( - get_all_seq_ids( - expanded_execute_model_request.seq_group_metadata_list)) - ref_expanded_hidden_states = all_seq_ids + batch_size - ref_expanded_hidden_states[orig_seq_group_ids] -= batch_size - - assert (ref_expanded_hidden_states == expanded_execute_model_request. - previous_hidden_states.hidden_states).all().item() diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py deleted file mode 100644 index 8a7c11485681..000000000000 --- a/tests/spec_decode/test_ngram_worker.py +++ /dev/null @@ -1,221 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.top1_proposer import Top1Proposer - -from .utils import create_seq_group_metadata_from_prompts, create_worker - - -def test_ngram_algo_correctness_for_single_no_match(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario cannot find any candidate in one single batch - """ - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [1, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find no candidate - [1, 2, 3, 4, 5, 6, 7], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([1, proposal_len]) - assert 
proposals.proposal_probs.shape[:-1] == torch.Size([1, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([1]) - assert proposals.proposal_lens.tolist() == [0] - - -def test_ngram_algo_correctness_for_batches_not_match_all(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario find some candidate not full in batchs - """ - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [1, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find no candidate - [1, 2, 3, 4, 5, 6, 7], - # shall find candidate 12,13,14,15,16 - [11, 12, 13, 14, 15, 16, 11], - # shall find candidate 23,24,25,26,21 - [21, 21, 22, 23, 24, 25, 26, 21, 22], - # shall find candidate 34,35,36,37,38 - [31, 32, 31, 32, 33, 34, 35, 36, 37, 38, 31, 32, 33], - # shall find no candidate as exceed max_proposal_len - [ - 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 31, 32, 33, 34, 35, 36, 37, - 38, 31, 32, 33 - ], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - for sg in seq_group_metadata_list: - sg.is_prompt = False - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([5, proposal_len]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([5, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([5]) - - # the first sequence has no match so proposal_len should be overwritten to 0 - assert proposals.proposal_lens.tolist( - ) == [0] + [proposal_len for _ in range(3)] + [0] - - for i in range(proposal_len): - assert proposals.proposal_token_ids[0][i] == -1 - assert proposals.proposal_token_ids[1][i] == prompts[1][i + 1] - assert proposals.proposal_token_ids[2][i] == prompts[2][i + 3] - assert proposals.proposal_token_ids[3][i] == prompts[3][i + 5] - assert proposals.proposal_token_ids[4][i] == -1 - - -def test_ngram_algo_correctness_for_batches_match_all(): - """Verify our ngram algo find the right candidate in the prompt - - For the scenario find candidate in all batches - """ - - block_size = 32 - num_gpu_blocks = 2048 // block_size - seed = 100 - model_name = 'JackFram/llama-68m' - vocab_size = 32_000 - device = 'cuda:0' - - ngram_worker = create_worker( - NGramWorker, - model_name, - block_size, - num_gpu_blocks, - seed, - ) - - proposer = Top1Proposer( - worker=ngram_worker, - device=device, - vocab_size=vocab_size, - max_proposal_len=20, - ) - - # set ngram window [0, 3], which is window=1/2/3 - ngram_worker.set_ngram_window_size(1, 3) - - prompts = [ - # shall find candidate 12,13,14,15,16 - [11, 12, 13, 14, 15, 16, 11], - # shall find candidate 23,24,25,26,21 - [21, 21, 22, 23, 24, 25, 26, 21, 22], - # shall find candidate 34,35,36,37,38 - [31, 32, 31, 32, 33, 
34, 35, 36, 37, 38, 31, 32, 33], - ] - - proposal_len = 5 - final_prompt_lens = [len(prompt) + proposal_len for prompt in prompts] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, - num_gpu_blocks, - block_size, - final_prompt_lens=final_prompt_lens) - - # Normally drafter is run on decode requests only; here we check the output - # of the ngram worker as it is the sole proposer that has no forward. - for sg in seq_group_metadata_list: - sg.is_prompt = False - proposals = proposer.get_spec_proposals( - execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=proposal_len), - seq_ids_with_bonus_token_in_last_step=None) - - assert torch.is_tensor(proposals.proposal_token_ids) - assert torch.is_tensor(proposals.proposal_probs) - - assert proposals.proposal_token_ids.shape == torch.Size([3, proposal_len]) - assert proposals.proposal_probs.shape[:-1] == torch.Size([3, proposal_len]) - assert proposals.proposal_lens.shape == torch.Size([3]) - - assert proposals.proposal_lens.tolist() == [proposal_len for _ in range(3)] - - for i in range(proposal_len): - assert proposals.proposal_token_ids[0][i] == prompts[0][i + 1] - assert proposals.proposal_token_ids[1][i] == prompts[1][i + 3] - assert proposals.proposal_token_ids[2][i] == prompts[2][i + 5] diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py deleted file mode 100644 index 55fcf0055747..000000000000 --- a/tests/spec_decode/test_scorer.py +++ /dev/null @@ -1,116 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random - -import pytest -import torch - -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores -from vllm.spec_decode.mqa_scorer import MQAScorer -from vllm.worker.worker import Worker - -from .utils import create_batch, create_worker - - -def create_proposal(propose_lens: list[int], vocab_size: int, - device: str) -> SpeculativeProposals: - batch_size = len(propose_lens) - max_propose_len = max(propose_lens) - proposal_probs = torch.rand((batch_size, max_propose_len, vocab_size), - device=device) - - proposal_token_ids = torch.full((batch_size, max_propose_len), - fill_value=-1, - device=device) - for i in range(batch_size): - proposal_token_ids[i][:propose_lens[i]] = torch.argmax( - proposal_probs[i][:propose_lens[i]], dim=-1) - - propose_lens = torch.tensor(propose_lens, device=device) - return SpeculativeProposals(proposal_token_ids, proposal_probs, - propose_lens) - - -def assert_score_equal(score1: SpeculativeScores, - score2: SpeculativeScores) -> None: - assert torch.allclose(score1.probs, score2.probs) - assert torch.allclose(score1.logprobs, score2.logprobs) - assert torch.equal( - score1.token_ids, - score2.token_ids), f"{score1.token_ids}, {score2.token_ids}" - - -@pytest.mark.parametrize('model_name', ['facebook/opt-125m']) -@pytest.mark.parametrize('batch_size', [1, 2, 4, 8, 16]) -@pytest.mark.parametrize('max_propose_len', [1, 3, 5]) -@pytest.mark.parametrize('mixed_propose_len', [True]) -@pytest.mark.parametrize('device', ['cuda']) -@pytest.mark.parametrize('prefill_chunking', [False, True]) -def test_scorer(model_name: str, batch_size: int, max_propose_len: int, - mixed_propose_len: bool, device: str, - prefill_chunking: bool) -> None: - """ - Compare the batch expansion scorer and mqa scorer 
return the same score. - We test for both queries with the same propose length and different - propose length, as well as mixed prefill-decode batches. - """ - seed = 0 - block_size = 32 - num_gpu_blocks = 2048 // block_size - scorer_worker = create_worker(Worker, model_name, block_size, - num_gpu_blocks, seed) - scorer_worker.model_runner.disable_logprobs = True # accessed by mqa_scorer - scorer_worker.model_runner.sampler.include_gpu_probs_tensor = True - scorer_worker.model_runner.sampler.should_modify_greedy_probs_inplace = True - - vocab_size = scorer_worker.vocab_size - - if not mixed_propose_len: - propose_lens = [max_propose_len] * batch_size - else: - # There must be at least 1 decode request, otherwise - # we have nothing to score (`_run_no_spec`). - non_zero_cnt = random.randint(1, batch_size) - propose_lens = [max_propose_len - ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) - random.shuffle(propose_lens) - - seq_group_metadatalist, _, _ = create_batch(batch_size, - max_propose_len, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks) - - if mixed_propose_len and prefill_chunking and (n_prefills := - batch_size - non_zero_cnt): - prefill, _, _ = create_batch(n_prefills, - None, - prefill_chunk_size=4, - block_size=block_size, - num_gpu_blocks=num_gpu_blocks, - seq_ids=list( - range(batch_size, - batch_size + n_prefills))) - # re-order to guarantee prefill|decode order - target_group_metadatalist = [ - seq_group_metadatalist[i] for i, p in enumerate(propose_lens) - if p > 0 - ] - seq_group_metadatalist = prefill + target_group_metadatalist - propose_lens = [0] * n_prefills + [p for p in propose_lens if p > 0] - - proposals = create_proposal(propose_lens, vocab_size, device) - requests = ExecuteModelRequest(seq_group_metadatalist, - num_lookahead_slots=max_propose_len) - - batch_expansion_scorer = BatchExpansionTop1Scorer(scorer_worker, device, - vocab_size) - batch_expansion_score = batch_expansion_scorer.score_proposals( - requests, proposals) - - mqa_scorer = MQAScorer(scorer_worker, device, vocab_size) - mqa_score = mqa_scorer.score_proposals(requests, proposals) - - assert_score_equal(batch_expansion_score, mqa_score) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py deleted file mode 100644 index 8aceaadff8d3..000000000000 --- a/tests/spec_decode/test_spec_decode_worker.py +++ /dev/null @@ -1,945 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from collections import defaultdict -from types import SimpleNamespace -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sequence import ExecuteModelRequest, SequenceOutput -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer -from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.metrics import (AsyncMetricsCollector, - SpecDecodeWorkerMetrics) -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.spec_decode_worker import (SpecDecodeWorker, - split_num_cache_blocks_evenly) -from vllm.worker.worker import Worker - -from .test_utils import mock_spec_decode_sampler -from .utils import (create_batch, create_sampler_output_list, create_worker, - mock_worker) - - -@pytest.mark.parametrize('k', [1, 
2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_calls_draft_model(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the draft worker with correct - inputs. Everything else is mocked out. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker( - draft_worker, - target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector) - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - - call_args_list = draft_worker.get_spec_proposals.call_args_list - assert len(call_args_list) == 1 - - for args, _ in call_args_list: - actual_execute_model_data = args[0] - assert actual_execute_model_data == execute_model_req - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_batch_expansion_correctly_calls_target_model( - k: int, batch_size: int, acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the target model with correct - inputs with batch expansion. Everything else is mocked out. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) - target_worker = mock_worker(use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - draft_worker, - target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - disable_mqa_scorer=True) - worker.init_device() - - vocab_size = 32_000 - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, prompts, prev_output_tokens = create_batch( - batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - exception_secret = 'artificial stop' - target_worker.execute_model.side_effect = ValueError(exception_secret) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - seen_contexts: list[list[int]] = [] - - call_args_list = target_worker.execute_model.call_args_list - assert len(call_args_list) == 1 - for _, kwargs in call_args_list: - seq_group_metadata_list = kwargs[ - "execute_model_req"].seq_group_metadata_list - - assert len(seq_group_metadata_list) == (k + 1) * batch_size - for seq_group_metadata in seq_group_metadata_list: - for seq_data in seq_group_metadata.seq_data.values(): - seen_contexts.append(seq_data.get_token_ids()) - - expected_seen_contexts: list[list[int]] = [] - - for prompt, prev_generated, draft_tokens in zip( - prompts, prev_output_tokens, proposal_token_ids.tolist()): - - for i in range(len(draft_tokens) + 1): - expected_seen_contexts.append(prompt + prev_generated + - draft_tokens[:i]) - - seen_contexts.sort() - expected_seen_contexts.sort() - assert expected_seen_contexts == seen_contexts - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_calls_spec_decode_sampler(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker calls the rejection sampler with - correct inputs. Everything else is mocked out. 
- """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - exception_secret = 'artificial stop' - - spec_decode_sampler.side_effect = ValueError(exception_secret) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - assert len(spec_decode_sampler.call_args_list) == 1 - _, kwargs = spec_decode_sampler.call_args_list[0] - actual = SimpleNamespace(**kwargs) - - assert torch.equal(actual.bonus_token_ids, - target_token_ids.reshape(batch_size, k + 1)[:, -1:]) - assert torch.equal(actual.target_with_bonus_probs, - target_token_probs.reshape(batch_size, k + 1, -1)) - assert torch.equal(actual.draft_token_ids, proposal_token_ids) - assert torch.equal(actual.draft_probs, proposal_probs) - - -@pytest.mark.parametrize('k', [1, 2, 6]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_correctly_formats_output(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker formats sampler output correctly. - Everything else is mocked out. 
- """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - spec_decode_sampler_output = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k + 1), - dtype=torch.int64, - device='cuda') - for i in range(batch_size): - minimum_accepted_tokens = 1 - spec_decode_sampler_output[i][ - -random.randint(minimum_accepted_tokens, k + 1):] = -1 - - spec_decode_sampler.return_value = spec_decode_sampler_output - output = worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - - expected_output = create_sampler_output_list( - token_ids=spec_decode_sampler_output.transpose(0, 1), - probs=[None for _ in range(k + 1)], - logprobs=[None for _ in range(k + 1)]) - - seq_ids = [ - next(iter(seq_group_metadata.seq_data.keys())) - for seq_group_metadata in seq_group_metadata_list - ] - actual_output_by_seq: dict[int, list[SequenceOutput]] = { - seq_id: [] - for seq_id in seq_ids - } - expected_output_by_seq: dict[int, list[SequenceOutput]] = { - seq_id: [] - for seq_id in seq_ids - } - - for step in output: - for seq_group in step: - for sample in seq_group.samples: - seq_id = sample.parent_seq_id - actual_output_by_seq[seq_id].append(sample) - - for step in expected_output: - for seq_group in step: - for sample in seq_group.samples: - seq_id = sample.parent_seq_id - expected_output_by_seq[seq_id].append(sample) - - all_seen_seq_ids = set( - list(actual_output_by_seq.keys()) + - list(expected_output_by_seq.keys())) - for seq_id in all_seen_seq_ids: - actual_by_step = actual_output_by_seq[seq_id] - expected_by_step = expected_output_by_seq[seq_id] - - for i in range(k + 1): - if i >= len(actual_by_step): - assert expected_by_step[i].output_token == -1 - continue - assert actual_by_step[i].output_token == expected_by_step[ - i].output_token - - -@pytest.mark.parametrize('k', [1, 2]) -@pytest.mark.parametrize('batch_size', [1]) 
-@pytest.mark.parametrize('returns_metrics', [True, False]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker collects metrics. - """ - vocab_size = 32_000 - - draft_worker = mock_worker(cls=MultiStepWorker, - vocab_size=vocab_size, - use_spec=False) - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector) - worker.init_device() - - proposal_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k), - dtype=torch.int64, - device='cuda') - proposal_probs = torch.rand(batch_size, - k, - vocab_size, - dtype=torch.float32, - device='cuda') - - proposal_lens = torch.ones(batch_size, dtype=torch.int64, - device='cuda') * k - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - - draft_worker.get_spec_proposals.return_value = SpeculativeProposals( - proposal_token_ids=proposal_token_ids, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - spec_decode_sampler_output = torch.randint(low=0, - high=vocab_size, - size=(batch_size, k + 1), - dtype=torch.int64, - device='cuda') - for i in range(batch_size): - minimum_accepted_tokens = 1 - spec_decode_sampler_output[i][ - -random.randint(minimum_accepted_tokens, k + 1):] = -1 - spec_decode_sampler.return_value = spec_decode_sampler_output - - mock_rejsample_metrics = MagicMock( - spec=SpecDecodeWorkerMetrics) if returns_metrics else None - metrics_collector.maybe_collect_rejsample_metrics.return_value = ( - mock_rejsample_metrics) - - output = worker.execute_model(execute_model_req=ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k)) - assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics - - call_args_list = ( - metrics_collector.maybe_collect_rejsample_metrics.call_args_list) - assert len(call_args_list) == 1 - args, kwargs = call_args_list[0] - assert args[0] == k or kwargs.get('k', -1) == k - - -@pytest.mark.parametrize('k', [0]) -@pytest.mark.parametrize('batch_size', [1, 2, 32]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_k_equals_zero(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify that the SpecDecodeWorker calls the draft and target workers - when k is zero. This happens during prefill. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - sampler_output = MagicMock(spec=SamplerOutput) - sampler_output.hidden_states = None - target_worker.execute_model.return_value = [sampler_output] - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prev_output_token_len=0) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - out = worker.execute_model(execute_model_req=execute_model_req) - - assert len(out) == 1, f"expected only one token output when {k=}" - assert out[0].sampled_token_probs is None, ( - "expect gpu tensor references to be None") - assert out[ - 0].sampled_token_ids is None, "expect gpu tensor references to be None" - - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - - -@pytest.mark.parametrize('k', [0, 5]) -@pytest.mark.parametrize('batch_size', [0]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_empty_input_batch(k: int, batch_size: int, - acceptance_sampler_method: str): - """Verify that the SpecDecodeWorker calls the draft and target workers - when the input batch is empty. This can happen if the engine communicates - to the workers information without scheduling a batch. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - sampler_output = MagicMock(spec=SamplerOutput) - sampler_output.hidden_states = None - target_worker.execute_model.return_value = [sampler_output] - - draft_worker.device = 'cuda' - target_worker.device = 'cuda' - - set_random_seed(1) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - prev_output_token_len=0) - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, num_lookahead_slots=k) - - out = worker.execute_model(execute_model_req=execute_model_req) - - assert len(out) == 1, f"expected only one token output when {k=}" - assert out[0].sampled_token_probs is None, ( - "expect gpu tensor references to be None") - assert out[ - 0].sampled_token_ids is None, "expect gpu tensor references to be None" - - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - - -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@pytest.mark.skip_global_cleanup -def test_init_device(acceptance_sampler_method: str): - """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as - well as other GPU initialization. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False) - target_worker = mock_worker(use_spec=False) - spec_decode_sampler = mock_spec_decode_sampler(acceptance_sampler_method) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - worker = SpecDecodeWorker( - proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=spec_decode_sampler, - disable_logprobs=False, - metrics_collector=metrics_collector, - ) - worker.init_device() - - draft_worker.init_device.assert_called_once() - - target_worker.init_device.assert_called_once() - - metrics_collector.init_tensors.assert_called_once() - spec_decode_sampler.init_tensors.assert_called_once() - - -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@torch.inference_mode() -def test_initialize_cache(acceptance_sampler_method): - """Verify SpecDecodeWorker invokes initialize_cache on proposer/scorer - workers. - """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - worker = SpecDecodeWorker(proposer_worker=draft_worker, - scorer_worker=target_worker, - spec_decode_sampler=mock_spec_decode_sampler( - acceptance_sampler_method), - metrics_collector=metrics_collector) - - kwargs = {"num_gpu_blocks": 1024, "num_cpu_blocks": 1023} - worker.initialize_cache(**kwargs) - - draft_worker.initialize_cache.assert_called_once_with(**kwargs) - target_worker.initialize_cache.assert_called_once_with(**kwargs) - - -@pytest.mark.parametrize('available_gpu_blocks', [1, 1024]) -@pytest.mark.parametrize('available_cpu_blocks', [500]) -@pytest.mark.parametrize('target_cache_block_size_bytes', [2 * 2 * 4096]) -@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.parametrize("acceptance_sampler_method", - ["rejection_sampler", "typical_acceptance_sampler"]) -@pytest.mark.skip_global_cleanup -def test_determine_num_available_blocks(available_gpu_blocks: int, - available_cpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int, - acceptance_sampler_method: str): - """Verify SpecDecodeWorker correctly profiles num available GPU blocks. - Specifically, it should run profiling in the scorer worker, and then evenly - split the blocks between proposer and scorer worker. 
- """ - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - - target_worker.determine_num_available_blocks.return_value = ( - available_gpu_blocks, available_cpu_blocks) - target_worker.get_cache_block_size_bytes.return_value = ( - target_cache_block_size_bytes) - draft_worker.get_cache_block_size_bytes.return_value = draft_kv_size_bytes - - worker = SpecDecodeWorker( - draft_worker, target_worker, - mock_spec_decode_sampler(acceptance_sampler_method), metrics_collector) - - num_gpu_blocks, num_cpu_blocks = worker.determine_num_available_blocks() - - target_worker.determine_num_available_blocks.assert_called_once() - assert num_cpu_blocks == available_cpu_blocks - - assert num_gpu_blocks == split_num_cache_blocks_evenly( - target_cache_block_size_bytes, draft_kv_size_bytes, - available_gpu_blocks) - - -@pytest.mark.parametrize('available_gpu_blocks', - list(range(20)) + [1024, 1024**2]) -@pytest.mark.parametrize('target_cache_block_size_bytes', - [2 * 2 * 4096, 2 * 2 * 8192]) -@pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096]) -@pytest.mark.skip_global_cleanup -def test_split_num_cache_blocks_evenly(available_gpu_blocks: int, - target_cache_block_size_bytes: int, - draft_kv_size_bytes: int): - """Verify split_num_cache_blocks_evenly does not exceed original memory - allocation in bytes. - """ - num_blocks = split_num_cache_blocks_evenly(target_cache_block_size_bytes, - draft_kv_size_bytes, - available_gpu_blocks) - assert (num_blocks * target_cache_block_size_bytes) + ( - num_blocks * draft_kv_size_bytes) <= (available_gpu_blocks * - target_cache_block_size_bytes) - - -@torch.inference_mode() -def test_populate_seq_ids_with_bonus_tokens(): - """ - Verify that a call to _create_output_sampler_list correctly updates - seq_with_bonus_token_in_last_step. - - seq_with_bonus_token_in_last_step is an internal data structure in - SpecDecodeWorker that tracks the sequence IDs which are assigned bonus - tokens by the target model in their last forward pass. This state is - maintained only for models relying on the KV cache, such as those using - the MultiStepWorker. - """ - batch_size = 10 - k = 5 - vocab_size = 10000 - num_sequences_with_bonus_tokens = 5 - target_worker = mock_worker(vocab_size=vocab_size, use_spec=False) - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)] - target_worker.device = 'cuda' - - set_random_seed(1) - draft_worker = mock_worker(cls=MultiStepWorker) - draft_worker.device = 'cuda' - # The sequence_ids attached to each sequence in the batch. 
- # The sequence at index i has seq_id assigned_seq_ids[i] - assigned_seq_ids = list(range(batch_size)) - seq_group_metadata_list, _, _ = create_batch(batch_size, - k, - seq_ids=assigned_seq_ids, - prev_output_token_len=10) - target_token_logprobs = torch.rand(batch_size, (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - accepted_token_ids = torch.randint(low=0, - high=vocab_size, - size=(batch_size, (k + 1)), - dtype=torch.int64, - device='cuda') - expected_request_id_seq_ids_mapping: dict[str, set[int]] = defaultdict(set) - for seq_group_metadata in seq_group_metadata_list: - for seq_id in seq_group_metadata.seq_data: - expected_request_id_seq_ids_mapping[ - seq_group_metadata.request_id].add(seq_id) - # Generate a random sample of sequence indexes with bonus tokens - seq_indexes_with_bonus_tokens = random.sample( - range(batch_size), num_sequences_with_bonus_tokens) - # Create a mask that is True for indices in seq_indexes_with_bonus_tokens - mask = torch.ones(batch_size, dtype=torch.bool, device='cuda') - mask[seq_indexes_with_bonus_tokens] = False - # Set the last token ID to -1 for all indices not in - # seq_indexes_with_bonus_tokens to indicate the lack of bonus token in - # those indices. - accepted_token_ids[mask, -1:] = -1 - worker = SpecDecodeWorker(draft_worker, - target_worker, - mock_spec_decode_sampler("rejection_sampler"), - disable_logprobs=False, - metrics_collector=metrics_collector) - # Initialize _seq_with_bonus_token_in_last_step with a set of sequence IDs. - # This set includes all sequence IDs in the batch as well as an additional - # `num_extra_sequence_ids` sequence IDs. Note that the sequence IDs are in - # the range [0, batch_size + num_extra_sequence_ids). - num_extra_sequence_ids = 10 - worker._seq_with_bonus_token_in_last_step = set( - range(batch_size + num_extra_sequence_ids)) - worker._create_output_sampler_list( - seq_group_metadata_list=seq_group_metadata_list, - accepted_token_ids=accepted_token_ids, - target_logprobs=target_token_logprobs, - prompt_logprobs=None, - k=k, - stage_times=(0, 0, 0)) - # Verify that _seq_with_bonus_token_in_last_step contains the following: - # 1. Sequence IDs that were already present in - # _seq_with_bonus_token_in_last_step but were not part of the current - # batch are retained. - # 2. Of the sequence IDs present in the current batch, only those with a - # bonus token are retained in _seq_with_bonus_token_in_last_step. - # Sequence IDs that are present in the current batch but do not have - # bonus tokens are removed from _seq_with_bonus_token_in_last_step. - expected_seq_ids_with_bonus_tokens = \ - set([assigned_seq_ids[i] for i in seq_indexes_with_bonus_tokens]) - additional_sequence_ids = \ - set(range(batch_size, batch_size + num_extra_sequence_ids)) - assert worker._seq_with_bonus_token_in_last_step == \ - expected_seq_ids_with_bonus_tokens.union(additional_sequence_ids) - assert worker._request_id_seq_id_mapping == \ - expected_request_id_seq_ids_mapping - - -@torch.inference_mode() -def test_handle_finished_requests(): - """ - Test to verify that finished request IDs are appropriately processed to - update the internal state of the SpecDecodeWorker. - - This test initializes the SpecDecodeWorker with mock data, marks certain - requests as finished, and ensures that the corresponding sequence IDs are - correctly removed from the internal mappings. 
- """ - batch_size = 32 - k = 3 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(draft_worker, target_worker, - mock_spec_decode_sampler("rejection_sampler"), - metrics_collector) - # Initialize the request_id_seq_id_mapping mapping dict with a few fake - # request ids and corresponding sequence ids. - worker._request_id_seq_id_mapping = \ - {'request-1': {1,2,3}, 'request-2': {4,5,6,7}, - 'request-3': {8,9}, 'request-4': {10,11}} - # Initialize seq_with_bonus_token_in_last_step with a few fake - # sequence ids. - worker._seq_with_bonus_token_in_last_step = {1, 4, 5, 8, 9, 10} - exception_secret = 'artificial stop' - draft_worker.get_spec_proposals.side_effect = ValueError(exception_secret) - - seq_group_metadata_list, _, _ = create_batch(batch_size, k) - # Mark requests with ids request-1 and request-3 as finished. - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=seq_group_metadata_list, - num_lookahead_slots=k, - finished_requests_ids=['request-1', 'request-3']) - - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - # Verify that request-1 and request-3 are removed from - # request_id_seq_id_mapping - assert worker._request_id_seq_id_mapping == \ - {'request-2': {4,5,6,7}, 'request-4': {10,11}} - # Verify that all sequence ids corresponding to 'request-1' - # and 'request-3' are removed from seq_with_bonus_token_in_last_step. - assert worker._seq_with_bonus_token_in_last_step == \ - {4,5,10} - - -@pytest.mark.parametrize('k', [3]) -@pytest.mark.parametrize('batch_size', [2, 32]) -@pytest.mark.parametrize("batch_composition", - ["prefill_only", "decode_only", "mixed"]) -@torch.inference_mode() -def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): - """ - Verify SpecDecodeWorker calls match the expected flow. - """ - vocab_size = 32_000 - draft_worker = mock_worker(cls=MultiStepWorker) - target_worker = mock_worker() - metrics_collector = MagicMock(spec=AsyncMetricsCollector) - worker = SpecDecodeWorker(draft_worker, - target_worker, - mock_spec_decode_sampler("rejection_sampler"), - disable_logprobs=False, - metrics_collector=metrics_collector) - exception_secret = 'artificial stop' - worker.scorer = mock_worker(BatchExpansionTop1Scorer) - worker.scorer.score_proposals.side_effect = ValueError(exception_secret) - - # Create batch with combination of terminal/non-terminal prefill chunks - # and decodes (different seq_ids). - decodes, _, _ = create_batch(batch_size, k) - # Pre-chunking here, get 'batch_size' chunks. - prefill, _, _ = create_batch(batch_size, - k, - prefill_chunk_size=4, - seq_ids=list(range(batch_size, - batch_size * 2))) - - if batch_composition == "prefill_only": - n_prefills = batch_size - elif batch_composition == "decode_only": - n_prefills = 0 - else: - n_prefills = random.randint(1, batch_size - 1) - n_decodes = batch_size - n_prefills - - prefill = random.sample(prefill, n_prefills) - decodes = random.sample(decodes, n_decodes) - target_group_metadata_list = prefill + decodes - execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=target_group_metadata_list, - # For prefill only batches we expect num_lookahead_slots = 0. 
- num_lookahead_slots=k if n_decodes > 0 else 0) - - target_token_ids = torch.randint(low=0, - high=vocab_size, - size=(1, batch_size * (k + 1)), - dtype=torch.int64, - device='cuda') - target_token_probs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_token_logprobs = torch.rand(1, - batch_size * (k + 1), - vocab_size, - dtype=torch.float32, - device='cuda') - target_output = create_sampler_output_list(target_token_ids, - target_token_probs, - target_token_logprobs) - - target_worker.execute_model.return_value = [target_output[0]] - - if not len(decodes): - worker.execute_model(execute_model_req=execute_model_req) - # no spec run (prefill only) - draft_worker.execute_model.assert_called_once_with(execute_model_req) - target_worker.execute_model.assert_called_once_with(execute_model_req) - else: - # Decode-only run OR mixed batch, scorer call fails (it's mocked) - with pytest.raises(ValueError, match=exception_secret): - worker.execute_model(execute_model_req=execute_model_req) - # but first draft still counted - assert draft_worker.get_spec_proposals.call_count == 1 - - -def test_correctly_load_weight_for_eagle(): - """ - Verify SpecDecodeWorker loads lm_head weight for eagle correctly. - """ - seed = 100 - block_size = 32 - num_gpu_blocks = 8096 // block_size - target_worker = create_worker( - Worker, - "JackFram/llama-68m", - block_size, - num_gpu_blocks, - seed, - ) - draft_worker = create_worker( - MultiStepWorker, - "abhigoyal/vllm-eagle-llama-68m-random", - block_size, - num_gpu_blocks, - seed, - model_runner_cls=TP1DraftModelRunner, - ) - - spec_decode_sampler = mock_spec_decode_sampler("rejection_sampler") - worker = SpecDecodeWorker(draft_worker, - target_worker, - spec_decode_sampler, - disable_logprobs=False) - worker.proposer_worker.maybe_load_lm_head_weight( - target_worker.model_runner.model.lm_head.weight.data) - assert torch.allclose( - worker.proposer_worker.worker.model_runner.model.lm_head.weight.data, - worker.scorer_worker.model_runner.model.lm_head.weight.data) diff --git a/tests/spec_decode/test_utils.py b/tests/spec_decode/test_utils.py deleted file mode 100644 index 9cfc618b9d95..000000000000 --- a/tests/spec_decode/test_utils.py +++ /dev/null @@ -1,150 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from unittest.mock import MagicMock - -import pytest -import torch - -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.sampler import _get_ranks -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.sequence import SequenceGroupMetadata, get_all_seq_ids -from vllm.spec_decode.util import (get_sampled_token_logprobs, - split_batch_by_proposal_len) - - -def test_get_all_seq_ids(): - """Verify get_all_seq_ids extracts all seq ids. 
- """ - expected_seq_ids = list(range(10)) + list(range(100, 110)) - - seq_group_metadata_list = [ - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - seq_data={ - seq_id: MagicMock(), - }, - sampling_params=MagicMock(), - block_tables={ - seq_id: MagicMock(), - }, - lora_request=None, - ) for seq_id in expected_seq_ids - ] - - actual_seq_ids = get_all_seq_ids(seq_group_metadata_list) - assert actual_seq_ids == expected_seq_ids - - -@pytest.fixture -def fake_sequence_group_metadata(): - seq_ids = list(range(3)) - return [ - SequenceGroupMetadata( - request_id=str(i), - is_prompt=True, - seq_data={ - i: MagicMock(), - }, - sampling_params=MagicMock(), - block_tables={ - i: MagicMock(), - }, - lora_request=None, - ) for i in seq_ids - ] - - -def test_filter_zero_length_proposals(fake_sequence_group_metadata): - proposal_lens = [0, 1, 0] - _, (filtered_groups, - indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - expected_groups = [ - fake_sequence_group_metadata[0], fake_sequence_group_metadata[2] - ] - expected_indices = [0, 2] - - assert filtered_groups == expected_groups - assert indices == expected_indices - - -def test_filter_non_zero_length_proposals(fake_sequence_group_metadata): - proposal_lens = [0, 1, 2] - (filtered_groups, - indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - expected_groups = [ - fake_sequence_group_metadata[1], fake_sequence_group_metadata[2] - ] - expected_indices = [1, 2] - - assert filtered_groups == expected_groups - assert indices == expected_indices - - -def test_empty_inputs(): - _, (filtered_groups, indices) = split_batch_by_proposal_len([], []) - - assert filtered_groups == [] - assert indices == [] - - -def test_all_zero_with_non_zero_filter(fake_sequence_group_metadata): - proposal_lens = [0, 0, 0] - (filtered_groups, - indices), _ = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - assert filtered_groups == [] - assert indices == [] - - -def test_all_non_zero_with_zero_filter(fake_sequence_group_metadata): - proposal_lens = [1, 1, 1] - _, (filtered_groups, - indices) = split_batch_by_proposal_len(fake_sequence_group_metadata, - proposal_lens) - - assert filtered_groups == [] - assert indices == [] - - -def mock_spec_decode_sampler(acceptance_sampler_method): - """ - Returns either a RejectionSampler or TypicalAcceptanceSampler - object depending on whether acceptance_sampler_method is - 'rejection_sampler' or 'typical_acceptance_sampler' respectively. - """ - if acceptance_sampler_method == "rejection_sampler": - sampler = MagicMock(spec=RejectionSampler) - sampler.token_id_dtype = torch.int64 - return sampler - elif acceptance_sampler_method == "typical_acceptance_sampler": - sampler = MagicMock(spec=TypicalAcceptanceSampler) - sampler.token_id_dtype = torch.int64 - return sampler - else: - raise ValueError(f"Invalid sampler name {acceptance_sampler_method}") - - -def test_get_sampled_token_logprobs(): - """Verify get_sampled_token_logprobs returns consistent rankings - with regular get_ranks when probabilities match exactly. 
- """ - logprob_tensor = torch.tensor( - [[[-.1, -.1]] * 2]) # shape (num_steps, batch_size, vocab_size) - sampled_token_tensor = torch.tensor([[1, - 0]]) # shape (num_steps, batch_size) - ranks_spec_dec, _ = get_sampled_token_logprobs(logprob_tensor, - sampled_token_tensor) - - ranks_regular = _get_ranks(logprob_tensor.reshape((2, -1)), - sampled_token_tensor.reshape(-1)) - - assert torch.equal(ranks_spec_dec.reshape(-1), ranks_regular) diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py deleted file mode 100644 index 1733f66feec0..000000000000 --- a/tests/spec_decode/utils.py +++ /dev/null @@ -1,290 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Sequence as GenericSequence -from itertools import count -from typing import Callable, Optional, TypeVar, Union -from unittest.mock import MagicMock - -import torch - -from vllm.engine.arg_utils import EngineArgs -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.utils import set_random_seed -from vllm.sampling_params import SamplingParams -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - SequenceData, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import get_distributed_init_method, get_ip, get_open_port -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.model_runner import ModelRunner -from vllm.worker.worker import Worker - -T = TypeVar("T", bound=Worker) - - -def round_up_to_next_block(seq_len: int, block_size: int) -> int: - return (seq_len + block_size - 1) // block_size - - -def mock_worker(cls=None, - vocab_size: int = 30_000, - max_model_len: int = 2048, - rank: int = 0, - use_spec: bool = True) -> MagicMock: - if cls is None: - cls = Worker - - spec = cls if use_spec else None - - worker = MagicMock(spec=spec) - worker.vocab_size = vocab_size - worker.max_model_len = max_model_len - worker.rank = rank - worker.device = 'cuda:0' - return worker - - -def patch_execute_model_with_seeds(worker: Worker, rand_seeds: list[int]): - seed_iter = iter(rand_seeds) - original_execute_model = worker.execute_model - - def new_execute_model(*args, **kwargs): - result = original_execute_model(*args, **kwargs) - set_random_seed(next(seed_iter)) - return result - - return new_execute_model - - -def zero_kv_cache(cache_engine: list[CacheEngine]): - assert cache_engine[0].gpu_cache - for key_blocks, value_blocks in cache_engine[0].gpu_cache: - key_blocks.zero_() - value_blocks.zero_() - - -def create_worker(cls: Callable[..., T], - model_name: str, - block_size: int, - num_gpu_blocks: int, - seed: int, - is_driver_worker: bool = True, - enforce_eager: bool = True, - model_runner_cls: Optional[ModelRunner] = None, - dtype: Optional[str] = "auto") -> T: - engine_args = EngineArgs( - model=model_name, - seed=seed, - block_size=block_size, - enforce_eager=enforce_eager, - dtype=dtype, - ) - engine_config = engine_args.create_engine_config() - - distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) - - worker = cls( - vllm_config=engine_config, - local_rank=0, - rank=0, - distributed_init_method=distributed_init_method, - is_driver_worker=is_driver_worker, - model_runner_cls=model_runner_cls, - ) - - worker.init_device() - worker.load_model() - - engine_config.cache_config.num_gpu_blocks = num_gpu_blocks - engine_config.cache_config.num_cpu_blocks = 0 - worker.initialize_cache( - 
num_gpu_blocks=engine_config.cache_config.num_gpu_blocks, - num_cpu_blocks=engine_config.cache_config.num_cpu_blocks) - - return worker - - -def create_seq_group_metadata_from_prompts( - prompts: list[list[int]], - num_gpu_blocks: int, - block_size: int, - final_prompt_lens: list[int], - continuations: Optional[list[list[int]]] = None, - seq_ids: Optional[list[int]] = None, -) -> list[SequenceGroupMetadata]: - - if continuations is None: - continuations = [[] for _ in prompts] - - if seq_ids is None: - seq_ids = list(i for i, _ in enumerate(prompts)) - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = { - i: [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(final_len, block_size)) - ] - for i, final_len in enumerate(final_prompt_lens) - } - - seq_grou_metadata_list = [] - for i, (prompt_token_ids, - cont_token_ids) in enumerate(zip(prompts, continuations)): - data = SequenceData.from_seqs(prompt_token_ids, cont_token_ids) - data.update_num_computed_tokens( - len(prompt_token_ids) + len(cont_token_ids) - 1) - seq_data = {i: data} - seq_grou_metadata_list.append( - SequenceGroupMetadata( - request_id=str(i), - is_prompt=len(cont_token_ids) == 0, - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations[i][:]}, - )) - return seq_grou_metadata_list - - -def create_chunked_seq_group_metadata_from_prompt( - prompt: list[int], - num_gpu_blocks: int, - chunk_size: int, - block_size: int, - seq_id: Optional[int] = None) -> list[SequenceGroupMetadata]: - - if seq_id is None: - seq_id = 0 - - free_gpu_blocks = list(range(num_gpu_blocks)) - - block_allocations = [ - free_gpu_blocks.pop() - for _ in range(round_up_to_next_block(len(prompt), block_size)) - ] - - seq_group_metadata_list = [] - for i, idx in enumerate(range(0, len(prompt), chunk_size)): - chunk_ids = prompt[idx:idx + chunk_size] - data = SequenceData.from_seqs(prompt) - data.update_num_computed_tokens(idx) - seq_data = {i: data} - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=str(seq_id), - is_prompt=True, - do_sample=idx + chunk_size >= len(prompt), # terminal chunk - seq_data=seq_data, - sampling_params=SamplingParams(temperature=0.0), - block_tables={i: block_allocations}, - token_chunk_size=len(chunk_ids))) - return seq_group_metadata_list - - -def assert_logprobs_dict_allclose( - actual_logprobs: list[dict[int, Logprob]], - expected_logprobs: list[dict[int, Logprob]]) -> None: - for single_step_actual_logprobs, single_step_expected_logprobs in zip( - actual_logprobs, expected_logprobs): - assert set(single_step_actual_logprobs.keys()) == set( - single_step_expected_logprobs.keys()) - for token_id in single_step_actual_logprobs: - actual = torch.tensor( - single_step_actual_logprobs[token_id].logprob) - expected = torch.tensor( - single_step_expected_logprobs[token_id].logprob) - torch.testing.assert_close(actual, expected) - - -def create_sampler_output_list( - token_ids: torch.Tensor, - probs: GenericSequence[Optional[torch.Tensor]], - logprobs: GenericSequence[Optional[torch.Tensor]], - seq_ids: Optional[list[int]] = None) -> list[SamplerOutput]: - num_steps, batch_size = token_ids.shape - token_ids_by_step = token_ids.tolist() - - if seq_ids is None: - seq_ids = list(range(batch_size)) - - return [ - SamplerOutput(outputs=[ - CompletionSequenceGroupOutput( - samples=[ - SequenceOutput( - output_token=token_id, - parent_seq_id=seq_ids[seq_index], - logprobs={token_id: Logprob(0)}, - ) - ], - prompt_logprobs=None, - ) for 
seq_index, token_id in enumerate(token_ids_by_step[step]) - ], - sampled_token_probs=probs[step], - logprobs=logprobs[step], - sampled_token_ids=token_ids[step]) - for step in range(num_steps) - ] - - -def create_batch(batch_size, - k, - prompt_len: Union[int, list[int]] = 10, - prev_output_token_len: int = 10, - seq_ids: Optional[list[int]] = None, - num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None, - prefill_chunk_size: Optional[int] = None): - if block_size is None: - block_size = 8 - - if num_gpu_blocks is None: - num_gpu_blocks = 2048 // block_size - - iterator = count() - - if isinstance(prompt_len, int): - prompt_lens = [prompt_len for _ in range(batch_size)] - else: - prompt_lens = prompt_len - - prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - - if prefill_chunk_size: - # Create a batch of chunked prompts. - if not seq_ids: - seq_ids = list(range(len(prompts))) - seq_group_metadata_list = [] - for p, sid in zip(prompts, seq_ids): - seq_group_metadata_list += \ - create_chunked_seq_group_metadata_from_prompt( - p, num_gpu_blocks, prefill_chunk_size, block_size, sid) - seq_group_metadata_list = seq_group_metadata_list[:batch_size] - prev_output_tokens = [] - else: - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) - return seq_group_metadata_list, prompts, prev_output_tokens - - -def maybe_enable_chunked_prefill(prefill_chunk_size, llm_kwargs): - if prefill_chunk_size > 0: - llm_kwargs.update( - **{ - "enable_chunked_prefill": True, - "max_num_batched_tokens": prefill_chunk_size, - "max_num_seqs": prefill_chunk_size - }) - else: - llm_kwargs["enable_chunked_prefill"] = False diff --git a/tests/test_sequence.py b/tests/test_sequence.py index a782a3bf7716..c734c8514a6d 100644 --- a/tests/test_sequence.py +++ b/tests/test_sequence.py @@ -29,7 +29,6 @@ def test_sampler_output_initialization(sampler_output, sample_outputs): assert len(sampler_output) == len(sample_outputs) assert sampler_output.sampled_token_probs is None assert sampler_output.sampled_token_ids is None - assert sampler_output.spec_decode_worker_metrics is None def test_sampler_output_getitem(sampler_output, sample_outputs): diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 7a7ba346a719..39515d710e81 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -40,12 +40,6 @@ def test_unsupported_configs(monkeypatch): with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - with pytest.raises(NotImplementedError): - AsyncEngineArgs( - model=MODEL, - kv_cache_dtype="fp8", - ).create_engine_config() - with pytest.raises(NotImplementedError): AsyncEngineArgs( model=MODEL, diff --git a/tools/mypy.sh b/tools/mypy.sh index 77d342da1ec8..af4c61233abd 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -32,6 +32,5 @@ run_mypy vllm/lora run_mypy vllm/model_executor run_mypy vllm/plugins run_mypy vllm/prompt_adapter -run_mypy vllm/spec_decode run_mypy vllm/worker run_mypy vllm/v1 diff --git a/vllm/config.py b/vllm/config.py index 7ae9b1b7fd02..8383a663c75e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2536,8 +2536,6 @@ def __post_init__(self): SpeculativeMethod = Literal["ngram", 
"eagle", "eagle3", "medusa", "mlp_speculator", "draft_model", "deepseek_mtp"] -SpeculativeAcceptanceMethod = Literal["rejection_sampler", - "typical_acceptance_sampler"] @config @@ -2560,13 +2558,6 @@ class SpeculativeConfig: If using `ngram` method, the related configuration `prompt_lookup_max` and `prompt_lookup_min` should be considered.""" - acceptance_method: SpeculativeAcceptanceMethod = "rejection_sampler" - """The method to use for accepting draft tokens:\n - - "rejection_sampler" maps to `RejectionSampler`.\n - - "typical_acceptance_sampler" maps to `TypicalAcceptanceSampler`. - - If using `typical_acceptance_sampler`, the related configuration - `posterior_threshold` and `posterior_alpha` should be considered.""" draft_tensor_parallel_size: Optional[int] = None """The degree of the tensor parallelism for the draft model. Can only be 1 or the same as the target model's tensor parallel size.""" @@ -2593,9 +2584,6 @@ class SpeculativeConfig: will use the default version.""" # Advanced control - disable_mqa_scorer: bool = False - """Disable the MQA scorer and fall back to batch expansion for scoring - proposals.""" disable_by_batch_size: Optional[int] = None """Disable speculative decoding for new incoming requests when the number of enqueued requests is larger than this value, if provided.""" @@ -2608,16 +2596,6 @@ class SpeculativeConfig: """Minimum size of ngram token window when using Ngram proposer, if provided. Defaults to 1.""" - # Typical acceptance sampler configuration - posterior_threshold: Optional[float] = None - """A threshold value that sets a lower bound on the posterior probability - of a token in the target model for it to be accepted. This threshold is - used only when we use the `TypicalAcceptanceSampler` for token acceptance. - """ - posterior_alpha: Optional[float] = None - """Scaling factor for entropy-based threshold, applied when using - `TypicalAcceptanceSampler`.""" - speculative_token_tree: Optional[str] = None """Specifies the tree structure for speculative token generation. """ @@ -2795,8 +2773,8 @@ def __post_init__(self): elif (self.draft_model_config.hf_config.model_type == "mlp_speculator"): self.method = "mlp_speculator" - elif (self.draft_model_config.hf_config.model_type == - "deepseek_mtp"): + elif (self.draft_model_config.hf_config.model_type + in ("deepseek_mtp", "mimo_mtp")): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: logger.warning( @@ -2806,6 +2784,11 @@ def __post_init__(self): ) else: self.method = "draft_model" + raise NotImplementedError( + "Speculative decoding with draft model is not " + "supported yet. Please consider using other " + "speculative decoding methods such as ngram, medusa, " + "eagle, or deepseek_mtp.") # Replace hf_config for EAGLE draft_model if self.method in ("eagle", "eagle3"): @@ -2864,12 +2847,6 @@ def __post_init__(self): self.target_parallel_config, self.draft_tensor_parallel_size)) - if self.acceptance_method == "typical_acceptance_sampler": - if self.posterior_threshold is None: - self.posterior_threshold = 0.09 - if self.posterior_alpha is None: - self.posterior_alpha = 0.3 - @staticmethod def _maybe_override_draft_max_model_len( speculative_max_model_len: Optional[int], @@ -2975,30 +2952,6 @@ def _verify_args(self) -> Self: if self.draft_model_config: self.draft_model_config.verify_with_parallel_config( self.draft_parallel_config) - # Validate and set draft token acceptance related settings. - - if self.acceptance_method is None: - raise ValueError("acceptance_method is not set. 
" - "Expected values are rejection_sampler or " - "typical_acceptance_sampler.") - - if (self.acceptance_method != 'rejection_sampler' - and self.acceptance_method != 'typical_acceptance_sampler'): - raise ValueError( - "Expected acceptance_method to be either " - "rejection_sampler or typical_acceptance_sampler. Instead it " - f"is {self.acceptance_method}") - - if self.acceptance_method == "typical_acceptance_sampler" and ( - (self.posterior_threshold is not None - and self.posterior_threshold < 0) or - (self.posterior_alpha is not None and self.posterior_alpha < 0)): - raise ValueError( - "Expected the posterior_threshold and posterior_alpha of " - "typical_acceptance_sampler to be > 0. " - "Instead found posterior_threshold = " - f"{self.posterior_threshold} and posterior_alpha = " - f"{self.posterior_alpha}") if (self.disable_by_batch_size is not None and self.disable_by_batch_size < 2): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b20defde73ed..a7fcf6c354e8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1417,28 +1417,12 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: return False # V1 supports N-gram, Medusa, and Eagle speculative decoding. - is_ngram_enabled = False - is_eagle_enabled = False - is_medusa_enabled = False - if self.speculative_config is not None: - # This is supported but experimental (handled below). - speculative_method = self.speculative_config.get("method") - if speculative_method: - if speculative_method in ("ngram", "[ngram]"): - is_ngram_enabled = True - elif speculative_method == "medusa": - is_medusa_enabled = True - elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"): - is_eagle_enabled = True - else: - speculative_model = self.speculative_config.get("model") - if speculative_model in ("ngram", "[ngram]"): - is_ngram_enabled = True - if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled): - # Other speculative decoding methods are not supported yet. - _raise_or_fallback(feature_name="Speculative Decoding", - recommend_to_remove=False) - return False + if (self.speculative_config is not None + and self.speculative_config.get("method") == "draft_model"): + raise NotImplementedError( + "Speculative decoding with draft model is not supported yet. " + "Please consider using other speculative decoding methods " + "such as ngram, medusa, eagle, or deepseek_mtp.") # No XFormers so far. V1_BACKENDS = [ diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 25fa1c3058be..e2f8de1990b5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1780,13 +1780,6 @@ def _get_stats(self, num_generation_tokens_from_prefill_groups) num_tokens_iter = (num_generation_tokens_iter + num_prompt_tokens_iter) - # Spec decode, if enabled, emits specialized metrics from the worker in - # sampler output. 
- if model_output and isinstance(model_output[0], SamplerOutput) and ( - model_output[0].spec_decode_worker_metrics is not None): - spec_decode_metrics = model_output[0].spec_decode_worker_metrics - else: - spec_decode_metrics = None return Stats( now=now, @@ -1808,7 +1801,6 @@ def _get_stats(self, num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, - spec_decode_metrics=spec_decode_metrics, num_preemption_iter=num_preemption_iter, # Request stats diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 8d51f0472351..ba8dbd1fad79 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter from typing import Dict, List, Optional, Type, Union, cast @@ -19,9 +18,6 @@ else: ray_metrics = None -if TYPE_CHECKING: - from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - logger = init_logger(__name__) prometheus_client.disable_created_metrics() @@ -199,30 +195,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig): documentation="Count of successfully processed requests.", labelnames=labelnames + [Metrics.labelname_finish_reason]) - # Speculative decoding stats - self.gauge_spec_decode_draft_acceptance_rate = self._gauge_cls( - name="vllm:spec_decode_draft_acceptance_rate", - documentation="Speulative token acceptance rate.", - labelnames=labelnames, - multiprocess_mode="sum") - self.gauge_spec_decode_efficiency = self._gauge_cls( - name="vllm:spec_decode_efficiency", - documentation="Speculative decoding system efficiency.", - labelnames=labelnames, - multiprocess_mode="sum") - self.counter_spec_decode_num_accepted_tokens = (self._counter_cls( - name="vllm:spec_decode_num_accepted_tokens_total", - documentation="Number of accepted tokens.", - labelnames=labelnames)) - self.counter_spec_decode_num_draft_tokens = self._counter_cls( - name="vllm:spec_decode_num_draft_tokens_total", - documentation="Number of draft tokens.", - labelnames=labelnames) - self.counter_spec_decode_num_emitted_tokens = (self._counter_cls( - name="vllm:spec_decode_num_emitted_tokens_total", - documentation="Number of emitted tokens.", - labelnames=labelnames)) - # --8<-- [end:metrics-definitions] @@ -391,9 +363,6 @@ def log(self, stats: Stats) -> None: self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) - # Update spec decode metrics - self.maybe_update_spec_decode_metrics(stats) - # Log locally every local_interval seconds. 
if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): @@ -435,10 +404,6 @@ def log(self, stats: Stats) -> None: stats.gpu_prefix_cache_hit_rate * 100, stats.cpu_prefix_cache_hit_rate * 100, ) - if self.spec_decode_metrics is not None: - log_fn( - self._format_spec_decode_metrics_str( - self.spec_decode_metrics)) self._reset(stats, prompt_throughput, generation_throughput) @@ -447,21 +412,9 @@ def _reset(self, stats, prompt_throughput, generation_throughput) -> None: self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - self.spec_decode_metrics = None self.last_prompt_throughput = prompt_throughput self.last_generation_throughput = generation_throughput - def _format_spec_decode_metrics_str( - self, metrics: "SpecDecodeWorkerMetrics") -> str: - - return ("Speculative metrics: " - f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, " - f"System efficiency: {metrics.system_efficiency:.3f}, " - f"Number of speculative tokens: {metrics.num_spec_tokens}, " - f"Number of accepted tokens: {metrics.accepted_tokens}, " - f"Number of draft tokens: {metrics.draft_tokens}, " - f"Number of emitted tokens: {metrics.emitted_tokens}.") - def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError @@ -579,33 +532,14 @@ def log(self, stats: Stats): self.num_prompt_tokens.append(stats.num_prompt_tokens_iter) self.num_generation_tokens.append(stats.num_generation_tokens_iter) - # Update spec decode metrics - self.maybe_update_spec_decode_metrics(stats) - # Log locally every local_interval seconds. if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): - if self.spec_decode_metrics is not None: - self._log_gauge( - self.metrics.gauge_spec_decode_draft_acceptance_rate, - self.spec_decode_metrics.draft_acceptance_rate) - self._log_gauge(self.metrics.gauge_spec_decode_efficiency, - self.spec_decode_metrics.system_efficiency) - self._log_counter( - self.metrics.counter_spec_decode_num_accepted_tokens, - self.spec_decode_metrics.accepted_tokens) - self._log_counter( - self.metrics.counter_spec_decode_num_draft_tokens, - self.spec_decode_metrics.draft_tokens) - self._log_counter( - self.metrics.counter_spec_decode_num_emitted_tokens, - self.spec_decode_metrics.emitted_tokens) # Reset tracked stats for next interval. 
self.num_prompt_tokens = [] self.num_generation_tokens = [] self.last_local_log = stats.now - self.spec_decode_metrics = None def info(self, type: str, obj: SupportsMetricsInfo) -> None: # Info type metrics are syntactic sugar for a gauge permanently set to 1 diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9375dc4c495b..3281a9121a9d 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -16,10 +16,9 @@ import time from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import List, Optional +from typing import List from vllm.config import SupportsMetricsInfo, VllmConfig -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @dataclass @@ -65,8 +64,6 @@ class Stats: running_lora_adapters: List[str] max_lora: str - spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None - class StatLoggerBase(ABC): """Base class for StatLogger.""" @@ -77,7 +74,6 @@ def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: self.num_generation_tokens: List[int] = [] self.last_local_log = time.time() self.local_interval = local_interval - self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: @@ -86,9 +82,3 @@ def log(self, stats: Stats) -> None: @abstractmethod def info(self, type: str, obj: SupportsMetricsInfo) -> None: raise NotImplementedError - - def maybe_update_spec_decode_metrics(self, stats: Stats): - """Save spec decode metrics (since they are unlikely - to be emitted at same time as log interval).""" - if stats.spec_decode_metrics is not None: - self.spec_decode_metrics = stats.spec_decode_metrics diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index e0fa6a00ecfa..8b66ef0dc765 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -104,11 +104,6 @@ def process_outputs(self, seqs = sequence_group.get_seqs( status=SequenceStatus.FINISHED_ABORTED) - for output in outputs: - if output.samples[0].output_token != VLLM_INVALID_TOKEN_ID: - sequence_group.metrics.spec_token_acceptance_counts[ - output.step_index] += 1 - assert seqs, "Expected RUNNING or FINISHED_ABORTED sequences" assert len(seqs) == 1, ( "Beam search not supported in multi-step decoding.") diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py deleted file mode 100644 index db68f18726d3..000000000000 --- a/vllm/model_executor/layers/rejection_sampler.py +++ /dev/null @@ -1,406 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from functools import cached_property -from importlib.util import find_spec -from typing import Optional - -import torch -import torch.jit - -import vllm.envs as envs -from vllm.logger import init_logger -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeStochasticBaseSampler) -from vllm.platforms import current_platform - -logger = init_logger(__name__) - -if find_spec("flashinfer"): - """ - Consider utilizing the FlashInfer rejection sampling kernel initially, - as it employs a dedicated kernel rather than relying on - Torch tensor operations. This design choice helps to fuse operations, - reduce memory I/O, and consequently enhances performance. 
- """ - from flashinfer.sampling import chain_speculative_sampling -else: - chain_speculative_sampling = None - - -class RejectionSampler(SpecDecodeStochasticBaseSampler): - """Apply modified rejection sampling as described in "Accelerating Large - Language Model Decoding with Speculative Sampling" - https://arxiv.org/pdf/2302.01318.pdf. - """ - - def __init__(self, - strict_mode: bool = False, - use_flashinfer: Optional[bool] = None): - """Create a rejection sampler. - - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - use_flashinfer: We will use this parameter to determine whether - to use the FlashInfer rejection sampling kernel or not. If it's - None, we will use the default value from the environment variable. - This parameter is only used for testing purposes. - """ - super().__init__(strict_mode=strict_mode) - if use_flashinfer is None: - self.use_flashinfer = envs.VLLM_USE_FLASHINFER_SAMPLER and ( - chain_speculative_sampling is not None) - else: - self.use_flashinfer = use_flashinfer - - if self.use_flashinfer: - logger.info("Use flashinfer for rejection sampling.") - else: - logger.info("Use pytorch for rejection sampling.") - - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, - ) -> torch.Tensor: - """Sample token ids using rejection sampling. This accepts or rejects - tokens proposed by the draft model using the probability of each token - according to the draft and target models. - - In the worst case where all draft tokens are rejected, it is guaranteed - one correct token will be emitted. - - In the case where all draft tokens are accepted, a bonus token will be - accepted as its cheap to have the target model score this speculative - sequence. - - Args: - target_with_bonus_probs: The probability distribution - over token ids given context according to the target model. - shape = [batch_size, num_speculative_tokens + 1, vocab_size] - - bonus_token_ids: The "bonus" token ids that are accepted iff all - speculative tokens in a sequence are accepted. - shape = [batch_size, num_bonus_tokens] - - draft_probs: The probability distribution over token ids given - context according to the draft model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - draft_token_ids: The token ids that were sampled from the draft - probabilities. - shape = [batch_size, num_speculative_tokens] - - seeded_seqs: Dict of batch row index to torch generator, for - sequences using seeded generation. - - Returns: - output_token_ids: The token ids sampled via rejection sampling, - or -1 if unable to sample a token because the previous token - was rejected. - shape = [batch_size, num_speculative_tokens + num_bonus_tokens] - """ - # Only perform shape/dtype/device checking in strict mode, as it adds - # overhead. - if self._strict_mode: - self._raise_if_incorrect_input(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - - batch_size, k, _ = draft_probs.shape - - # batch_size = 0 when all requests in the batch are - # non_spec requests. In this case, output_token_ids is - # just an empty tensor. 
- if batch_size == 0: - return torch.empty(0, k + 1, device=draft_probs.device, dtype=int) - - # If use Flashinfer chain_speculative_sampling kernel - # for rejection sampling - if self.use_flashinfer and chain_speculative_sampling is not None: - batch_size, k, _ = draft_probs.shape - - (output_token_ids, accepted_token_num, - emitted_token_num) = chain_speculative_sampling( - draft_probs, - draft_token_ids, - target_with_bonus_probs, - ) - - # num_emitted_tokens returned by flashinfer - # does not include the bonus token - # Flashinfer stops at the first token that violates - # the condition p >= q and does not include recovery/bonus token. - # Therefore, we need to add batch_size here. - self.num_accepted_tokens += accepted_token_num.sum() - self.num_emitted_tokens += emitted_token_num.sum() + batch_size - self.num_draft_tokens += batch_size * k - else: - accepted, recovered_token_ids = ( - self._batch_modified_rejection_sampling( - target_with_bonus_probs[:, :-1], - draft_probs, - draft_token_ids, - seeded_seqs, - )) - - output_token_ids = self._create_output( - accepted, - recovered_token_ids, - draft_token_ids, - bonus_token_ids, - ) - - return output_token_ids - - def _batch_modified_rejection_sampling( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], - ) -> tuple[torch.Tensor, torch.Tensor]: - """Perform modified rejection sampling on each sequence. - - Returns: - A tuple of two tensors: - 0: A bool tensor of which tokens in each sequence is accepted. - shape = [batch_size, k] - 1: Token ids sampled from a recovered distribution, to be used - when a token is rejected. - shape = [batch_size, k] - """ - - batch_size, k, vocab_size = draft_probs.shape - - # shape [batch_size, k] - accepted = self._get_accepted(target_probs, draft_probs, - draft_token_ids, seeded_seqs) - - recovered_probs = self._get_recovered_probs( - target_probs, draft_probs).reshape(batch_size * k, vocab_size) - - # NOTE: the recovered_probs are overwritten by this method. - recovered_token_ids = _multinomial( - recovered_probs, - num_samples=1, - k=k, - seeded_seqs=seeded_seqs or {}, - ).reshape(batch_size, k) - - return accepted, recovered_token_ids - - def _create_uniform_samples(self, - seeded_seqs: Optional[dict[int, - torch.Generator]], - batch_size: int, k: int, - device: torch.device) -> torch.Tensor: - """ - Generates a batch of uniform random samples, with optional seeding - for specific sequences. - - This method creates a tensor of shape `(batch_size, k + 1)` filled - with uniform random values in the range [0, 1). If `seeded_seqs` - is provided, the sequences corresponding to specific indices - will be generated using the provided `torch.Generator` for - reproducibility. The other sequences will be generated without - a seed. - - Args: - seeded_seqs : Optional[dict[int, torch.Generator]] - A dictionary mapping indices in the batch to - `torch.Generator` objects. If `None`, all samples are - generated without a seed. - batch_size : int - The number of sequences to generate. - k : int - The number of random samples per sequence. - device : torch.device - The device on which to allocate the tensor. - - Returns: - uniform_rand : torch.Tensor - A tensor of shape `(batch_size, k + 1)` containing uniform - random values in the range [0, 1). 
- """ - if not seeded_seqs: - return torch.rand(batch_size, k + 1, device=device) - - uniform_rand = torch.empty(batch_size, k + 1, device=device) - - non_seeded_indices = [] - for idx in range(batch_size): - generator = seeded_seqs.get(idx) - if generator is None: - non_seeded_indices.append(idx) - else: - uniform_rand[idx, :] = torch.rand(1, - k + 1, - dtype=self.probs_dtype, - device=device, - generator=generator) - if non_seeded_indices: - uniform_rand[non_seeded_indices, :] = torch.rand( - len(non_seeded_indices), - k + 1, - dtype=self.probs_dtype, - device=device) - return uniform_rand - - def _get_accepted( - self, - target_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_probs: torch.Tensor, # [batch_size, k, vocab_size] - draft_token_ids: torch.Tensor, # [batch_size, k] - seeded_seqs: Optional[dict[int, torch.Generator]], - ) -> torch.Tensor: - r"""Create bool matrix over the proposed draft tokens. If - True, then a token can be accepted, else it should be - rejected. - - Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of - $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according - to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the - same conditional probability according to the draft model, the token - is accepted with probability: - - $$ - \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)} - {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right) - $$ - - This implementation does not apply causality. When using the output, - if a token is rejected, subsequent tokens should not be used. - - Returns a bool tensor of shape [batch_size, k] specifying which tokens - are accepted. - """ - batch_size, k, _ = draft_probs.shape - batch_indices = torch.arange(batch_size, - device=target_probs.device)[:, None] - probs_indices = torch.arange(k, device=target_probs.device) - - # shape [batch_size, k] - selected_draft_probs = draft_probs[batch_indices, probs_indices, - draft_token_ids] - - # shape [batch_size, k] - selected_target_probs = target_probs[batch_indices, probs_indices, - draft_token_ids] - - uniform_rand = self._create_uniform_samples(seeded_seqs, batch_size, - k - 1, target_probs.device) - - capped_ratio = torch.minimum( - selected_target_probs / selected_draft_probs, - torch.full((1, ), 1, device=target_probs.device)) - accepted = uniform_rand < capped_ratio - - return accepted - - def _get_recovered_probs( - self, - target_probs: torch.Tensor, # [k, vocab_size] - draft_probs: torch.Tensor, # [k, vocab_size] - ) -> torch.Tensor: - r"""Create a probability distribution for each proposed token which can - be sampled if the proposed token is rejected. - - When this routine is applied sequentially, the true distribution of the - target model is recovered (within hardware numerics). - - The probability distribution used in this rejection case is constructed - as follows. Given $q(x|x_1, \dots, x_n)$, the probability of - $x$ given context $x_1, \dots, x_n$ according to the target - model and $p(x|x_1, \dots, x_n)$, the same conditional probability - according to the draft model: - - $$ - x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+ - $$ - - where $(f(x))_+$ is defined as: - - $$ - (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))} - $$ - - See https://github.com/vllm-project/vllm/pull/2336 for a visualization - of the draft, target, and recovered probability distributions. - - Returns a tensor of shape [batch_size, k, vocab_size]. 
- - Note: - This batches operations on GPU and thus constructs the recovered - distribution for all tokens, even if they are accepted. This causes - division-by-zero errors, so we use self._smallest_positive_value to - avoid that. This introduces some drift to the distribution. - """ - _, k, _ = draft_probs.shape - - # shape [batch_size, k, vocab_size] - difference = target_probs - draft_probs - - # TODO(cade): Can we use logprobs instead of probs, and avoid the - # division-by-zero errors without introducing distribution drift? - - # shape [batch_size, k, vocab_size] - f = torch.clamp(difference, min=self._smallest_positive_value) - - # shape [batch_size, k, vocab_size] - recovered_probs = f / torch.sum(f, dim=-1).reshape(-1, k, 1) - - return recovered_probs - - @cached_property - def _smallest_positive_value(self) -> float: - """Return the smallest positive value representable by the probs dtype. - This value is used when constructing a distribution from which to sample - recovered tokens in the first rejection case. - - See _get_recovered_probs for more details - - Note that this isn't actually the smallest positive value representable - by float32, but the smallest positive normal value. - See https://en.wikipedia.org/wiki/Subnormal_number for more information. - """ - return torch.finfo(self.probs_dtype).tiny - - -# torch.multinomial forces a GPU<->CPU sync. -# Therefore, we use an optimized implementation instead that skips the sync. -# Note that we always sample with replacement. -# probs will be modified in place, but this is fine, as we pass -# in a copy already. -@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) -def _multinomial( - probs: torch.Tensor, - num_samples: int, - k: int, - seeded_seqs: dict[int, torch.Generator], -) -> torch.Tensor: - - if num_samples > 1: - # This is equivalent to torch.repeat_interleaved (which also - # forces a GPU<->CPU sync). - probs = probs[:, None, :].expand(probs.shape[0], num_samples, - probs.shape[1]).contiguous().view( - -1, probs.shape[1]) - q = torch.empty_like(probs) - if not seeded_seqs: - q.exponential_(1.0) - else: - start = 0 - for idx in range(len(q) // k): - end = start + k - generator = seeded_seqs.get(idx) - # Note: generator might be None for non seeded - q[start:end].exponential_(1.0, generator=generator) - start = end - - return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 08840fc40cf6..e77eb637c894 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -21,7 +21,6 @@ from vllm.sequence import (VLLM_INVALID_TOKEN_ID, CompletionSequenceGroupOutput, Logprob, PromptLogprobs, SampleLogprobs, SequenceOutput) -from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): # yapf: disable @@ -119,9 +118,6 @@ class SamplerOutput( # specified in lieu of prompt token ids or text. sampled_token_embeds: Optional[torch.Tensor] = None - # Spec decode metrics populated by workers. - spec_decode_worker_metrics: Optional[SpecDecodeWorkerMetrics] = None - # Optional last hidden states from the model. 
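The `_multinomial` helper deleted above replaces `torch.multinomial` with "divide the probabilities by Exponential(1) noise and take the argmax", which samples each row in proportion to the probabilities (the exponential-race / Gumbel-max identity) while avoiding the GPU<->CPU sync. A standalone empirical check, with toy numbers and without the seeded path:

    import torch

    torch.manual_seed(0)
    probs = torch.tensor([0.6, 0.3, 0.1])
    n = 200_000

    # One Exp(1) draw per (sample, category); argmax of probs / noise is a
    # categorical sample distributed like torch.multinomial(probs, 1).
    noise = torch.empty(n, probs.numel()).exponential_(1.0)
    samples = (probs / noise).argmax(dim=1)

    # Empirical frequencies should be close to [0.6, 0.3, 0.1].
    print(torch.bincount(samples, minlength=probs.numel()) / n)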
hidden_states: Optional[torch.Tensor] = None @@ -159,11 +155,9 @@ def __repr__(self) -> str: else self.sampled_token_probs.shape) sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else self.sampled_token_ids.shape) - return ( - f"SamplerOutput(outputs={self.outputs}, " - f"sampled_token_probs={sampled_token_probs_repr}, " - f"sampled_token_ids={sampled_token_ids_repr}, " - f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") + return (f"SamplerOutput(outputs={self.outputs}, " + f"sampled_token_probs={sampled_token_probs_repr}, " + f"sampled_token_ids={sampled_token_ids_repr})") class Sampler(nn.Module): diff --git a/vllm/model_executor/layers/spec_decode_base_sampler.py b/vllm/model_executor/layers/spec_decode_base_sampler.py deleted file mode 100644 index 0a36fe9be45b..000000000000 --- a/vllm/model_executor/layers/spec_decode_base_sampler.py +++ /dev/null @@ -1,259 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import abstractmethod -from typing import Optional, Union - -import torch -import torch.jit -import torch.nn as nn - -from vllm.platforms import current_platform - - -class SpecDecodeBaseSampler(nn.Module): - """Base class for samplers used for Speculative Decoding verification - step. - """ - - def __init__(self, strict_mode: bool = False): - """Base class constructor. - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - """ - super().__init__() - self._strict_mode = strict_mode - - # NOTE: A "bonus token" is accepted iff all proposal tokens are - # accepted. There is always only one possible bonus token. We store this - # value in a variable for readability. - self._num_bonus_tokens = 1 - - self.num_accepted_tokens: Optional[torch.Tensor] = None - self.num_emitted_tokens: Optional[torch.Tensor] = None - self.num_draft_tokens: int = 0 - - def init_gpu_tensors(self, device: Union[int, str]) -> None: - assert self.num_accepted_tokens is None - if isinstance(device, int): - device = f"{current_platform.device_type}:{device}" - elif not isinstance(device, str): - raise ValueError(f"Device must be int or str, get {type(device)}") - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - def init_tensors(self, - device: Union[int, str], - device_type: Union[torch.device, str] = 'cuda') -> None: - assert self.num_accepted_tokens is None - if isinstance(device_type, torch.device): - device_type = device_type.type - if isinstance(device, int): - device = f"{device_type}:{device}" - self.num_accepted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - self.num_emitted_tokens = torch.tensor(0, - dtype=torch.long, - device=device) - - @property - def probs_dtype(self): - return torch.float32 - - @property - def token_id_dtype(self): - return torch.int64 - - def _create_output( - self, - accepted: torch.Tensor, # [batch_size, k] - substitute_token_ids: torch.Tensor, # [batch_size, k] - draft_token_ids: torch.Tensor, # [batch_size, k] - bonus_token_ids: torch.Tensor, # [batch_size] - ) -> torch.Tensor: - """Format output. Returns a matrix of token ids. When - a token is rejected via sampling, all subsequent token ids are - set to -1 for the sequence. 
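The truncation described above — everything after the first rejected position becomes -1, and the bonus slot survives only if the last draft position does — can be traced on a single toy row. The sketch mirrors the limits/mask bookkeeping of the implementation that follows; the token ids are made up:

    import torch

    # One row, k = 4 draft positions; position 2 was rejected.
    accepted = torch.tensor([[True, True, False, True]])
    draft_token_ids = torch.tensor([[11, 12, 13, 14]])
    recovered_token_ids = torch.tensor([[21, 22, 23, 24]])
    bonus_token_ids = torch.tensor([7])
    k = accepted.shape[1]

    # Index of the first rejection per row (k if the row has no rejection).
    limits = (accepted == 0).max(1).indices
    limits[~(accepted == 0).any(1)] = k

    idx = torch.arange(k).unsqueeze(0)
    keep_mask = idx < limits.unsqueeze(1)      # positions before the first rejection
    recover_mask = idx == limits.unsqueeze(1)  # the rejected position itself

    out = torch.full((1, k + 1), -1, dtype=torch.long)
    out[:, :k] = torch.where(keep_mask, draft_token_ids,
                             -torch.ones_like(draft_token_ids))
    out[:, :k][recover_mask] = recovered_token_ids[recover_mask]
    # The bonus slot is kept only when the last draft position was not truncated.
    out[:, -1] = torch.where(out[:, k - 1] != -1, bonus_token_ids,
                             -torch.ones_like(bonus_token_ids))
    print(out)  # tensor([[11, 12, 23, -1, -1]])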
- - Args: - accepted: A boolean tensor indicating if the corresponding - draft token in draft_token_ids should be accepted or not. - substitute_token_ids: A tensor of token_ids that can be used - as substitutes for the draft token ids if the proposed token - is rejected. - draft_token_ids: A tensor of token ids speculated by the - draft model. - bonus_token_ids: Token ids to use as the bonus token if - all the draft tokens are accepted. - Returns: - A tensor containing the accepted token ids. The shape of the - tensor is [batch_size, k + num_bonus_tokens] - """ - batch_size, k = substitute_token_ids.shape - bonus_token_ids = bonus_token_ids.squeeze(-1) - # Determine the index of the first False value for each row. - limits = (accepted == 0).max(1).indices - limits[~(accepted == 0).any(1)] = k - - # Create masks using the indices. - indices = torch.arange(k, device=accepted.device).unsqueeze(0) - accepted_mask = indices < limits.unsqueeze(1) - after_false_mask = indices == limits.unsqueeze(1) - - # Create an extended output tensor - output_with_bonus_tokens = -torch.ones( - (batch_size, k + self._num_bonus_tokens), - dtype=self.token_id_dtype, - device=accepted.device) - output = output_with_bonus_tokens[:, :k] - - # Fill in the first k columns of the output tensor using masks and data - # tensors. - output[:, :k] = torch.where(accepted_mask, draft_token_ids, - -torch.ones_like(draft_token_ids)) - - # Fill the last column. - # We check output directly as accepted may have True values inconsistent - # with causal acceptance. - output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1, - bonus_token_ids, -1) - - # Fill the recovered token ids. - output.mul_(~after_false_mask).add_( - substitute_token_ids.mul(after_false_mask)) - - self.num_accepted_tokens += accepted.sum() - self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() - self.num_draft_tokens += batch_size * k - - return output_with_bonus_tokens - - def _raise_if_incorrect_input( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - self._raise_if_incorrect_shape(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_incorrect_dtype(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_inconsistent_device(target_with_bonus_probs, - draft_token_ids, bonus_token_ids, - draft_probs) - self._raise_if_out_of_bounds_vocab(target_with_bonus_probs.shape[-1], - draft_token_ids, bonus_token_ids) - - def _raise_if_incorrect_shape( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - (target_batch_size, num_target_probs, - target_vocab_size) = target_with_bonus_probs.shape - - # Does not count the extra token - num_target_probs -= 1 - - # validate the shape of draft token ids. 
- draft_token_ids_batch_size, num_draft_token_ids = draft_token_ids.shape - assert draft_token_ids_batch_size == target_batch_size - assert num_draft_token_ids == num_target_probs - - # validate the shape of bonus token ids - bonus_batch_size, num_bonus_tokens = bonus_token_ids.shape - assert bonus_batch_size == target_batch_size - assert num_bonus_tokens == self._num_bonus_tokens - - # validate the shape of draft probs if it is set - if draft_probs is not None: - (draft_batch_size, num_draft_probs, - draft_vocab_size) = draft_probs.shape - assert draft_batch_size == target_batch_size - assert num_draft_probs == num_target_probs - assert (draft_vocab_size == target_vocab_size - ), f"{draft_vocab_size=} {target_vocab_size=}" - - def _raise_if_incorrect_dtype( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - assert target_with_bonus_probs.dtype == self.probs_dtype - assert draft_token_ids.dtype == self.token_id_dtype - assert bonus_token_ids.dtype == self.token_id_dtype - if draft_probs is not None: - assert draft_probs.dtype == self.probs_dtype - - def _raise_if_inconsistent_device( - self, - target_with_bonus_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: Optional[torch.Tensor] = None, - ) -> None: - devices = [ - t.device for t in [ - target_with_bonus_probs, bonus_token_ids, draft_probs, - draft_token_ids - ] if t is not None - ] - assert all([devices[0] == device for device in devices]) - - def _raise_if_out_of_bounds_vocab( - self, - vocab_size: int, - draft_token_ids: torch.Tensor, - bonus_token_ids: torch.Tensor, - ) -> None: - assert torch.all(bonus_token_ids < vocab_size) - assert torch.all(bonus_token_ids >= 0) - assert torch.all(draft_token_ids < vocab_size) - assert torch.all(draft_token_ids >= 0) - - -class SpecDecodeDeterministicBaseSampler(SpecDecodeBaseSampler): - """Base class for samplers used for Speculative Decoding verification - step which are deterministic. 
- """ - - @abstractmethod - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> torch.Tensor: - raise NotImplementedError - - -class SpecDecodeStochasticBaseSampler(SpecDecodeBaseSampler): - """Base class for samplers used for Speculative Decoding verification - step which are stochastic - """ - - @abstractmethod - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - seeded_seqs: Optional[dict[int, torch.Generator]] = None, - ) -> torch.Tensor: - raise NotImplementedError diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py deleted file mode 100644 index 5dabaa5379e7..000000000000 --- a/vllm/model_executor/layers/typical_acceptance_sampler.py +++ /dev/null @@ -1,166 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch -import torch.jit - -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeDeterministicBaseSampler) - - -class TypicalAcceptanceSampler(SpecDecodeDeterministicBaseSampler): - """Apply typical acceptance sampling as described in section 3.3.1 in - "MEDUSA: Simple LLM Inference Acceleration Framework with - Multiple Decoding Heads" - https://arxiv.org/pdf/2401.10774 - """ - - def __init__( - self, - posterior_threshold: float, - posterior_alpha: float, - strict_mode: bool = False, - ): - """Create a Typical Acceptance Sampler. - - Args: - strict_mode: Whether or not to perform shape/device/dtype checks - during sampling. This catches correctness issues but adds - nontrivial latency. - posterior_threshold : A threshold value that sets a lower bound - on the posterior probability of a token in target model for it - to be accepted. - posterior_alpha : A scaling factor for the entropy-based - threshold in typical acceptance sampling. - """ - self._posterior_threshold = posterior_threshold - self._posterior_alpha = posterior_alpha - super().__init__(strict_mode=strict_mode) - - def forward( - self, - target_with_bonus_probs: torch.Tensor, - bonus_token_ids: torch.Tensor, - draft_probs: torch.Tensor, - draft_token_ids: torch.Tensor, - ) -> torch.Tensor: - """Sample token ids using typical acceptance sampling. This accepts - or rejects tokens proposed by the draft model using the probability - of each token according to the draft and target models. - - In the worst case where all draft tokens are rejected, it is guaranteed - one token will be emitted. - - In the case where all draft tokens are accepted, the bonus token will be - accepted. - - Args: - target_probs: The probability distribution over token ids given - context according to the target model. - shape = [batch_size, num_speculative_tokens, vocab_size] - - bonus_token_ids: The "bonus" token ids that are accepted iff all - speculative tokens in a sequence are accepted. - shape = [batch_size, num_bonus_tokens] - - draft_probs: This parameter is unused by the acceptance sampler. - - draft_token_ids: The token ids that were sampled from the draft - probabilities. - shape = [batch_size, num_speculative_tokens] - - Returns: - output_token_ids: The token ids sampled via rejection sampling, - or -1 if unable to sample a token because the previous token - was rejected. 
- shape = [batch_size, num_speculative_tokens + num_bonus_tokens] - """ - # Only perform shape/dtype/device checking in strict mode, as it adds - # overhead. - if self._strict_mode: - self._raise_if_incorrect_input(target_with_bonus_probs, - draft_token_ids, bonus_token_ids) - target_probs = target_with_bonus_probs[:, :-1] - accepted = self._evaluate_accepted_tokens(target_probs, - draft_token_ids) - recovered_token_ids = self._get_recovered_token_ids(target_probs) - output_token_ids = self._create_output(accepted, recovered_token_ids, - draft_token_ids, - bonus_token_ids) - return output_token_ids - - def _evaluate_accepted_tokens(self, target_probs, draft_token_ids): - r""" - Evaluates and returns a mask of accepted tokens based on the - posterior probabilities. - - Args: - target_probs (torch.Tensor): A tensor of shape - (batch_size, k, vocab_size) representing the probabilities of - each token in the vocabulary for each position in the proposed - sequence. This is the distribution generated by the target - model. - draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k) - representing the proposed token ids. - - A draft token_id x_{n+k} is accepted if it satisfies the - following condition - - $$ - p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > - \min \left( \epsilon, \delta * \exp \left( - -H(p_{\text{original}}( - \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right) - $$ - - where $p_{\text{original}}$ corresponds to target_probs - and $\epsilon$ and $\delta$ correspond to hyperparameters - specified using self._posterior_threshold and self._posterior_alpha - - This method computes the posterior probabilities for the given - draft token ids based on the provided target probabilities. It - calculates the entropy of the posterior distribution and determines - a dynamic threshold for each token position using the provided - posterior_threshold and posterior_alpha values. The method then - returns a boolean mask indicating which tokens can be accepted. - - Returns: - torch.Tensor: A boolean tensor of shape (batch_size, k) where each - element indicates whether the corresponding draft token has - been accepted or rejected. True indicates acceptance and false - indicates rejection. - """ - device = target_probs.device - candidates_prob = torch.gather( - target_probs, dim=-1, - index=draft_token_ids.unsqueeze(-1)).squeeze(-1) - # A small constant added to prevent computing the logarithm of zero, - # which can lead to undefined values. - epsilon = 1e-5 - posterior_entropy = -torch.sum( - target_probs * torch.log(target_probs + epsilon), dim=-1) - threshold = torch.minimum( - torch.ones_like(posterior_entropy, device=device) * - self._posterior_threshold, - torch.exp(-posterior_entropy) * self._posterior_alpha, - ) - accepted_mask = candidates_prob > threshold - return accepted_mask - - def _get_recovered_token_ids(self, target_probs): - """ - The recovered token ids will fill the first unmatched token - by the target token. - - Args: - target_probs (torch.Tensor): A tensor of shape - (batch_size, k, vocab_size) containing the target probability - distribution. - - Returns: - torch.Tensor: A tensor of shape (batch_size, k) with the recovered - token ids which are selected from target probs. 
- """ - max_indices = torch.argmax(target_probs, dim=-1) - - return max_indices diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py deleted file mode 100644 index c551ecd68ef8..000000000000 --- a/vllm/model_executor/models/eagle.py +++ /dev/null @@ -1,261 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from collections.abc import Iterable -from typing import Optional - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.logger import init_logger -from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models import ModelRegistry -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors - -from .utils import maybe_prefix - -logger = init_logger(__name__) - - -class DummyInputLayerNorm(nn.Module): - - def __init__(self, weight=None, bias=None): - super().__init__() - self.weight = nn.Parameter(weight) if weight is not None else None - self.bias = nn.Parameter(bias) if bias is not None else None - - def forward(self, x): - return x - - -class DummyOutputNorm(nn.Module): - - def forward(self, x, residual): - if residual is None: - return x - else: - return x + residual, None - - -class EAGLE(nn.Module): - """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077 - Reference implementation: https://github.com/SafeAILab/EAGLE - - Differences from reference implementation: - 1. In reference, LlamaDecoderLayer implementation doesn't have - input_layernorm for 1st decoder layer (https://github.com/SafeAILab/EAGLE/blob/7d065d084443fbfd386f88839efd7193c12be869/eagle/model/cnets.py#L427). - Following this approach, our implementation also disables - the input_layernorm for the first decoder layer. - 2. We allow any decoder layer to be used in EAGLE whereas in reference - decoder layer is fixed to be LlamaDecoderLayer. - 3. We have an optional token_map which reduces draft vocab to most - frequently used tokens to give some additional speed-up by reducing - sampling overhead. This is disabled unless the checkpoint file has - explicit token_map tensor and config has an optional attribute - truncated_vocab_size < vocab_size. To use this technique, one has to find - the top-k most frequent tokens in target dataset and add that as a tensor - in the draft checkpoint (using key token_map). Also, the draft config - needs to have truncated_vocab_size (=k) as an attribute. - 4. We allow an enhanced EAGLE architecture similar to the DeepSeek MTP - module with regards to the use of additional RMS norms. The original - EAGLE architecture 1) skips the pre-attention norm in its first - transformer block, and 2) skips the final output norm, both of which we - found to be suboptimal. We also add the support for separate norms - applying to both the token embedding and hidden states before projection - as in DeepSeek MTP, which we found to improve performance as well. 
- """ - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - self.dtype = vllm_config.model_config.dtype - self.config = config - - architectures = getattr(self.config.model, "architectures", []) - model_cls, _ = ModelRegistry.resolve_model_cls(architectures) - - self.model = model_cls(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - - self.fc = nn.Linear(config.model.hidden_size * 2, - config.model.hidden_size, - bias=getattr(self.config, "eagle_fc_bias", False)) - - # Modify layer normalization and residual connections as suggested - # in the EAGLE framework: https://github.com/SafeAILab/EAGLE - # While weights and biases are generally not needed, - # they are retained here to support certain unit tests - # (e.g., spec_decode/e2e/test_eagle_correctness.py). - if not hasattr(self.config.model, - "skip_prenorm") or self.config.model.skip_prenorm: - self.model.model.layers[0].input_layernorm = DummyInputLayerNorm( - weight=self.model.model.layers[0].input_layernorm.weight) - - if not hasattr( - self.config.model, - "skip_output_norm") or self.config.model.skip_output_norm: - self.model.model.norm = DummyOutputNorm() - - self.add_para_norm = False - if hasattr(self.config.model, - "add_para_norm") and self.config.model.add_para_norm: - self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - self.add_para_norm = True - - self.orig_vocab_size = config.vocab_size - self.truncated_vocab_size = config.truncated_vocab_size - self.unpadded_vocab_size = self.truncated_vocab_size - - self.lm_head = ParallelLMHead( - self.unpadded_vocab_size, - config.hidden_size, - org_num_embeddings=self.truncated_vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - ) - - logit_scale = getattr(config, "logit_scale", 1.0) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - self.truncated_vocab_size, - logit_scale) - - # Token map is a idx to token mapping to reduce the vocab size for - # the draft model. Using smaller vocab size for draft, containing - # only most frequent tokens reduces the speculation overhead. This - # doesn't affect the acceptance rate much and thus gives more speed - # -up. By default, this is disabled and is only used if the EAGLE - # checkpoint file has token_map tensor. 
- self.token_map = None - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.model.get_input_embeddings(input_ids) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - previous_hidden_states: torch.Tensor, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - - if inputs_embeds is None: - inputs_embeds = self.get_input_embeddings(input_ids) - - # Handle both empty previous_hidden_states - # and mismatched batch size - batch_size = inputs_embeds.size(0) - if previous_hidden_states.size(0) == 0 or \ - previous_hidden_states.size(0) != batch_size: - hidden_dim = self.config.model.hidden_size - device = inputs_embeds.device - # Create zero tensor with matching batch size - previous_hidden_states = \ - torch.zeros(batch_size, hidden_dim, device=device) - - if self.add_para_norm: - inputs_embeds = torch.cat([ - self.enorm(inputs_embeds), - self.hnorm(previous_hidden_states) - ], - dim=-1) - else: - inputs_embeds = torch.cat([inputs_embeds, previous_hidden_states], - dim=-1) - - inputs_embeds = self.fc(inputs_embeds) - - inputs_embeds[positions == 0] = 0 # masking inputs at position=0 - - hidden_states = self.model.model( - input_ids=None, - inputs_embeds=inputs_embeds, - positions=positions, - intermediate_tensors=intermediate_tensors, - ) - return hidden_states - - def compute_logits(self, hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata) -> torch.Tensor: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - - if self.token_map is not None: - _logits = logits - logits = -torch.inf * torch.ones( - size=(*_logits.shape[:-1], self.orig_vocab_size), - device=_logits.device, - dtype=_logits.dtype) - - logits[..., self.token_map] = _logits - - return logits - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - # This implementation is incompatible with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B - # due to missing lm_head weights and its config being that of a - # Llama model. 
Here's a compatible version with the same weights: - # https://huggingface.co/abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm - # Also, here's an example script for converting trained EAGLE - # checkpoint to vLLM compatible version: https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d - model_weights = {} - for name, loaded_weight in weights: - if name == "token_map": - if self.config.truncated_vocab_size < self.config.vocab_size: - self.token_map = nn.Parameter(loaded_weight, - requires_grad=False) - elif name.startswith("fc.weight"): - weight_loader = getattr(self.fc.weight, "weight_loader", - default_weight_loader) - weight_loader(self.fc.weight, loaded_weight) - elif name.startswith("fc.bias"): - if self.fc.bias is not None: - weight_loader = getattr(self.fc.bias, "weight_loader", - default_weight_loader) - weight_loader(self.fc.bias, loaded_weight) - else: - logger.warning_once("Found bias in the loaded weights but " - "the model config doesn't have bias.") - elif name.startswith("enorm.weight"): - weight_loader = getattr(self.enorm.weight, "weight_loader", - default_weight_loader) - weight_loader(self.enorm.weight, loaded_weight) - elif name.startswith("hnorm.weight"): - weight_loader = getattr(self.hnorm.weight, "weight_loader", - default_weight_loader) - weight_loader(self.hnorm.weight, loaded_weight) - elif name.startswith("model.lm_head.") or name.startswith( - "model.model."): - model_weights[name.split("model.", 1)[-1]] = loaded_weight - elif name.startswith("lm_head.") or name.startswith("model."): - model_weights[name] = loaded_weight - else: - model_weights[f"model.{name}"] = loaded_weight - - if "lm_head.weight" in model_weights: - lm_head_weight = model_weights.pop("lm_head.weight") - - if self.token_map is not None and\ - lm_head_weight.shape[0] > self.token_map.shape[0]: - - lm_head_weight = lm_head_weight[self.token_map] - - else: - # NOTE(Shangming): initialize the placeholder for lm_head weight. - lm_head_weight = torch.zeros( - self.lm_head.org_vocab_size, - self.lm_head.embedding_dim, - dtype=self.dtype, - ) - - weight_loader = getattr(self.lm_head.weight, "weight_loader", - default_weight_loader) - weight_loader(self.lm_head.weight, lm_head_weight) - - self.model.load_weights(model_weights.items()) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fd831727ab2f..d5233c28b19b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -239,14 +239,15 @@ _SPECULATIVE_DECODING_MODELS = { "MiMoMTPModel": ("mimo_mtp", "MiMoMTP"), - "EAGLEModel": ("eagle", "EAGLE"), "EagleLlamaForCausalLM": ("llama_eagle", "EagleLlamaForCausalLM"), "EagleLlama4ForCausalLM": ("llama4_eagle", "EagleLlama4ForCausalLM"), "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), "MedusaModel": ("medusa", "Medusa"), - "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), + # Temporarily disabled. + # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. 
+ # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), } _TRANSFORMERS_MODELS = { diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 240724a675a4..962e2b3aab60 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -132,14 +132,10 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = \ - "vllm.v1.worker.gpu_worker.Worker" - else: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.worker.Worker" + if not envs.VLLM_USE_V1: + raise NotImplementedError( + "Speculative decoding is not supported on vLLM V0.") + parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index e9e18d3fe8e4..0bf9262776b1 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -326,15 +326,10 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: parallel_config.worker_cls = \ "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: + if not envs.VLLM_USE_V1: raise NotImplementedError( - "Speculative decoding is not yet supported on vLLM V1." - ) - else: - parallel_config.worker_cls = \ - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = \ - "vllm.worker.worker.Worker" + "Speculative decoding is not supported on vLLM V0.") + parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" else: if envs.VLLM_USE_V1: parallel_config.worker_cls = \ diff --git a/vllm/sequence.py b/vllm/sequence.py index ffe890eb2dab..87ba74c68536 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -112,13 +112,6 @@ class RequestMetrics: model_execute_time: The time spent in the model execute function. This will include model forward, block/sync across workers, cpu-gpu sync time and sampling time. - spec_token_acceptance_counts: number of accepted speculative tokens at - each position; the first token is from - the target model and is always accepted; - e.g., when it's [10, 8, 4, 2] for a req, - it means there were 10 forward passes in - total, and there were 8, 4, 2 accepted - tokens at 1st, 2nd, 3rd speculation step. """ arrival_time: float last_token_time: float @@ -129,7 +122,6 @@ class RequestMetrics: scheduler_time: Optional[float] = None model_forward_time: Optional[float] = None model_execute_time: Optional[float] = None - spec_token_acceptance_counts: Optional[list[int]] = None class SequenceDataDelta( @@ -748,9 +740,7 @@ def __init__(self, last_token_time=arrival_time, first_scheduled_time=None, first_token_time=None, - time_in_queue=None, - spec_token_acceptance_counts=[0] * - draft_size) + time_in_queue=None) self.last_token_latency = 0.0 self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None @@ -1390,8 +1380,6 @@ class ExecuteModelRequest( previous_hidden_states: Optional[HiddenStates] = None # The number of forward steps to run. num_steps: int = 1 - # The step index for spec model input. - spec_step_idx: Optional[int] = None # Finished request ids since last step. finished_requests_ids: list[str] = msgspec.field(default_factory=list) # The last sampled token ids for multi step decoding. 
diff --git a/vllm/spec_decode/__init__.py b/vllm/spec_decode/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py deleted file mode 100644 index f9b882469a4d..000000000000 --- a/vllm/spec_decode/batch_expansion.py +++ /dev/null @@ -1,506 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from array import array -from itertools import chain, count -from typing import Iterator, List, Optional, Tuple - -import torch - -from vllm import SamplingParams -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, VLLM_TOKEN_ID_ARRAY_TYPE, - ExecuteModelRequest, SequenceData, - SequenceGroupMetadata, get_all_seq_ids) -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.util import nvtx_range, split_batch_by_proposal_len - -SeqId = int -TargetSeqId = int -TokenId = int - -DEFAULT_SIMPLE_SAMPLING_PARAMS = SamplingParams() - - -class BatchExpansionTop1Scorer(SpeculativeScorer): - """Implements a speculative scorer that uses batch expansion to get - probabilities of speculative tokens according to the scoring model. - - Batch expansion converts a list of sequences and multiple query positions - to a new batch of sequences, each with a single query position. This allows - for MQA-like scoring in speculative decoding without requiring an MQA - kernel. - - It is strictly less efficient than MQA scoring. - - It only supports scoring the top1 proposal tokens of the proposer, instead - of topk/tree. - """ - - @nvtx_range("BatchExpansionTop1Scorer.score_proposals") - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - """Score the proposed tokens via the scorer model. - - This converts each input sequence to a set of k+1 target sequences. The - target sequences have the unique continuations to be scored and a - unique sequence ID that is different from all input sequence ids. - - If a speculative sequence length would exceed the max model length, then - no speculation is produced for that sequence. - - Args: - execute_model_req: The execution request. - proposals: The speculative proposals to score. - Returns: - SpeculativeScores: The scores of each speculative token, along with - which sequences were ignored during scoring. - """ - - # TODO(cade) perform this on GPU to remove blocking call. - proposal_lens_list = proposals.proposal_lens.tolist() - proposal_token_ids_list = proposals.proposal_token_ids.tolist() - - # Filter the list to ignore invalid proposals. 
- proposal_token_ids_list_without_skips = [ - proposals for proposals in proposal_token_ids_list - if VLLM_INVALID_TOKEN_ID not in proposals - ] - - (spec_indices, non_spec_indices, target_seq_group_metadata_list, - num_scoring_tokens) = self._expand_batch( - seq_group_metadata_list=execute_model_req.seq_group_metadata_list, - proposal_token_ids_list=proposal_token_ids_list_without_skips, - proposal_lens_list=proposal_lens_list, - ) - - target_sampler_output = self._scorer_worker.execute_model( - execute_model_req=execute_model_req.clone( - seq_group_metadata_list=target_seq_group_metadata_list)) - assert len(target_sampler_output) == 1, "expected single-step output" - target_sampler_output = target_sampler_output[0] - - if not non_spec_indices: - # All sequence groups in batch have spec decoding enabled - return self._contract_batch_all_spec( - target_sampler_output=target_sampler_output, - proposals=proposals, - ) - else: - # Batch has a mix of spec decode enabled and disabled seq groups - return self._contract_batch( - execute_model_req.seq_group_metadata_list, - target_sampler_output=target_sampler_output, - proposals=proposals, - num_scoring_tokens=num_scoring_tokens, - non_spec_indices=non_spec_indices, - spec_indices=spec_indices, - k=execute_model_req.num_lookahead_slots, - ) - - def _expand_batch( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids_list: List[List[TokenId]], - proposal_lens_list: List[int], - ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]: - """Given the input sequences and potentially multiple corresponding - proposal tokens, create a new batch where each sequence has a single - query token. - """ - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - (spec_seqs, spec_indices), (non_spec_seqs, non_spec_indices) = \ - split_batch_by_proposal_len( - seq_group_metadata_list, proposal_lens_list) - - spec_expanded_seqs = self._create_scoring_model_input( - seq_group_metadata_list=spec_seqs, - proposal_token_ids=proposal_token_ids_list, - # NOTE: We determine the seq ids in the expanded batch using the - # full seq_group_metadata_list, instead of only spec_seqs. - target_seq_ids_iter=self._create_target_seq_id_iterator( - seq_ids=get_all_seq_ids(seq_group_metadata_list)), - ) - - num_scoring_tokens = len(spec_expanded_seqs) - # Batch speculative and non-speculative (e.g. chunked prefill) requests - # but make sure order is prefill|decode due to backend requirement. - target_seq_group_metadata_list = non_spec_seqs + spec_expanded_seqs - - return (spec_indices, non_spec_indices, target_seq_group_metadata_list, - num_scoring_tokens) - - def _contract_non_speculative( - self, scores: SpeculativeScores, - seq_group_metadata_list: List[SequenceGroupMetadata], - non_spec_indices: List[int], non_spec_outputs: SpeculativeScores, - has_prompt_log: bool) -> SpeculativeScores: - """ - Augment input `scores` with non-speculative requests outputs. - This includes decode requests with speculation turned off, as well - as prefill requests when `enable_chunked_prefill` is set. - For the latter, prefills are further separated into terminal and - non-terminal chunks (from which no token is sampled). 
- """ - if not non_spec_indices: - return scores - - if has_prompt_log: - # When prompt_logprobs is enabled, prefills yield output token - # (and respective prob) in the last entry (prompt|out): - # [.|.|.|prefill0_out|.|prefill1_out|decode0_out|..]. - # With chunked prefill, non-terminal chunks have -1 on each - # position: they're still picked, but they're discarded later. - seq_meta = seq_group_metadata_list - nospec_sizes = torch.tensor([ - seq_meta[i].token_chunk_size if seq_meta[i].is_prompt else 1 - for i in non_spec_indices - ]) - nospec_sampled_token_idxs = torch.cumsum(nospec_sizes, 0).add_(-1) - else: - # In this case only sampled tokens are returned, select all. - nospec_sampled_token_idxs = list( - range(len(non_spec_outputs.token_ids))) - - scores.token_ids[non_spec_indices, :1] = \ - non_spec_outputs.token_ids[nospec_sampled_token_idxs].unsqueeze(1) - scores.probs[non_spec_indices, :1, :] = \ - non_spec_outputs.probs[nospec_sampled_token_idxs].unsqueeze(1) - scores.logprobs[non_spec_indices, :1, :] = \ - non_spec_outputs.logprobs[nospec_sampled_token_idxs].unsqueeze(1) - if scores.hidden_states is not None: - assert non_spec_outputs.hidden_states is not None - scores.hidden_states[non_spec_indices, :1, :] = \ - non_spec_outputs.hidden_states[nospec_sampled_token_idxs].unsqueeze(1) - return scores - - def _contract_batch( - self, - contracted_seq_group_metadata_list: List[SequenceGroupMetadata], - target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: List[int], spec_indices: List[int], - k: int) -> SpeculativeScores: - """Contract the expanded batch back into its original size. - This maps the scores of speculative tokens back to their original - sequences. - - contracted_bs is the original batch size, and the batch size that the - target_sampler_output will be contracted to. - """ - contracted_bs = len(contracted_seq_group_metadata_list) - (target_token_ids, target_probs, target_logprobs, target_hidden_states, - non_spec_target_token_ids, non_spec_target_probs, - non_spec_target_logprobs, - non_spec_target_hidden_states) = self._split_scoring_output( - target_sampler_output, num_scoring_tokens) - - # Map distinct sequences used to score each token - # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
- expanded_batch_size, k = proposals.proposal_token_ids.shape - - # The number of tokens in the expanded batch used for speculation is - # equal to the total expanded batch size minus the number of samples for - # non-speculative sequences, prefill chunks with no out tokens included - non_spec_expanded_bs = len(non_spec_indices) - spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs - - target_token_ids = target_token_ids.reshape(spec_expanded_bs, k + 1) - target_probs = target_probs.reshape(*target_token_ids.shape, - self._vocab_size) - target_logprobs = target_logprobs.reshape(target_probs.shape) - - if target_hidden_states is not None: - target_hidden_states = target_hidden_states.reshape( - *target_token_ids.shape, target_hidden_states.shape[-1]) - - all_tokens = target_token_ids.new_full(size=(contracted_bs, k + 1), - fill_value=-1) - all_probs = target_probs.new_zeros(*all_tokens.shape, self._vocab_size) - all_logprobs = target_logprobs.new_full(size=all_probs.shape, - fill_value=-float("inf")) - - if target_sampler_output.hidden_states is not None: - all_hidden_states = target_hidden_states.new_zeros( - size=(contracted_bs, k + 1, target_hidden_states.shape[-1])) - else: - all_hidden_states = None - - has_prompt_log = any((sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - for sg in contracted_seq_group_metadata_list) - # When prompt logprobs is enabled, lens of returned tensors go from - # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. - # We adjust stride accordingly to get the generated tokens and - # their probs, but pass on prompt_logprobs as is. - prompt_logprobs = None - if (not self._scorer_worker.model_runner.disable_logprobs\ - and has_prompt_log): - prompt_logprobs = [ - o.prompt_logprobs for o in target_sampler_output.outputs - ] - elif not has_prompt_log: - # When prompt logprobs are not to be returned, - # we can ignore non-terminal chunks (no out token). - non_spec_indices = [ - idx for idx in non_spec_indices - if contracted_seq_group_metadata_list[idx].do_sample - ] - - # "Contract" speculative. - if spec_indices: - all_tokens[spec_indices] = target_token_ids - all_probs[spec_indices] = target_probs - all_logprobs[spec_indices] = target_logprobs - if all_hidden_states is not None: - all_hidden_states[spec_indices] = target_hidden_states - - spec_scores = SpeculativeScores(probs=all_probs, - token_ids=all_tokens, - logprobs=all_logprobs, - hidden_states=all_hidden_states, - prompt_logprobs=prompt_logprobs) - - non_spec_outputs = SpeculativeScores( - probs=non_spec_target_probs, - token_ids=non_spec_target_token_ids, - logprobs=non_spec_target_logprobs, - hidden_states=non_spec_target_hidden_states) - # Contract remaining nonspec entries based on non_spec_indices, if any. - return self._contract_non_speculative( - spec_scores, contracted_seq_group_metadata_list, non_spec_indices, - non_spec_outputs, has_prompt_log) - - def _contract_batch_all_spec( - self, - target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - """Contract the expanded batch back into its original size. - This maps the scores of speculative tokens back to their original - sequences. - - It assumes all sequences in the batch were previously expanded. - """ - - # Map distinct sequences used to score each token - # of shape [batch_size * k + 1] back to [batch_size, k + 1]. 
- contracted_bs, k = proposals.proposal_token_ids.shape - - # Reshape tensors to original batch size - target_token_ids = target_sampler_output.sampled_token_ids.reshape( - contracted_bs, k + 1) - target_probs = target_sampler_output.sampled_token_probs.reshape( - *target_token_ids.shape, self._vocab_size) - target_logprobs = target_sampler_output.logprobs.reshape( - target_probs.shape) - target_hidden_states = target_sampler_output.hidden_states - if target_hidden_states is not None: - target_hidden_states = target_hidden_states.reshape( - *target_token_ids.shape, target_hidden_states.shape[-1]) - - return SpeculativeScores(probs=target_probs, - token_ids=target_token_ids, - logprobs=target_logprobs, - hidden_states=target_hidden_states, - prompt_logprobs=None) - - def _create_scoring_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] - target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: - """Given the original input sequences and proposed tokens from the draft - model, create a list of target sequences that can be used for scoring. - - target_seq_ids_iter provides sequence ids for the expanded batch, - fulfilling the requirement that no seq id in the expanded batch is equal - to the seq id in the original batch. - """ - - if not seq_group_metadata_list: - return [] - - target_seq_group_metadata = list( - chain.from_iterable( - self._create_target_seq_group_metadata( - seq_group_metadata, - proposal_token_ids, - i, - target_seq_ids_iter, - ) for i, seq_group_metadata in enumerate( - seq_group_metadata_list))) - - return target_seq_group_metadata - - def _create_target_seq_group_metadata( - self, - input_seq_group_metadata: SequenceGroupMetadata, - proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k] - batch_index: int, - target_seq_ids_iter: Iterator[TargetSeqId], - ) -> List[SequenceGroupMetadata]: - """Given an input sequence group metadata and a list of draft tokens, - create a list of target SequenceGroupMetadata, one for each - token id that needs to be scored. - - Naive speculative decoding requires K target model scores, one for each - draft model token. However one can add a bonus token such that if each - token is accepted, then a final token may be sampled from the model. - This function creates K+1 target SequenceGroupMetadata to take - advantage of the bonus token. - """ - assert len(input_seq_group_metadata.seq_data) == 1, ( - "Beam search " - "not supported in speculative decoding") - input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys())) - - token_ids_to_score = self._get_token_ids_to_score( - proposal_token_ids[batch_index]) - - sampling_params = input_seq_group_metadata.sampling_params - target_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - for i, token_ids in enumerate(token_ids_to_score): - target_seq_group_metadata_list.append( - self._create_single_target_seq_group_metadata( - input_seq_group_metadata, - input_seq_id, - next(target_seq_ids_iter), - token_ids, - sampling_params=sampling_params, - )) - - return target_seq_group_metadata_list - - @staticmethod - def _create_single_target_seq_group_metadata( - seq_group_metadata: SequenceGroupMetadata, - seq_id: SeqId, - target_seq_id: TargetSeqId, - token_ids: List[TokenId], - sampling_params: SamplingParams, - ) -> SequenceGroupMetadata: - """Create a single target SequenceGroupMetadata. - - Args: - seq_group_metadata: The metadata for the input sequence. 
- seq_id: The input sequence ID. - target_seq_id: The corresponding target sequence ID. - token_ids: The list of token ids that are to be appended to the - input sequence. - """ - seq_data = seq_group_metadata.seq_data[seq_id] - prompt_token_ids = seq_data.prompt_token_ids_array - new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] - mrope_position_delta = seq_data.mrope_position_delta - - new_seq_data_dict = { - target_seq_id: - SequenceData( - prompt_token_ids, - _output_token_ids=array(VLLM_TOKEN_ID_ARRAY_TYPE, - new_output_token_ids), - ), - } - # This is a hack. Technically, spec decoding should compute - # num_lookahead slots at one shot, but instead, it expands the batch - # and evaluate one by one right now. context_len is seq_len - 1 because - # the kv cache is filled by a previous batch in the batch expansion. - for data in new_seq_data_dict.values(): - data.update_num_computed_tokens(data.get_len() - 1) - data.mrope_position_delta = mrope_position_delta - - return SequenceGroupMetadata( - request_id=seq_group_metadata.request_id, - is_prompt=seq_group_metadata.is_prompt, - seq_data=new_seq_data_dict, - sampling_params=sampling_params, - block_tables={ - target_seq_id: seq_group_metadata.block_tables[seq_id], - }, - lora_request=None, - token_chunk_size=1, - ) - - @staticmethod - def _split_scoring_output( - sampler_output: SamplerOutput, num_scoring_tokens: int - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], torch.Tensor, torch.Tensor, - torch.Tensor, Optional[torch.Tensor]]: - """Split the target model output into speculative and non-speculative - output. - """ - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - # - # First samples are non-speculative, latter samples are from speculative - # scoring (prefill|decode order). - split_sizes = (sampler_output.sampled_token_ids.numel() - - num_scoring_tokens, num_scoring_tokens) - (non_spec_probs, - spec_probs) = sampler_output.sampled_token_probs.split(split_sizes) - (non_spec_sampled_tokens, spec_sampled_tokens - ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) - (non_spec_logprobs, - spec_logprobs) = sampler_output.logprobs.split(split_sizes) - - if sampler_output.hidden_states is not None: - (non_spec_hidden_states, spec_hidden_states - ) = sampler_output.hidden_states.split(split_sizes) - else: - non_spec_hidden_states, spec_hidden_states = None, None - - return (spec_sampled_tokens, spec_probs, spec_logprobs, - spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, - non_spec_logprobs, non_spec_hidden_states) - - @staticmethod - def _create_target_seq_id_iterator( - seq_ids: List[SeqId]) -> Iterator[TargetSeqId]: - """Create an iterator for creating target sequence ids. - Target sequence ids are distinct from sequence ids because we create a - distinct target sequence id for each proposal token to be scored. - - This implementation increments a counter starting at 1 + max of all - provided input sequence ids. - """ - return count(start=max(seq_ids) + 1) - - @staticmethod - def _get_token_ids_to_score( - full_spec_token_ids: List[TokenId] # shape: [k] - ) -> List[List[TokenId]]: - """Given an int tensor of proposal token ids, return a list of - token ids that should be scored. - - Returns k+1 output lists. 
The additional one is used for generating the - bonus token. - - Example: - Input: [0, 1, 2, 3] (k=4) - Output: (k+1 lists) - [] - [0] - [0, 1] - [0, 1, 2] - [0, 1, 2, 3] - """ - empty_token_ids: List[TokenId] = [] - - token_ids_to_score = [empty_token_ids] - token_ids_to_score.extend(full_spec_token_ids[:i + 1] - for i in range(len(full_spec_token_ids))) - return token_ids_to_score diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py deleted file mode 100644 index 96646ec94718..000000000000 --- a/vllm/spec_decode/draft_model_runner.py +++ /dev/null @@ -1,349 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional - -import torch - -from vllm.forward_context import set_forward_context -from vllm.model_executor.layers.sampler import SamplerOutput - -try: - try: - from vllm.attention.backends.flash_attn import FlashAttentionMetadata - except (ModuleNotFoundError, ImportError): - # vllm_flash_attn is not installed, try the ROCm FA metadata - from vllm.attention.backends.rocm_flash_attn import ( - ROCmFlashAttentionMetadata as FlashAttentionMetadata) -except (ModuleNotFoundError, ImportError) as err: - raise RuntimeError( - "Draft model speculative decoding currently only supports " - "CUDA and ROCm flash attention backend.") from err - -from vllm.logger import init_logger -from vllm.multimodal import MultiModalKwargs -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - -logger = init_logger(__name__) - -# A flag to enable debug prints for the updated input tensors -# before each step. -debug_advance_input = False -# A flag to allow GPU advance step for draft model runner. -# Set to False for debugging. -allow_gpu_advance_step = True - - -class TP1DraftModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding draft model. - Since the draft model always execute k forward passes consecutively to - generate k speculative tokens in a single speculative decoding step, - we could get rid of most CPU-GPU synchronization and data transfer - overheads by keeping model input and output tensors on GPU all the time. - - TODOs: - 1. Currently supports only flash-attn, add support for other attn_backends. - 2. Support TP > 1 (this requires some designs because we do not expect - any broadcasting inside execute_model). 
- """ - - def __init__(self, model_runner: ModelRunnerBase): - super().__init__(model_runner) - - self.indices_of_seq_with_bonus_tokens = None - - def _update_sampling_metadata(self, sampling_metadata, num_seqs, - num_queries): - - assert sampling_metadata.num_prompts == 0 - assert len(sampling_metadata.seq_groups) == num_queries - assert sampling_metadata.selected_token_indices.shape == ( - num_queries, ) - # assert sampling_metadata.categorized_sample_indices == TODO: Add if needed # noqa: E501 - - # Verify that all sequences are decodes - for i in range(num_queries): - seq_group = sampling_metadata.seq_groups[i] - - assert seq_group.is_prompt is False # No prompt - assert seq_group.prompt_logprob_indices == [] # No prompt - assert seq_group.sample_indices == [i] # Simple - - def _gpu_advance_step(self, model_input: ModelRunnerInputBase, - last_output: SamplerOutput) -> ModelRunnerInputBase: - # Currently, we expect "decode mode" only - assert not model_input.is_prompt - - # Get num_seqs - num_seqs = len(model_input.seq_lens) - num_queries = len(model_input.query_lens) - - # Get output tokens GPU tensor - sampled_token_ids = last_output.sampled_token_ids - assert sampled_token_ids is not None - - # Update attn_metadata - attn_metadata = model_input.attn_metadata - assert isinstance(attn_metadata, FlashAttentionMetadata) - - attn_metadata.advance_step(model_input, sampled_token_ids, - self.block_size, num_seqs, num_queries) - - # Update sampling_metadata - sampling_metadata = model_input.sampling_metadata - self._update_sampling_metadata(sampling_metadata, num_seqs, - num_queries) - - # Create new input - new_model_input = self._model_input_cls( - input_tokens=model_input.input_tokens, - input_positions=model_input.input_positions, - attn_metadata=attn_metadata, - seq_lens=attn_metadata.seq_lens, - query_lens=model_input.query_lens, - lora_mapping=model_input.lora_mapping, - lora_requests=model_input.lora_requests, - multi_modal_kwargs=model_input.multi_modal_kwargs, - sampling_metadata=model_input.sampling_metadata, - is_prompt=False, - ) - - # Ensure we skip CPU samples - assert new_model_input.sampling_metadata.skip_sampler_cpu_output is True - # We can reuse sampling tensors since every decode iteration is the same - new_model_input.sampling_metadata.reuse_sampling_tensors = True - - if debug_advance_input: - logger.debug("NEW INPUT: ") - logger.debug(" input_tokens = %s", new_model_input.input_tokens) - logger.debug(" input_positions = %s", - new_model_input.input_positions) - logger.debug(" seq_lens = %d", new_model_input.seq_lens) - logger.debug(" query_lens = %d", new_model_input.query_lens) - logger.debug(" attn_metadata:") - logger.debug(" seq_lens_tensor: %s", - attn_metadata.seq_lens_tensor) - logger.debug(" slot_mapping: %s", attn_metadata.slot_mapping) - logger.debug(" block_tables: %s", attn_metadata.block_tables) - - return new_model_input - - def supports_gpu_multi_step(self, execute_model_req: ExecuteModelRequest): - """Determines if draft_model_runner GPU multi-step can be used. - Currently required conditions are: - 1. Only decodes - 2. Only flash-attn - 3. No LORA - 4. 
No prompt_adapter_config - """ - if not allow_gpu_advance_step: - return False - - # We allow multi-step GPU only in decode mode - for seq_group in execute_model_req.seq_group_metadata_list: - if seq_group.is_prompt: - return False - - # TODO: Add support for other attn backends - if self.attn_backend.get_name() not in ("FLASH_ATTN", ): - return False - - # TODO: Add support for LORA - if self.lora_config: - return False - - # TODO: Add soft-tuning prompt adapter support - return not self.prompt_adapter_config - - def set_indices_of_seq_with_bonus_tokens(self, - indices_of_seq_with_bonus_tokens): - self.indices_of_seq_with_bonus_tokens = indices_of_seq_with_bonus_tokens - - @torch.inference_mode() - def execute_model( - self, - model_input: ModelRunnerInputBase, - kv_caches: List[torch.Tensor], - previous_hidden_states: Optional[torch.Tensor] = None, - intermediate_tensors: Optional[IntermediateTensors] = None, - num_steps: int = 1, - **kwargs, - ) -> Optional[List[SamplerOutput]]: - """Executes num_steps forward passes with advacement of input tensors - on the GPU. Look at supports_gpu_multi_step(..) for pre-conditions. - - Optimizations used: - 1. Input tensors are updated on the GPU directly - 2. Skips GPU=>CPU serialization of sampler outputs (we don't need - them since we do batch expansion later that uses GPU outputs) - 3. Reuses sampling tensors (since we run only decodes and they have - a repeating sampling logic) - """ - - # When num_steps == 1, we execute the fallback here for the GPU - # advance_step, which runs prepare_inputs on CPU and for each spec - # iteration invokes this function only once - # (Look at multi-step-worker code) - is_fallback = num_steps == 1 - if not is_fallback: - # Since we do not broadcast data inside execute_model anymore, - # we need to figure out the best way to support TP > 1 in this - # case, because we will at least need to broadcast the sampled - # tokens to all workers. - if not self.is_driver_worker: - raise ValueError("TP1DraftModelRunner only supports TP=1.") - - # Sanity - if self.lora_config is not None: - raise ValueError("TP1DraftModelRunner has no support for LORA") - if self.prompt_adapter_config is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "prompt_adapter_config") - if model_input.inputs_embeds is not None: - raise ValueError("TP1DraftModelRunner has no support for " - "inputs_embeds") - if model_input.multi_modal_kwargs: - raise ValueError( - "TP1DraftModelRunner has no support for multi_modal_kwargs" - ) - else: - if self.lora_config: - assert model_input.lora_requests is not None - assert model_input.lora_mapping is not None - self.set_active_loras(model_input.lora_requests, - model_input.lora_mapping) - - if self.prompt_adapter_config: - assert model_input.prompt_adapter_requests is not None - assert model_input.prompt_adapter_mapping is not None - self.set_active_prompt_adapters( - model_input.prompt_adapter_requests, - model_input.prompt_adapter_mapping) - - self.attn_state.begin_forward(model_input) - - # Detect exec mode - assert model_input.attn_metadata is not None - use_cuda_graph = False - if model_input.attn_metadata.num_prefills > 0: - # In this case, execute_model(..) was called directly - if num_steps > 1: - raise ValueError( - "execute_model(..) of draft_model_runner can be called " - "directly only with a single-step prefill") - else: - # We can skip CPU samples for spec token generation. 
- # (We do allow CPU samples for num_steps == 1 to support the - # fallback case, where supports_gpu_multi_step(..) does not pass) - model_input.sampling_metadata.skip_sampler_cpu_output = ( - not is_fallback) - - # Attn attr defines if we use cuda graphs - use_cuda_graph = model_input.attn_metadata.use_cuda_graph - - # Get model - if use_cuda_graph: - if model_input.inputs_embeds is None: - graph_batch_size = model_input.input_tokens.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, False)]) - else: - graph_batch_size = model_input.inputs_embeds.shape[0] - model_executable = ( - self.graph_runners[model_input.virtual_engine][( - graph_batch_size, True)]) - - if previous_hidden_states is not None: - hidden_states = torch.cat([ - previous_hidden_states, - torch.empty([ - graph_batch_size - previous_hidden_states.shape[0], - *previous_hidden_states.shape[1:] - ], - dtype=previous_hidden_states.dtype, - device=previous_hidden_states.device) - ]) - else: - hidden_states = None - else: - model_executable = self.model - hidden_states = previous_hidden_states - - outputs: List[SamplerOutput] = [] - for step in range(num_steps): - multi_modal_kwargs = model_input.multi_modal_kwargs or {} - - model_execute_kwargs = {"previous_hidden_states": hidden_states} \ - if previous_hidden_states is not None else {} - - compute_logits_kwargs = {} - # Run model - if hasattr(self.model.config, "num_nextn_predict_layers"): - # for DeepSeek MTP only to use the corresponding layer for - # each step - spec_step_idx = kwargs.get("spec_step_idx", step) - model_execute_kwargs["spec_step_idx"] = spec_step_idx - compute_logits_kwargs["spec_step_idx"] = spec_step_idx - with set_forward_context(model_input.attn_metadata, - self.vllm_config): - hidden_states = model_executable( - input_ids=model_input.input_tokens, - inputs_embeds=None, - positions=model_input.input_positions, - intermediate_tensors=intermediate_tensors, - **MultiModalKwargs.as_kwargs( - multi_modal_kwargs, - device=self.device, - ), - **model_execute_kwargs, - ) - - # Compute the logits. - logits = self.model.compute_logits(hidden_states, - model_input.sampling_metadata, - **compute_logits_kwargs) - if not self.is_driver_worker: - return [] - # Sample the next token. - output = self.model_runner.sampler( - logits=logits, - sampling_metadata=model_input.sampling_metadata, - ) - outputs.append(output) - - if self.return_hidden_states and is_fallback: - if use_cuda_graph: - indices = model_input.sampling_metadata\ - .selected_token_indices - output.hidden_states = hidden_states[:len(indices)] - else: - output.hidden_states = hidden_states - - if model_input.attn_metadata.num_prefills == 0 \ - and self.indices_of_seq_with_bonus_tokens is not None: - assert output.sampled_token_ids is not None - # output.sampled_token_ids should be of shape (num_seqs, 1) - nums_seqs, num_tokens_per_seq = output.sampled_token_ids.shape - assert num_tokens_per_seq == 1 - count = 0 - for i in range(nums_seqs): - bonus_seq_idx = self.indices_of_seq_with_bonus_tokens[ - count] - if i != bonus_seq_idx: - # The following might cause a cpu->gpu sync - # However, the performance impact is negligible as we - # benchmarked on H100. 
- output.sampled_token_ids[ - i, :] = model_input.input_tokens[bonus_seq_idx] - else: - count += 1 - - # Prepare inputs for the next step - if step != num_steps - 1: - model_input = self._gpu_advance_step(model_input, outputs[-1]) - - return outputs diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py deleted file mode 100644 index 70ec1590e7ad..000000000000 --- a/vllm/spec_decode/interfaces.py +++ /dev/null @@ -1,99 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from dataclasses import dataclass -from typing import List, Optional, Set, Union - -import torch - -from vllm.sequence import ExecuteModelRequest, PromptLogprobs -from vllm.worker.worker_base import WorkerBase - - -@dataclass -class SpeculativeProposals: - """Datastructure used to represent proposal tokens from some proposer. It - also tracks how many speculative tokens each sequence has. - """ - - # Speculative proposal tokens. - proposal_token_ids: torch.Tensor - - # Probabilities of the proposal tokens according to the proposer. - proposal_probs: torch.Tensor - - # The valid length of each proposal; can be zero. - proposal_lens: torch.Tensor - - # A flag to mark that there's no available proposals - no_proposals: bool = False - - def __repr__(self): - return (f"SpeculativeProposals(" - f"proposal_token_ids={self.proposal_token_ids}, " - f"proposal_probs={self.proposal_probs.shape}, " - f"proposal_lens={self.proposal_lens})") - - -@dataclass -class SpeculativeScores: - """Datastructure used to represent the scores of speculative tokens - according to the scoring model. - """ - - # Probabilities of the speculative tokens according to the scoring model. - probs: torch.Tensor - - # Log-probabilities of the speculative tokens according to the scoring - # model. These values can be used to generate Logprob objects that are - # returned to the user. - logprobs: torch.Tensor - - # Token ids sampled from the scoring model. Used for speculative bonus - # tokens and also non-speculative normal decoding. - token_ids: torch.Tensor - - # Optional last hidden states from the scoring model. - hidden_states: Optional[torch.Tensor] = None - - # Scoring model may also return logprobs for prompt tokens - # for each request, when chunked prefill is enabled. - prompt_logprobs: Optional[List[PromptLogprobs]] = None - - def __repr__(self): - return (f"SpeculativeScores(" - f"probs={self.probs.shape}, " - f"token_ids={self.token_ids.shape})") - - -class SpeculativeProposer(ABC): - - @abstractmethod - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - # If set, this contains all sequence IDs that were assigned - # bonus tokens in their last forward pass. 
- seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - raise NotImplementedError - - -class SpeculativeScorer(ABC): - - def __init__(self, scorer_worker: WorkerBase, - device: Union[torch.device, str], vocab_size: int): - self._scorer_worker = scorer_worker - if isinstance(device, torch.device): - device = device.type - self._device = device - self._vocab_size = vocab_size - - @abstractmethod - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - raise NotImplementedError diff --git a/vllm/spec_decode/medusa_worker.py b/vllm/spec_decode/medusa_worker.py deleted file mode 100644 index 82b5a79fa7cb..000000000000 --- a/vllm/spec_decode/medusa_worker.py +++ /dev/null @@ -1,138 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import DelegateWorkerBase - - -class MedusaWorker(NonLLMProposerWorkerBase, DelegateWorkerBase): - """Worker for Medusa. - """ - - def __init__(self, *args, **kwargs): - DelegateWorkerBase.__init__(self, *args, **kwargs) - # Lazy initialization list. - self._proposer: Top1Proposer - - def init_device(self): - self.worker.init_device() - - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - self.device, - self.vocab_size, - max_proposal_len=self.max_model_len, - ) - - def set_include_gpu_probs_tensor(self): - pass - - def set_should_modify_greedy_probs_inplace(self): - pass - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass to generate sample_len future tokens. - Returns the list of sampler output, one per layer, along with indicator - of whether torch tensor in sampler output need to be transposed in - latter sampler_output_to_torch logic. - - For medusa worker, this indicator shall be False. - """ - self._raise_if_unsupported(execute_model_req) - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - seq_lens, query_lens = self._prepare_input_tensors( - seq_group_metadata_list) - - generators = self.model_runner.get_generators( - execute_model_req.finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.model_runner.pin_memory, generators) - - model_outputs = self.model_runner.model.generate_proposals( - previous_hidden_states=execute_model_req.previous_hidden_states. 
- hidden_states, - sampling_metadata=sampling_metadata) - - return model_outputs, False - - def _prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[List[int], List[int]]: - if not seq_group_metadata_list: - return [], [] - - seq_lens: List[int] = [] - query_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - is_prompt = seq_group_metadata.is_prompt - - for seq_data in seq_group_metadata.seq_data.values(): - seq_data_len = seq_data.get_len() - if is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min( - seq_data_len, - context_len + seq_group_metadata.token_chunk_size) - seq_lens.append(seq_len) - query_lens.append(seq_len - context_len) - else: - seq_lens.append(seq_data_len) - query_lens.append(1) - - return seq_lens, query_lens - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """MedusaWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "MedusaWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "MedusaWorker does not support beam search.") diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py deleted file mode 100644 index a4784cad962d..000000000000 --- a/vllm/spec_decode/metrics.py +++ /dev/null @@ -1,213 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from typing import Callable, Optional, Union - -import msgspec -import torch - -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeBaseSampler) -from vllm.platforms import current_platform -from vllm.utils import is_pin_memory_available - - -class SpecDecodeWorkerMetrics( - msgspec.Struct, - omit_defaults=True, # type: ignore[call-arg] - array_like=True): # type: ignore[call-arg] - """Dataclass holding metrics emitted from the spec decode worker. - """ - - # The empirical acceptance rate of the proposal method on a per-token basis. - # This is useful for evaluating how well the proposal method aligns with the - # scoring method. - draft_acceptance_rate: float - - # The empirical efficiency, measured as the number of tokens emitted by the - # system divided by the number of tokens that could be emitted by the system - # if the proposal method were perfect. - system_efficiency: float - - # The number of speculative tokens produced by the proposal method. - draft_tokens: int - - # The number of tokens emitted by the entire system. - emitted_tokens: int - - # The number of tokens accepted by the scoring model and verification - # routine, e.g. Llama2-70B and lossless rejection sampling. 
- # - # NOTE: Any token accepted by the verification routine is considered - # accepted (regardless of if the speculative prefix is also accepted). The - # user will usually see less accepted tokens. This metric is helpful when - # evaluating alignment of the proposal method with the scoring model. - accepted_tokens: int - - # The number of speculative tokens per sequence. - num_spec_tokens: int - - -Timer = Callable[[], float] - - -class AsyncMetricsCollector: - """Class which copies rejection/typical-acceptance sampler metrics - from the device to CPU on a non-default Torch stream. - """ - - def __init__(self, - spec_decode_sampler: SpecDecodeBaseSampler, - timer: Optional[Timer] = None, - collect_interval_s: float = 5.0): - self.spec_decode_sampler = spec_decode_sampler - self._timer = time.time if timer is None else timer - - self._rank: Optional[int] = None - - # We don't have a device set yet. - self._copy_stream: Optional[torch.cuda.Stream] = None - - self._in_flight_copy: Optional[torch.cuda.Event] = None - - pin_memory = is_pin_memory_available() - self._aggregate_num_accepted_tokens = torch.tensor( - 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) - self._aggregate_num_emitted_tokens = torch.tensor( - 0, dtype=torch.long, device="cpu", pin_memory=pin_memory) - self._aggregate_num_draft_tokens = 0 - - self._rejsample_metrics_collect_interval_s = collect_interval_s - self._last_metrics_collect_time = self._timer() - - def init_gpu_tensors(self, rank: int) -> None: - self._rank = rank - self._copy_stream = torch.cuda.Stream() - - def init_tensors(self, - rank: int, - device_type: Union[torch.device, str] = 'cuda') -> None: - self._rank = rank - if isinstance(device_type, torch.device): - device_type = device_type.type - stream = current_platform.Stream - if stream is not None: - self._copy_stream = stream() - - def maybe_collect_rejsample_metrics( - self, k: int) -> Optional[SpecDecodeWorkerMetrics]: - # Skip for any platform that doesn't have device Event - if current_platform.Event is None: - return None - - # If a copy was initiated in the previous call, collect and return. - if self._in_flight_copy is not None: - ready_event = self._in_flight_copy - self._in_flight_copy = None - return self._collect_rejsample_metrics(k, ready_event) - - # Otherwise, check if we should start a new copy. - if self._should_collect_rejsample_metrics(self._timer()): - assert self._in_flight_copy is None - self._in_flight_copy = self._copy_rejsample_metrics_async() - - return None - - def _should_collect_rejsample_metrics(self, now: float) -> bool: - """Return whether or not this iteration should print sampling - metrics. - """ - if self._rank != 0: - return False - - return now - self._last_metrics_collect_time >= self._rejsample_metrics_collect_interval_s # noqa: E501 - - def _copy_rejsample_metrics_async(self) -> torch.cuda.Event: - """Copy rejection/typical-acceptance sampling metrics - (number of accepted tokens, etc) to CPU asynchronously. - - Returns a device event recording when the copy is complete. - """ - assert self._copy_stream is not None - self._copy_stream.wait_stream(current_platform.current_stream()) - - with current_platform.stream(self._copy_stream): - self._aggregate_num_accepted_tokens.copy_( - self.spec_decode_sampler.num_accepted_tokens, - non_blocking=True) - self._aggregate_num_emitted_tokens.copy_( - self.spec_decode_sampler.num_emitted_tokens, non_blocking=True) - # Number of draft tokens is calculated on CPU, so no copy is - # required. 
- self._aggregate_num_draft_tokens = ( - self.spec_decode_sampler.num_draft_tokens) - - aggregate_metrics_ready = current_platform.Event() - aggregate_metrics_ready.record(self._copy_stream) - - return aggregate_metrics_ready - - def _collect_rejsample_metrics( - self, k: int, - ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics: - """Create metrics object from statistics copied asynchronously. - - Args: - k: int. The number of speculative tokens; used to determine system - efficiency. - ready_event: torch.cuda.Event. The CUDA event recording when the - async GPU->CPU copy is complete. - """ - - ready_event.synchronize() - - # update time of last collection - self._last_metrics_collect_time = self._timer() - - accepted_tokens = self._aggregate_num_accepted_tokens.item() - emitted_tokens = self._aggregate_num_emitted_tokens.item() - draft_tokens = self._aggregate_num_draft_tokens - - max_num_emitted_tokens = self.get_max_num_emitted_tokens( - draft_tokens, k) - - if draft_tokens > 0: - draft_acceptance_rate = accepted_tokens / draft_tokens - else: - draft_acceptance_rate = float("nan") - - if max_num_emitted_tokens > 0: - system_efficiency = emitted_tokens / max_num_emitted_tokens - else: - system_efficiency = float("nan") - - return SpecDecodeWorkerMetrics( - num_spec_tokens=k, - draft_acceptance_rate=draft_acceptance_rate, - system_efficiency=system_efficiency, - accepted_tokens=accepted_tokens, - draft_tokens=draft_tokens, - emitted_tokens=emitted_tokens, - ) - - @staticmethod - def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int: - """Calculate the number of emitted tokens, assuming all tokens are - accepted. - - This is equal to the number of sequences that have been speculated on, - times (speculation len + 1). The +1 comes from the bonus token. - """ - # Determine the number of sequences that have been speculated on. Since - # the batch size can be variable, we divide by k. - assert draft_tokens % k == 0 - total_num_spec_seqs = draft_tokens // k - - # A single sequence may emit k accepted tokens and one bonus token in - # the best case. - num_emitted_per_seq_if_all_accepted = k + 1 - - # The max num of emitted tokens is the number of speculated sequences - # times the max emitted per seq. - return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted diff --git a/vllm/spec_decode/mlp_speculator_worker.py b/vllm/spec_decode/mlp_speculator_worker.py deleted file mode 100644 index 8e8c05d26361..000000000000 --- a/vllm/spec_decode/mlp_speculator_worker.py +++ /dev/null @@ -1,94 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor import SamplingMetadata -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase - - -class MLPSpeculatorWorker(NonLLMProposerWorkerBase, MultiStepWorker): - """Worker for MLPSpeculator models. - - Not currently compatible with LoRA or chunked prefill. - """ - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. MLPSpeculatorWorker does not use the KV Cache and - # therefore does not need this parameter. 
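# A worked numeric sketch (illustrative numbers only) of the metrics computed by
# AsyncMetricsCollector above: the acceptance rate divides accepted tokens by draft
# tokens, and system efficiency divides emitted tokens by the best case returned by
# get_max_num_emitted_tokens, i.e. (draft_tokens // k) * (k + 1).
k = 4                      # speculative tokens per sequence
draft_tokens = 40          # 10 sequences were speculated on
accepted_tokens = 25
emitted_tokens = 32

max_emitted = (draft_tokens // k) * (k + 1)              # 10 * 5 = 50; +1 is the bonus token
draft_acceptance_rate = accepted_tokens / draft_tokens   # 0.625
system_efficiency = emitted_tokens / max_emitted         # 0.64
print(draft_acceptance_rate, system_efficiency)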
- seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass to generate sample_len future tokens. - Returns the list of sampler output, one per layer, along with indicator - of whether torch tensor in sampler output need to be transposed in - latter sampler_output_to_torch logic. - - For mlp spec worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - (input_tokens, seq_lens, - query_lens) = self._prepare_input_tensors(seq_group_metadata_list) - - generators = self.model_runner.get_generators( - execute_model_req.finished_requests_ids) - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, seq_lens, query_lens, self.device, - self.model_runner.pin_memory, generators) - - model_outputs = self.model_runner.model.generate_proposals( - input_ids=input_tokens, - previous_hidden_states=execute_model_req.previous_hidden_states. - hidden_states, - num_predict_tokens=sample_len, - sampling_metadata=sampling_metadata) - - assert len(model_outputs) == sample_len - - return model_outputs, True - - def _prepare_input_tensors( - self, - seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], - ) -> Tuple[torch.Tensor, List[int], List[int]]: - if not seq_group_metadata_list: - return torch.empty(0, device=self.device), [], [] - - input_tokens: List[int] = [] - seq_lens: List[int] = [] - query_lens: List[int] = [] - - for seq_group_metadata in seq_group_metadata_list: - is_prompt = seq_group_metadata.is_prompt - - for seq_data in seq_group_metadata.seq_data.values(): - seq_data_len = seq_data.get_len() - if is_prompt: - context_len = seq_data.get_num_computed_tokens() - seq_len = min( - seq_data_len, - context_len + seq_group_metadata.token_chunk_size) - tokens = seq_data.get_token_ids()[context_len:seq_len] - seq_lens.append(seq_len) - input_tokens.extend(tokens) - query_lens.append(seq_len - context_len) - else: - seq_lens.append(seq_data_len) - input_tokens.append(seq_data.get_last_token_id()) - query_lens.append(1) - - input_tokens_tensor = torch.tensor(input_tokens, - dtype=torch.long, - device=self.device) - return input_tokens_tensor, seq_lens, query_lens diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py deleted file mode 100644 index 18e7b055a678..000000000000 --- a/vllm/spec_decode/mqa_scorer.py +++ /dev/null @@ -1,160 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from vllm.sequence import (ExecuteModelRequest, SequenceData, - SequenceGroupMetadata, get_all_seq_ids) -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) - -SeqId = int -TargetSeqId = int - - -class MQAScorer(SpeculativeScorer): - - def score_proposals( - self, - execute_model_req: ExecuteModelRequest, - proposals: SpeculativeProposals, - ) -> SpeculativeScores: - target_seq_group_metadata_list = [] - target_seq_id_start = max( - get_all_seq_ids(execute_model_req.seq_group_metadata_list)) + 1 - all_proposal_tokens = proposals.proposal_token_ids.tolist() - all_proposal_lengths = proposals.proposal_lens.tolist() - for i, seq_group_metadata in enumerate( - execute_model_req.seq_group_metadata_list): - if all_proposal_lengths[i] == 0: - # Keep prompt seqs untouched (keep computed_tokens for chunks). 
- target_seq_group_metadata_list.append(seq_group_metadata) - continue - - seq_data_dict = seq_group_metadata.seq_data - assert len(seq_data_dict) == 1 - seq_id = next(iter(seq_data_dict.keys())) - - seq_data: SequenceData = seq_data_dict[seq_id] - prompt_token_ids = seq_data.get_prompt_token_ids() - output_token_ids = seq_data.get_output_token_ids() - proposal_token_ids = all_proposal_tokens[ - i][:all_proposal_lengths[i]] - new_output_token_ids = [*output_token_ids, *proposal_token_ids] - - target_seq_id = target_seq_id_start + i - new_seq_data = SequenceData.from_seqs( - prompt_token_ids=prompt_token_ids, - output_token_ids=new_output_token_ids, - ) - new_seq_data.update_num_computed_tokens( - len(prompt_token_ids) + len(output_token_ids) - 1) - - # Ensure that the new decode sequence has at least one token. - assert len(output_token_ids) >= 1 - new_seq_data_dict = {target_seq_id: new_seq_data} - - new_seq_group_metadata = SequenceGroupMetadata( - request_id=seq_group_metadata.request_id, - is_prompt=seq_group_metadata.is_prompt, - seq_data=new_seq_data_dict, - sampling_params=seq_group_metadata.sampling_params, - block_tables={ - target_seq_id: seq_group_metadata.block_tables[seq_id], - }, - lora_request=None, - ) - target_seq_group_metadata_list.append(new_seq_group_metadata) - - target_sampler_output = self._scorer_worker.execute_model( - execute_model_req=execute_model_req.clone( - seq_group_metadata_list=target_seq_group_metadata_list)) - - target_sampler_output = target_sampler_output[0] - - k = execute_model_req.num_lookahead_slots - bs = len(execute_model_req.seq_group_metadata_list) - target_token_ids = target_sampler_output.sampled_token_ids - target_probs = target_sampler_output.sampled_token_probs - target_logprobs = target_sampler_output.logprobs - prompt_logprobs = None - - # If all requests have the same number of query tokens, we can avoid - # the for loop to build output for better performance. - if min(all_proposal_lengths) == k: - # Regular decodes only. - assert all(not sg.is_prompt - for sg in target_seq_group_metadata_list - if sg.is_prompt) - bs, _ = proposals.proposal_token_ids.shape - all_tokens = target_token_ids.reshape(bs, k + 1) - all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) - all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) - else: - # We either have decodes with different lens or prefill+decodes. - all_tokens = target_token_ids.new_full(size=(bs, k + 1), - fill_value=-1) - all_probs = target_probs.new_zeros(*all_tokens.shape, - self._vocab_size) - all_logprobs = target_logprobs.new_full(size=all_probs.shape, - fill_value=-float("inf")) - target_token_ids = target_token_ids.flatten() - - # When prompt logprobs is enabled, lens of returned tensors go from - # n_sampled (requests with do_sample=True) to n_prompt+n_prefills. - # We adjust stride accordingly to get the generated tokens and - # their probs, but pass on prompt_logprobs as is, since it may be - # that n_prompts >> K. - has_prompt_log = any((sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - for sg in target_seq_group_metadata_list) - # TODO (NickLucche) we should surface `disable_logprobs` as to not - # break abstraction to get its value. - if (not self._scorer_worker.model_runner.disable_logprobs\ - and has_prompt_log): - prompt_logprobs = [ - o.prompt_logprobs for o in target_sampler_output.outputs - ] - - # Split loop into prefill|decode for readability. 
- start_loc, i = 0, 0 - while i < len(target_seq_group_metadata_list - ) and target_seq_group_metadata_list[i].is_prompt: - seq_meta = target_seq_group_metadata_list[i] - end_loc = start_loc - if has_prompt_log: - end_loc += seq_meta.token_chunk_size - elif seq_meta.do_sample: - end_loc += 1 - - # Skip chunks with no output tokens. - if seq_meta.do_sample: - # Get sampled token (last position in chunk) and its prob. - all_tokens[i, 0] = target_token_ids[end_loc - 1] - all_probs[i, 0] = target_probs[end_loc - 1] - all_logprobs[i, 0] = target_logprobs[end_loc - 1] - - i += 1 - start_loc = end_loc - # Decodes. - while i < len(target_seq_group_metadata_list): - proposed_len, seq_meta = all_proposal_lengths[ - i], target_seq_group_metadata_list[i] - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc - i += 1 - - hidden_states = None - if target_sampler_output.hidden_states is not None: - hidden_states = target_sampler_output.hidden_states.reshape( - bs, (k + 1), -1) - - return SpeculativeScores(probs=all_probs, - token_ids=all_tokens, - logprobs=all_logprobs, - hidden_states=hidden_states, - prompt_logprobs=prompt_logprobs) diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py deleted file mode 100644 index 4a9bbe44d89a..000000000000 --- a/vllm/spec_decode/multi_step_worker.py +++ /dev/null @@ -1,423 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copy -import weakref -from typing import Dict, List, Set, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.platforms import current_platform -from vllm.sequence import (ExecuteModelRequest, HiddenStates, SequenceData, - SequenceGroupMetadata) - -if current_platform.is_cuda_alike(): - from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner - -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeProposer) -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer -from vllm.worker.worker_base import DelegateWorkerBase - - -class MultiStepWorker(ProposerWorkerBase, DelegateWorkerBase): - """The MultiStepWorker is equivalent to a Worker except that it allows - multiple forward passes in a single call, assuming the scheduler has - allocated enough space to store the additional KV. This reduces overhead - by invoking the scheduler less. - - The MultiStepWorker does not support cache swap operations, or beam search. - Cache swap operations do not require large modifications. On the other hand, - beam search requires memory allocations during sequence forks and thus - requires more thought for MultiStepWorker support. - """ - - def __init__(self, *args, **kwargs): - DelegateWorkerBase.__init__(self, *args, **kwargs) - # Lazy initialization list. 
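# A small sketch (toy shapes, not from the deleted files) of the MQAScorer fast path
# above: when every sequence proposed exactly k tokens, the target model returns one
# sampled token per position as a flat run of bs * (k + 1) entries, and scoring only
# needs to reshape them into a (bs, k + 1) grid of token ids.
import torch

bs, k = 2, 3
flat_sampled = torch.arange(bs * (k + 1))   # stand-in for sampled_token_ids
all_tokens = flat_sampled.reshape(bs, k + 1)
print(all_tokens)
# tensor([[0, 1, 2, 3],
#         [4, 5, 6, 7]])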
- self._proposer: SpeculativeProposer - - def init_device(self) -> None: - self.worker.init_device() - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - self.device, - self.vocab_size, - max_proposal_len=self.max_model_len, - ) - - def set_include_gpu_probs_tensor(self) -> None: - # Need include_gpu_probs_tensor for MultiStepWorker - self.model_runner.sampler.include_gpu_probs_tensor = True - if hasattr(self.model_runner.model, "sampler"): - (self.model_runner.model.sampler.include_gpu_probs_tensor) = True - - def set_should_modify_greedy_probs_inplace(self) -> None: - self.model_runner.sampler.should_modify_greedy_probs_inplace = True - if hasattr(self.model_runner.model, "sampler"): - (self.model_runner.model.sampler.should_modify_greedy_probs_inplace - ) = True - - @torch.inference_mode() - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - """Run the model forward pass sample_len times. Returns the list of - sampler output, one per model forward pass, along with indicator of - whether torch tensor in sampler output need to be transposed in latter - sampler_output_to_torch logic. - - For multi step worker, this indicator shall be True. - """ - self._raise_if_unsupported(execute_model_req) - # Expand the batch for sequences with a bonus token. - # Perform a forward pass on the expanded batch and filter the - # response to retain only the original sequences' responses. - expanded_request, indices_of_seq_with_bonus_tokens =\ - self._expand_execute_model_request( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - # Run model sample_len times. - model_outputs: List[SamplerOutput] = [] - if current_platform.is_cuda_alike() and isinstance( - self.model_runner, TP1DraftModelRunner - ) and self.model_runner.supports_gpu_multi_step(expanded_request): - # Here we run the draft_model_runner with multi-step prepare - # on the GPU directly - expanded_request.num_steps = sample_len - self.model_runner.set_indices_of_seq_with_bonus_tokens( - indices_of_seq_with_bonus_tokens) - model_outputs = self.execute_model( - execute_model_req=expanded_request) - else: - # Here we run multi-step directly, with every step prepared - # on the CPU. - # TODO: Remove this branch once DraftModelRunner supports TP>1 - # and other restrictions that are part of DraftModelRunner's - # supports_gpu_multi_step(..) 
- if expanded_request.previous_hidden_states is not None: - self.worker.model_runner.return_hidden_states = True - for _ in range(sample_len): - model_output: List[SamplerOutput] = self.worker.execute_model( - execute_model_req=expanded_request) - assert (len(model_output) == 1 - ), "composing multistep workers not supported" - model_output = model_output[0] - self._maybe_update_previous_hidden_states( - model_output, expanded_request) - - self._append_new_tokens( - model_output, expanded_request.seq_group_metadata_list, - indices_of_seq_with_bonus_tokens) - model_outputs.append(model_output) - - # move indices to device to avoid stream sync - indices_of_seq_with_bonus_tokens = torch.tensor( - indices_of_seq_with_bonus_tokens, device=self.device) - filtered_model_outputs = self._filter_model_output( - model_outputs, indices_of_seq_with_bonus_tokens) - return filtered_model_outputs, True - - @staticmethod - def _maybe_update_previous_hidden_states( - model_output: SamplerOutput, - expanded_request: ExecuteModelRequest) -> None: - """ - Updates the previous hidden states in an expanded request - in-place with the hidden states from the model output. - """ - if expanded_request.previous_hidden_states is not None: - expanded_request.previous_hidden_states = HiddenStates( - model_output.hidden_states, - expanded_request.seq_group_metadata_list) - - @staticmethod - def _expand_execute_model_request( - execute_model_req: ExecuteModelRequest, - seq_with_bonus_token_in_last_step: set, - ) -> Tuple[ExecuteModelRequest, List[int]]: - """ - Expands the execute model request based on sequences with bonus - tokens. - - For each sequence with a bonus token, this method creates a new - sequence without the bonus token and adds it to the execute model - request. The original sequence groups are also retained. The indices - of the original sequence groups are returned for further processing. - - Args: - execute_model_req (ExecuteModelRequest): The original execute - model request. - seq_with_bonus_token_in_last_step (set): Set of sequence IDs that - contain bonus tokens. - - Returns: - Tuple[ExecuteModelRequest, List[int]]: The updated execute model - request with expanded sequences and a list of indices corresponding - to the original sequence groups. - """ - updated_seq_group_metadata_list: List[SequenceGroupMetadata] = [] - updated_execute_model_req = execute_model_req.clone( - updated_seq_group_metadata_list) - indices_of_original_sequence_groups = [] - for seq_group in execute_model_req.seq_group_metadata_list: - seq_group_has_bonus_tokens = False - for seq_id, _ in seq_group.seq_data.items(): - # Identify sequences with bonus tokens in the sequence group. - if seq_id in seq_with_bonus_token_in_last_step: - seq_group_has_bonus_tokens = True - break - if seq_group_has_bonus_tokens: - #Create new sequences without the last bonus token. These new - # sequence have the same sequence id as the original sequence. - # We create a new sequence group and add them there. - updated_seq_group_without_bonus_token = \ - MultiStepWorker._copy_seq_metadata_excluding_last_token( - seq_group, seq_with_bonus_token_in_last_step) - updated_seq_group_metadata_list.append( - updated_seq_group_without_bonus_token) - # Add the original sequence group. - updated_seq_group_metadata_list.append( - MultiStepWorker._shallow_copy_seq_group_metadata(seq_group)) - # Record the index of the original sequence group. 
- indices_of_original_sequence_groups.append( - len(updated_seq_group_metadata_list) - 1) - - updated_execute_model_req.seq_group_metadata_list =\ - updated_seq_group_metadata_list - - if isinstance(updated_execute_model_req.previous_hidden_states, - HiddenStates): - updated_execute_model_req.previous_hidden_states\ - .expand_with_bonus_tokens(seq_with_bonus_token_in_last_step) - - return updated_execute_model_req, indices_of_original_sequence_groups - - @staticmethod - def _filter_model_output( - expanded_batch_outputs: List[SamplerOutput], - output_indices_to_retain: torch.Tensor) -> List[SamplerOutput]: - """ - Filters the model output to include only the specified sequence - outputs. This method contracts the expanded batch output from the - model to retain the outputs of only those sequences indicated by the - provided indices. - - Args: - expanded_batch_output (List[SamplerOutput]): The expanded output - batch from the model. - output_indices_to_retain (torch.Tensor): Indices of the model - outputs to retain. - - Returns: - List[SamplerOutput]: A list containing the filtered model - outputs for the specified indices. - """ - return [ - SamplerOutput( - outputs=[ - expanded_batch_output.outputs[i] - for i in output_indices_to_retain - ] if len(expanded_batch_output.outputs) > 0 else [], - sampled_token_probs=( - expanded_batch_output. - sampled_token_probs[output_indices_to_retain] - if expanded_batch_output.sampled_token_probs is not None - else None), - logprobs=( - expanded_batch_output.logprobs[output_indices_to_retain] - if expanded_batch_output.logprobs is not None else None), - sampled_token_ids=(expanded_batch_output. - sampled_token_ids[output_indices_to_retain] - if expanded_batch_output.sampled_token_ids - is not None else None)) - for expanded_batch_output in expanded_batch_outputs - ] - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: set, - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - @staticmethod - def _append_new_tokens( - model_output: List[SamplerOutput], - seq_group_metadata_list: List[SequenceGroupMetadata], - indices_of_seq_with_bonus_tokens: List[int]) -> None: - """Given model output from a single run, append the tokens to the - sequences. This is normally done outside of the worker, but it is - required if the worker is to perform multiple forward passes. - """ - count = 0 - for index, (seq_group_metadata, sequence_group_outputs) in enumerate( - zip(seq_group_metadata_list, model_output)): - seq_group_metadata.is_prompt = False - - for seq_output in sequence_group_outputs.samples: - # NOTE: Beam search is not supported, so we can assume that - # parent_seq_id == seq_id. 
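# A toy sketch (plain Python, hypothetical sequence labels) of the bonus-token batch
# expansion performed by _expand_execute_model_request / _filter_model_output above:
# each group whose sequence received a bonus token is duplicated (first a copy without
# the bonus token for KV backfill, then the original), and only the recorded indices
# of the originals are kept when the model output is filtered afterwards.
seq_ids = [0, 1, 2]
seq_with_bonus = {1}

expanded = []
indices_of_originals = []
for seq_id in seq_ids:
    if seq_id in seq_with_bonus:
        expanded.append(f"seq{seq_id}-without-bonus")   # extra entry for KV backfill
    expanded.append(f"seq{seq_id}")
    indices_of_originals.append(len(expanded) - 1)

print(expanded)               # ['seq0', 'seq1-without-bonus', 'seq1', 'seq2']
print(indices_of_originals)   # [0, 2, 3]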
- seq = seq_group_metadata.seq_data[seq_output.parent_seq_id] - - token_id = seq_output.output_token - token_logprob = seq_output.logprobs[token_id] - # Determine the actual token ID to be generated, - # considering bonus tokens - if index != indices_of_seq_with_bonus_tokens[count]: - bonus_seq_metadata = seq_group_metadata_list[ - indices_of_seq_with_bonus_tokens[count]] - _, bonus_token_seq_data = next( - iter(bonus_seq_metadata.seq_data.items())) - token_id = bonus_token_seq_data.output_token_ids[-1] - else: - count += 1 - - seq.append_token_id(token_id, token_logprob.logprob, - seq_output.output_embed) - seq.update_num_computed_tokens(1) - - @staticmethod - def _shallow_copy_seq_group_metadata( - seq_group_metadata: SequenceGroupMetadata, ) -> SequenceGroupMetadata: - """Copy input data structures to remove side-effects when input data - structures are shared with other modules. - - Helpful when the vLLM scheduler runs in the same process as the worker. - The alternative is deep-copying (or other form of deep copy); this has - performance downsides. - """ - # Shallow-copy the SequenceGroupMetadata. This allows us to - # append tokens and change is_prompt without external side-effects. - # We must shallow-copy seq_group_metadata as is_prompt could change. - new_seq_group_metadata = copy.copy(seq_group_metadata) - - # We must shallow-copy seq_data as we will append token ids - new_seq_data: Dict[int, SequenceData] = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - new_seq_data[seq_id] = copy.copy(old_seq_data) - new_seq_data[seq_id].output_token_ids =\ - old_seq_data.output_token_ids[:] - - new_seq_group_metadata.seq_data = new_seq_data - return new_seq_group_metadata - - @staticmethod - def _copy_seq_metadata_excluding_last_token( - seq_group_metadata: SequenceGroupMetadata, - seq_ids_to_copy: Set[int], - ) -> SequenceGroupMetadata: - """ - Creates a shallow copy of the given SequenceGroupMetadata, retaining - only the sequence IDs specified in seq_ids_to_copy. For each of these - sequence IDs, all output_token_ids except the last one are copied. - Sequence IDs not in seq_ids_to_copy are excluded from the copy. - - Parameters: - seq_group_metadata (SequenceGroupMetadata): The original sequence - group metadata. - seq_ids_to_copy (Set[int]): The set of sequence IDs to include in the - copy. - - Returns: - SequenceGroupMetadata: A shallow copy of the sequence group metadata - with the specified modifications. - """ - # Shallow-copy the SequenceGroupMetadata. - new_seq_group_metadata = copy.copy(seq_group_metadata) - # Shallow-copy seq_data and modify the output_token_ids. - new_seq_data: Dict[int, SequenceData] = {} - for seq_id, old_seq_data in seq_group_metadata.seq_data.items(): - if (seq_id in seq_ids_to_copy): - new_seq_data[seq_id] = copy.copy(old_seq_data) - # Copy all the output token ids except the last. - # Also reduce num_computed_tokens by 1 since we are not - # including the last output token. - # NOTE: num_computed_tokens is not directly used by the - # speculative decoding workers, as it is only relevant for - # chunked prefill, which is disabled for speculative decoding. - # However, to maintain consistency in num_computed_tokens, - # we update it here. 
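# A minimal sketch (plain lists, hypothetical values) of the per-sequence effect of
# _copy_seq_metadata_excluding_last_token above: the shallow copy drops the final
# (bonus) output token and decrements the computed-token count by one so the two
# bookkeeping values stay consistent.
output_token_ids = [101, 102, 103, 104]           # 104 was the bonus token
num_computed_tokens = 10

copied_output_token_ids = output_token_ids[:-1]   # [101, 102, 103]
copied_num_computed_tokens = num_computed_tokens - 1
print(copied_output_token_ids, copied_num_computed_tokens)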
- new_seq_data[seq_id].output_token_ids =\ - old_seq_data.output_token_ids[:-1] - new_seq_data[seq_id].update_num_computed_tokens(-1) - new_seq_group_metadata.seq_data = new_seq_data - return new_seq_group_metadata - - def _assert_enough_kv_space( - self, seq_group_metadata_list: List[SequenceGroupMetadata], - num_steps: int) -> None: - """Assert there are enough physical blocks per sequence to store the - current KV plus additional KV from num_steps tokens. - """ - assert self.model_runner.block_size is not None - for seq_group_metadata in seq_group_metadata_list: - # Only one seq_id is guaranteed because there is no beam search. - seq_id = list(seq_group_metadata.seq_data.keys())[0] - seq = seq_group_metadata.seq_data[seq_id] - - # After num_steps, the seq len will be the current seq len - # plus one token per step. - final_seq_len = seq.get_len() + num_steps - - # We will have final_seq_len - 1 KV because vLLM saves KV for a - # token in the iteration after the token was generated. - required_num_kv_slots = final_seq_len - 1 - - # The allocated number of kv slots is the number of allocated blocks - # times the number of slots of block. - number_physical_blocks = len( - seq_group_metadata.block_tables[seq_id]) - allocated_kv_slots = (number_physical_blocks * - self.model_runner.block_size) - - if required_num_kv_slots > allocated_kv_slots: - request_id = seq_group_metadata.request_id - raise ValueError( - "The worker attempted to run " - f"{num_steps} times but found insufficient KV space for " - f"{request_id=} {seq_id=}. ({allocated_kv_slots=} " - f"{required_num_kv_slots=}).") - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """MultiStepWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "MultiStepWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "MultiStepWorker does not support beam search.") - - def maybe_load_lm_head_weight( - self, - lm_head_weight: torch.Tensor, - ) -> None: - weight_loader = getattr( - self.worker.model_runner.model_runner.model.lm_head.weight, - "weight_loader", default_weight_loader) - weight_loader( - self.worker.model_runner.model_runner.model.lm_head.weight, - lm_head_weight) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py deleted file mode 100644 index 7a1a0e56dc00..000000000000 --- a/vllm/spec_decode/ngram_worker.py +++ /dev/null @@ -1,196 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import weakref -from typing import List, Optional, Set, Tuple - -import torch -import torch.nn as nn - -from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase -from vllm.spec_decode.top1_proposer import Top1Proposer - - -class _DummyModel(nn.Module): - pass - - -class NGramWorker(NonLLMProposerWorkerBase): - """NGramWorker provides a light drafter without need for model. 
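# A worked sketch (illustrative numbers) of the KV-slot accounting in
# _assert_enough_kv_space above: after num_steps additional tokens a sequence needs
# final_seq_len - 1 KV slots, while the allocation is block-granular.
block_size = 16
num_allocated_blocks = 3
seq_len = 40
num_steps = 8

required_kv_slots = (seq_len + num_steps) - 1            # 47
allocated_kv_slots = num_allocated_blocks * block_size   # 48
assert required_kv_slots <= allocated_kv_slots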
- - Current NGramWorker only implements prompt lookup decoding, - and in future we may also do RAG type drafter and other scenarios - which don't rely on LLM model to give proposals. - """ - - def __init__( - self, - vllm_config: VllmConfig, - local_rank: int, - device_type: str = "cuda", - **kwargs, - ): - super().__init__(vllm_config) - - # Get local_rank/vocab_size from kwargs attribute - self.local_rank = local_rank - self.device_type = device_type - - # Lazy initialization list. - self._proposer: Top1Proposer - - def set_ngram_window_size(self, ngram_prompt_lookup_min: int, - ngram_prompt_lookup_max: int): - # Search valid candidate window between - # ngram_prompt_lookup_min/ngram_prompt_lookup_max - self.ngram_prompt_lookup_max = ngram_prompt_lookup_max - self.ngram_prompt_lookup_min = ngram_prompt_lookup_min - - def init_device(self): - self.device = torch.device(f"{self.device_type}:{self.local_rank}") - - # Current NGramWorker only supports Top1Proposer - self._proposer = Top1Proposer( - weakref.proxy(self), # type: ignore[arg-type] - device=self.device, - vocab_size=self.vocab_size, - ) - - def load_model(self) -> None: - pass # Dummy - - def get_model(self) -> nn.Module: - return _DummyModel() - - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # Unused parameter. NGramWorker does not use the KV Cache and - # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[Optional[List[Optional[SamplerOutput]]], bool]: - """NGram match algo to pick proposal candidate. Returns the list of - sampler output, one per SequenceGroupMetadata. - - For ngram worker, we already done needed transposed internal, so the - indicator pass to sampler_output_to_torch shall be False. - """ - self._raise_if_unsupported(execute_model_req) - - has_spec_out = False - token_id_list: List[Optional[torch.Tensor]] = [] - token_prob_list: List[Optional[torch.Tensor]] = [] - for idx, seq_group_metadata in enumerate( - execute_model_req.seq_group_metadata_list): - seq_data = next(iter(seq_group_metadata.seq_data.values())) - - seq_len = seq_data.get_len() - # When seq_len is less than 3072 (3K), we use CPU to perform - # the ngram match. Otherwise, we use the device specified in - # the model config (normally GPU). 3072 is a rough threshold - # based on profiling on H100, and it can be adjusted based - # on the actual performance on different hardware. - cur_device = "cpu" if seq_len < 3072 else self.device - input_ids = torch.as_tensor(seq_data.get_token_ids(), - dtype=torch.long, - device=cur_device) - input_length = seq_data.get_len() - - for ngram_size in range( - min(self.ngram_prompt_lookup_max, input_length - 1), - self.ngram_prompt_lookup_min - 1, - -1, - ): - ngram_tensor = input_ids[-ngram_size:] - if ngram_size == 1: - # Do not match itself and do not use unfold and all - matches = (input_ids[:-1] == ngram_tensor) - else: - windows = input_ids.unfold(dimension=0, - size=ngram_size, - step=1) - # Do not match itself - matches = (windows[:-1] == ngram_tensor).all(dim=-1) - - # first_match includes "values" (bool), indicating whether - # the match is found, and "indices", indicating the index - # of the first match. 
- first_match = matches.max(dim=-1) - if first_match.values.item(): - proposal_start_idx = first_match.indices.add_(ngram_size) - spec_indices = ( - proposal_start_idx).repeat(sample_len) + torch.arange( - sample_len, device=cur_device) - spec_indices.clamp_(max=input_ids.shape[-1] - 1) - res = input_ids.gather(dim=-1, - index=spec_indices).to(self.device) - token_id_list.append(res) - token_prob_list.append( - torch.nn.functional.one_hot( - res, - num_classes=self.vocab_size).to(torch.float32)) - has_spec_out = True - break - else: - token_id_list.append(None) - token_prob_list.append(None) - - if not has_spec_out: - return None, False - - outputs: List[Optional[SamplerOutput]] = [] - for idx in range(len(execute_model_req.seq_group_metadata_list)): - if token_id_list[idx] is None: - outputs.append(None) - else: - outputs.append( - SamplerOutput( - outputs=None, - sampled_token_probs=token_prob_list[idx], - logprobs=torch.zeros((sample_len, self.vocab_size), - dtype=torch.float32, - device=self.device), - sampled_token_ids=token_id_list[idx], - )) - - return outputs, False - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - # Unused parameter. NGramWorker does not use the KV Cache and - # therefore does not need this parameter. - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. - """ - return self._proposer.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def _raise_if_unsupported( - self, - execute_model_req: ExecuteModelRequest, - ) -> None: - """NGramWorker does not yet implement support for cache swap - operations or beam search. - """ - if any([ - execute_model_req.blocks_to_swap_in, - execute_model_req.blocks_to_swap_out, - execute_model_req.blocks_to_copy - ]): - raise NotImplementedError( - "NGramWorker does not support cache operations") - - if any( - len(seq_group_metadata.seq_data.keys()) != 1 - for seq_group_metadata in - execute_model_req.seq_group_metadata_list): - raise NotImplementedError( - "NGramWorker does not support beam search.") diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py deleted file mode 100644 index fb44275aa935..000000000000 --- a/vllm/spec_decode/proposer_worker_base.py +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from abc import ABC, abstractmethod -from typing import List, Optional, Set, Tuple - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposer -from vllm.worker.worker_base import LoRANotSupportedWorkerBase - - -class ProposerWorkerBase(LoRANotSupportedWorkerBase, SpeculativeProposer): - """Interface for proposer workers""" - - @abstractmethod - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - # A set containing all sequence IDs that were assigned bonus tokens - # in their last forward pass. This set is used to backfill the KV cache - # with the key-value pairs of the penultimate token in the sequences. - # This parameter is only used by the MultiStepWorker, which relies on - # the KV cache for token generation. It is not used by workers that - # do not utilize the KV cache. 
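# A self-contained sketch (toy token ids, CPU only) of the prompt-lookup matching
# done by NGramWorker above: slide an n-gram window over the context with unfold,
# look for an earlier occurrence of the trailing n-gram, and propose the tokens that
# followed it. The token values and window sizes here are illustrative assumptions.
import torch

input_ids = torch.tensor([5, 6, 7, 8, 9, 3, 5, 6])
ngram_size, sample_len = 2, 3

ngram = input_ids[-ngram_size:]                                    # tensor([5, 6])
windows = input_ids.unfold(dimension=0, size=ngram_size, step=1)
matches = (windows[:-1] == ngram).all(dim=-1)                      # exclude the trailing n-gram itself
first_match = matches.max(dim=-1)
if first_match.values.item():
    start = first_match.indices.item() + ngram_size
    spec_indices = (torch.arange(sample_len) + start).clamp_(max=input_ids.shape[-1] - 1)
    proposal = input_ids.gather(dim=-1, index=spec_indices)
    print(proposal)   # tensor([7, 8, 9]) -- tokens that followed the earlier [5, 6]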
- seq_ids_with_bonus_token_in_last_step: Set[int] - ) -> Tuple[Optional[List[SamplerOutput]], bool]: - raise NotImplementedError - - def set_include_gpu_probs_tensor(self) -> None: - """Implementation optional""" - pass - - def set_should_modify_greedy_probs_inplace(self) -> None: - """Implementation optional""" - pass - - -class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC): - """Proposer worker which does not use a model with kvcache""" - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """get_spec_proposals is used to get the proposals""" - return [] - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """This is never called on the proposer, only the target model""" - raise NotImplementedError - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - pass - - def get_cache_block_size_bytes(self) -> int: - return 0 diff --git a/vllm/spec_decode/smaller_tp_proposer_worker.py b/vllm/spec_decode/smaller_tp_proposer_worker.py deleted file mode 100644 index 91256cab6e79..000000000000 --- a/vllm/spec_decode/smaller_tp_proposer_worker.py +++ /dev/null @@ -1,196 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch -import torch.nn as nn - -from vllm.distributed.parallel_state import (get_tp_group, - init_model_parallel_group, - patch_tensor_parallel_group) -from vllm.logger import init_logger -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.sequence import ExecuteModelRequest -from vllm.spec_decode.interfaces import SpeculativeProposals -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase - -logger = init_logger(__name__) - - -class _DummyModel(nn.Module): - pass - - -class SmallerTpProposerWorker(ProposerWorkerBase): - """Class which allows a speculative draft model to run with smaller tensor - parallel degree than target model. - This reduces the communication overhead of small draft models. - - To implement this feature, this class differs behavior based on is_dummy - flag, where dummy means worker that does not participate draft generation. - Participating workers use a smaller tp group by patching vLLM's tensor - parallel group temporarily during forward passes of draft models. - """ - - @classmethod - def maybe_wrap_worker(cls, worker, draft_tensor_parallel_size: int, - target_tensor_parallel_size: int): - """Wrap the worker in a SmallerTpProposerWorker if necessary. - """ - if draft_tensor_parallel_size == target_tensor_parallel_size: - return worker - - # gpu ranks that will generate draft tokens together - draft_ranks = list(range(draft_tensor_parallel_size)) - - logger.info("Wrapping {%s} in {%s}", type(worker), cls) - return cls(worker, draft_ranks) - - def __init__(self, worker: MultiStepWorker, draft_ranks: List[int]): - """Create a SmallerTpProposerWorker. 
- - Args: - worker (~vllm.spec_decode.multi_step_worker.MultiStepWorker): an - actual worker wrapped with this class - draft_ranks (List[int]): if this value is given, only the GPU ranks - written in this value participate in draft generation - """ - self._worker = worker - self._draft_ranks = draft_ranks - - # init during init_device - self._is_dummy = False - self._tp_group = None - - def _patch_tensor_parallel_group(self): - """Temporarily patch the global tp group state with its own tp group - state. - """ - return patch_tensor_parallel_group(self._tp_group) - - def init_device(self) -> None: - self._is_dummy = get_tp_group().rank not in self._draft_ranks - - # dummy workers do nothing - if self._is_dummy: - return - - # creates tp process group containing only a subset of gpu ranks - local_rank = get_tp_group().local_rank - tp_backend = torch.distributed.get_backend(get_tp_group().device_group) - self._tp_group = init_model_parallel_group([self._draft_ranks], - local_rank, tp_backend) - - with self._patch_tensor_parallel_group(): - self._worker.init_device() - - def set_include_gpu_probs_tensor(self) -> None: - if self._is_dummy: - return - - # Need include_gpu_probs_tensor for multi_step_worker - self._worker.set_include_gpu_probs_tensor() - - def set_should_modify_greedy_probs_inplace(self) -> None: - if self._is_dummy: - return - - self._worker.set_should_modify_greedy_probs_inplace() - - def load_model(self) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - self._worker.load_model() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - if self._is_dummy: - # this case is not used now - return -1, -1 - - with self._patch_tensor_parallel_group(): - return self._worker.determine_num_available_blocks() - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - self._worker.initialize_cache(num_gpu_blocks, num_cpu_blocks) - - def sampler_output( - self, - execute_model_req: ExecuteModelRequest, - sample_len: int, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> Tuple[List[SamplerOutput], bool]: - # Do not check _is_dummy, as it's always called by get_spec_proposals - return self._worker.sampler_output( - execute_model_req, sample_len, - seq_ids_with_bonus_token_in_last_step) - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Produce speculations given an input batch of sequences. The number of - speculative tokens per sequence is determined by max_proposal_len. 
- """ - if self._is_dummy: - return SpeculativeProposals(None, None, None) - - with self._patch_tensor_parallel_group(): - return self._worker.get_spec_proposals( - execute_model_req, seq_ids_with_bonus_token_in_last_step) - - def get_model(self) -> nn.Module: - if self._is_dummy: - return _DummyModel() - - with self._patch_tensor_parallel_group(): - return self._worker.get_model() - - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - if self._is_dummy: - return [] - - with self._patch_tensor_parallel_group(): - return self._worker.execute_model(execute_model_req) - - def get_cache_block_size_bytes(self) -> int: - if self._is_dummy: - # by returning zero, target worker can use the entire kv cache space - return 0 - - return self._worker.get_cache_block_size_bytes() - - @property - def vocab_size(self) -> int: - return self._worker.vocab_size - - def maybe_load_lm_head_weight( - self, - lm_head_weight: torch.Tensor, - ) -> None: - if self._is_dummy: - return - - with self._patch_tensor_parallel_group(): - weight_loader = getattr( - self._worker.worker.model_runner.model_runner.model.\ - lm_head.weight, - "weight_loader", - default_weight_loader) - weight_loader( - self._worker.worker.model_runner.model_runner.model.\ - lm_head.weight, - lm_head_weight) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py deleted file mode 100644 index 7dda1cbfe230..000000000000 --- a/vllm/spec_decode/spec_decode_worker.py +++ /dev/null @@ -1,1326 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import copy -from collections import defaultdict -from functools import cached_property -from typing import Any, Dict, List, Optional, Set, Tuple, Type - -import torch -import torch.nn as nn - -from vllm.config import ParallelConfig, SpeculativeConfig, VllmConfig -from vllm.distributed.communication_op import (broadcast_tensor_dict, - get_tp_group, - tensor_model_parallel_gather) -from vllm.distributed.parallel_state import model_parallel_is_initialized -from vllm.logger import init_logger -from vllm.model_executor.layers.rejection_sampler import RejectionSampler -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.layers.spec_decode_base_sampler import ( - SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) -from vllm.model_executor.layers.typical_acceptance_sampler import ( - TypicalAcceptanceSampler) -from vllm.platforms import current_platform -from vllm.sequence import (VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, ExecuteModelRequest, - HiddenStates, SequenceGroupMetadata, - get_all_seq_ids_and_request_ids) -from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer - -if current_platform.is_cuda_alike(): - from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner - -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeScorer, SpeculativeScores) -from vllm.spec_decode.medusa_worker import MedusaWorker -from vllm.spec_decode.metrics import AsyncMetricsCollector -from vllm.spec_decode.mlp_speculator_worker import MLPSpeculatorWorker -from vllm.spec_decode.mqa_scorer import MQAScorer -from vllm.spec_decode.multi_step_worker import MultiStepWorker -from vllm.spec_decode.ngram_worker import NGramWorker -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.smaller_tp_proposer_worker import SmallerTpProposerWorker 
-from vllm.spec_decode.target_model_runner import TargetModelRunner -from vllm.spec_decode.util import (Timer, create_logprobs_output, - create_sequence_group_output, - get_all_num_logprobs, - get_sampled_token_logprobs, nvtx_range, - split_batch_by_proposal_len) -from vllm.utils import resolve_obj_by_qualname -from vllm.worker.worker_base import LoRANotSupportedWorkerBase, WorkerBase - -logger = init_logger(__name__) - - -def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": - """Helper method that is the entrypoint for Executors which use - WorkerWrapper. It constructs a SpecDecodeWorker from the speculative config. - """ - vllm_config: VllmConfig = kwargs.get("vllm_config") - speculative_config: SpeculativeConfig = vllm_config.speculative_config - assert speculative_config is not None - - if vllm_config.parallel_config.pipeline_parallel_size > 1: - raise NotImplementedError("Speculative decoding is currently " - "incompatible with pipeline parallelism") - - draft_worker_kwargs = kwargs.copy() - - kwargs["model_runner_cls"] = TargetModelRunner - target_worker_config = copy.deepcopy(vllm_config) - target_worker_config.parallel_config.worker_cls =\ - target_worker_config.parallel_config.sd_worker_cls - cls = resolve_obj_by_qualname( - target_worker_config.parallel_config.worker_cls) - target_worker = cls(*args, **kwargs) - # Set the disable_logprobs variable in the TargetModelRunner instance - # as per its value specified in the SpeculativeConfig. - target_worker.model_runner.disable_logprobs =\ - speculative_config.disable_logprobs - - draft_worker_config = copy.deepcopy(vllm_config) - draft_worker_config.model_config = speculative_config.draft_model_config - draft_worker_config.quant_config = VllmConfig._get_quantization_config( - draft_worker_config.model_config, - vllm_config.load_config, - ) - speculative_config.draft_parallel_config.worker_cls =\ - draft_worker_config.parallel_config.sd_worker_cls - draft_worker_config.parallel_config = speculative_config.draft_parallel_config # noqa - # TODO allow draft-model specific load config. - - # Override draft-model specific worker args. - draft_worker_kwargs.update( - vllm_config=draft_worker_config, - ngram_prompt_lookup_max=speculative_config.prompt_lookup_max, - ngram_prompt_lookup_min=speculative_config.prompt_lookup_min, - ) - - spec_decode_worker = SpecDecodeWorker.create_worker( - scorer_worker=target_worker, - draft_worker_kwargs=draft_worker_kwargs, - disable_mqa_scorer=speculative_config.disable_mqa_scorer, - disable_by_batch_size=speculative_config.disable_by_batch_size, - draft_token_acceptance_method=speculative_config.acceptance_method, - typical_acceptance_sampler_posterior_threshold=speculative_config. - posterior_threshold, - typical_acceptance_sampler_posterior_alpha=speculative_config. - posterior_alpha, - disable_logprobs=speculative_config.disable_logprobs, - disable_log_stats=speculative_config.disable_log_stats, - num_speculative_tokens=speculative_config.num_speculative_tokens, - ) - - return spec_decode_worker - - -# Reminder: Please update docs/features/compatibility_matrix.md -# If the feature combo become valid -class SpecDecodeWorker(LoRANotSupportedWorkerBase): - """Worker which implements speculative decoding. - - Speculative decoding reduces decoding per-token latency by using a proposal - method, such as a small draft model, to speculate ahead of a larger LLM. 
The - probabilities of the speculative tokens are then determined by the larger - LLM, after which some verification routine determines which (if any) of the - speculative tokens are accepted by the larger LLM. - - See https://github.com/vllm-project/vllm/pull/2188 and - https://github.com/vllm-project/vllm/pull/3103 for more info. - - The current implementation has the following limitations: - * Only draft-model proposal is implemented (contributions for more forms are - welcome!). - * Only top-1 proposal and scoring are implemented. Tree-attention is left as - future work. - * All sequences in a batch must have the same proposal length, or zero. This - can be improved by having per-sequence speculation in the future. - * The scoring forward pass is done without an MQA kernel, which is - suboptimal especially as the batch size, proposal length, and sequence - lengths grow. Contributions to add a MQA scoring are welcome once - correctness tests pass. - More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit. - """ - - @classmethod - def create_worker( - cls, - scorer_worker: WorkerBase, - draft_worker_kwargs: Dict[str, Any], - disable_mqa_scorer: bool, - disable_by_batch_size: Optional[int], - draft_token_acceptance_method: str, - typical_acceptance_sampler_posterior_threshold: float, - typical_acceptance_sampler_posterior_alpha: float, - disable_logprobs: bool, - disable_log_stats: bool, - num_speculative_tokens: int, - ) -> "SpecDecodeWorker": - - allow_zero_draft_token_step = True - enable_lm_head_weight_load = False - num_spec_prefill_steps = 1 - ngram_prompt_lookup_max = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_max")) - ngram_prompt_lookup_min = ( - draft_worker_kwargs.pop("ngram_prompt_lookup_min")) - draft_model_config = draft_worker_kwargs["vllm_config"].model_config - draft_parallel_config: ParallelConfig = draft_worker_kwargs[ - 'vllm_config'].parallel_config - if ngram_prompt_lookup_max > 0: - draft_worker_kwargs[ - "device_type"] = scorer_worker.device_config.device.type - proposer_worker = NGramWorker(**draft_worker_kwargs) - proposer_worker.set_ngram_window_size(ngram_prompt_lookup_min, - ngram_prompt_lookup_max) - else: - draft_tp = draft_parallel_config.tensor_parallel_size - target_tp = scorer_worker.parallel_config.tensor_parallel_size - - if draft_model_config.hf_config.model_type == "mlp_speculator": - proposer_worker = MLPSpeculatorWorker(**draft_worker_kwargs) - elif draft_model_config.hf_config.model_type == "medusa": - proposer_worker = MedusaWorker(**draft_worker_kwargs) - else: - if draft_tp == 1: - if current_platform.is_cuda_alike(): - draft_worker_kwargs[ - "model_runner_cls"] = TP1DraftModelRunner - else: - if draft_model_config.hf_config.model_type == "eagle": - raise NotImplementedError( - f"{draft_model_config.hf_config.model_type} " - "does not support TP > 1 yet") - - allow_zero_draft_token_step = False - - # Load lm_head weight for eagle in init_device - if draft_model_config.hf_config.model_type == "eagle": - enable_lm_head_weight_load = True - - proposer_worker = MultiStepWorker(**draft_worker_kwargs) - if draft_model_config.hf_config.model_type == "deepseek_mtp": - num_spec_prefill_steps = \ - draft_model_config.hf_config.n_predict - - proposer_worker = SmallerTpProposerWorker.maybe_wrap_worker( - proposer_worker, draft_tp, target_tp) - - logger.info("Configuring SpecDecodeWorker with proposer=%s", - type(proposer_worker)) - - spec_decode_sampler: SpecDecodeBaseSampler = None - if 
draft_token_acceptance_method == "rejection_sampler": - spec_decode_sampler = RejectionSampler() - elif draft_token_acceptance_method == "typical_acceptance_sampler": - spec_decode_sampler = TypicalAcceptanceSampler( - posterior_threshold=\ - typical_acceptance_sampler_posterior_threshold, - posterior_alpha=typical_acceptance_sampler_posterior_alpha, - ) - logger.info( - "[Speculative Decoding] Configuring" - " SpecDecodeWorker with sampler=%s", type(spec_decode_sampler)) - - if not disable_mqa_scorer: - if scorer_worker.model_runner.attn_backend.get_name( - ) != "FLASH_ATTN": - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "MQA is only available with flash attn backend.") - - if draft_model_config and \ - draft_model_config.max_model_len < \ - scorer_worker.model_config.max_model_len: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "draft model max_model_len is smaller than the target " - "model max_model_len.") - - if not scorer_worker.model_runner.model_config.enforce_eager: - disable_mqa_scorer = True - logger.info( - "[Speculative Decoding] Disabling MQA scorer as the " - "target model is not running in eager mode.") - - return SpecDecodeWorker( - proposer_worker, - scorer_worker, - disable_mqa_scorer=disable_mqa_scorer, - disable_logprobs=disable_logprobs, - disable_log_stats=disable_log_stats, - disable_by_batch_size=disable_by_batch_size, - spec_decode_sampler=spec_decode_sampler, - allow_zero_draft_token_step=allow_zero_draft_token_step, - enable_lm_head_weight_load=enable_lm_head_weight_load, - num_spec_prefill_steps=num_spec_prefill_steps) - - def __init__( - self, - proposer_worker: ProposerWorkerBase, - scorer_worker: WorkerBase, - spec_decode_sampler: SpecDecodeBaseSampler, - disable_mqa_scorer: bool = False, - disable_logprobs: bool = False, - disable_log_stats: bool = False, - metrics_collector: Optional[AsyncMetricsCollector] = None, - disable_by_batch_size: Optional[int] = None, - allow_zero_draft_token_step: Optional[bool] = True, - enable_lm_head_weight_load: Optional[bool] = False, - num_spec_prefill_steps: int = 1, - ): - """ - Create a SpecDecodeWorker. - - Args: - proposer_worker: A worker that can produce speculative tokens for - sequences. - scorer_worker: A worker that produces probabilities of speculative - tokens according to some base model. Typically a vanilla vLLM - Worker. - spec_decode_sampler: A Torch module used to perform acceptance - sampling of the draft tokens in the verification step of - speculative decoding. Currently we support two different - types of sampler namely RejectionSampler and - TypicalAcceptanceSampler. 'spec_decode_sampler' is either an - instance of RejectionSampler or TypicalAcceptanceSampler. - disable_mqa_scorer: If set to True, disable the MQA scorer and use - the BatchExpansionTop1Scorer instead. - disable_logprobs: If set to True, token log probabilities will - not be output in both the draft worker and the target worker. - If set to False, log probabilities will be output by both. - disable_log_stats: If set to True, disable periodic printing of - speculative stage times. - disable_by_batch_size: If the batch size is larger than this, - disable speculative decoding for new incoming requests. - metrics_collector: Helper class for collecting metrics; can be set - for testing purposes. 
- allow_zero_draft_token_step: whether to allow a step where the draft - model generates no draft token; should disallow when the tp of - draft model is larger than 1 (TODO: #5814) - enable_lm_head_weight_load: whether to load lm_head weight for - draft models like eagle. - num_spec_prefill_steps: number of speculative prefill steps to run - before the speculative decoding starts. This is only used when - the draft model is a deepseek_mtp model that requires prefill - kv cache separately for each MTP layer. - """ - self.proposer_worker = proposer_worker - self.scorer_worker = scorer_worker - scorer_runner = getattr(self.scorer_worker, "model_runner", None) - self.generators = scorer_runner.get_generators( - ) if scorer_runner else None - self.disable_by_batch_size = disable_by_batch_size or float("inf") - self.spec_decode_sampler = spec_decode_sampler - self._allow_zero_draft_token_step = allow_zero_draft_token_step - self._enable_lm_head_weight_load = enable_lm_head_weight_load - self._metrics = AsyncMetricsCollector( - self.spec_decode_sampler - ) if metrics_collector is None else metrics_collector - # Tracks the sequence IDs that received a bonus token ID in - # their last forward pass. Needed only if KV cache is being - # used for token generation such as in the case of MultiStepWorker. - self._seq_with_bonus_token_in_last_step: Set[int] = set() - # Tracks the currently active request ids and the sequence IDs - # corresponding to them - self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) - # Tracks if the proposer worker uses the KV cache or not. - - self.probs_dtype = self.spec_decode_sampler.probs_dtype - self.token_id_dtype = self.spec_decode_sampler.token_id_dtype - # Lazy initialization. - self.scorer: SpeculativeScorer - self.disable_mqa_scorer = disable_mqa_scorer - - # Hidden states from target model to pass to proposer - # in the subsequent step. - self.previous_hidden_states: Optional[HiddenStates] = None - self._disable_logprobs = disable_logprobs - self._disable_log_stats = disable_log_stats - self._num_spec_prefill_steps = num_spec_prefill_steps - - def init_device(self) -> None: - """Initialize both scorer and proposer models. - """ - # The scorer worker model is initialized first in case the proposer - # model has a smaller TP degree than the target worker. - self.scorer_worker.init_device() - self.proposer_worker.init_device() - - # NOTE(cade): load_model is not part of the WorkerBase interface. 
- self.scorer_worker.load_model() - self.proposer_worker.load_model() - - if self._enable_lm_head_weight_load: - # NOTE(Shangming): gather lm_head weight when tp enabled - target_lm_head_weight: torch.Tensor = tensor_model_parallel_gather( - self.scorer_worker.model_runner.model_runner.model.lm_head.\ - weight.data, - dim=0, - ) - - self.proposer_worker.maybe_load_lm_head_weight( - target_lm_head_weight) - - self._metrics.init_tensors(self.rank, device_type=self.device) - if model_parallel_is_initialized(): - self.spec_decode_sampler.init_tensors(get_tp_group().local_rank, - device_type=self.device) - else: - self.spec_decode_sampler.init_tensors(self.rank, - device_type=self.device) - - scorer_cls: Type[SpeculativeScorer] - if self.disable_mqa_scorer: - scorer_cls = BatchExpansionTop1Scorer - logger.info("[Speculative Decoding] Use batch " - "expansion for scoring proposals.") - else: - scorer_cls = MQAScorer - logger.info( - "[Speculative Decoding] Use MQA scorer for scoring proposals.") - - self.scorer = scorer_cls(scorer_worker=self.scorer_worker, - device=self.device, - vocab_size=self._vocab_size) - - self._configure_model_sampler_for_spec_decode() - - def load_model(self, *args, **kwargs): - pass - - def _configure_model_sampler_for_spec_decode(self): - """Configure model sampler to emit GPU tensors. This allows spec decode - to keep data on device without transferring to CPU and serializing, - which significantly reduces overhead of sampling during verification. - - NOTE(cade): This breaks abstraction boundaries pretty badly. The better - design is to have the "move to CPU and serialize" sampling decision be - done outside of the model/sampler; this way the "last-mile" worker - object which interfaces with the scheduler can serialize and incur the - performance hit as necessary. This allows us to run the worker several - iterations in a row without incurring the "move to CPU and serialize" - performance penalty. - - Since this requires a large change to vLLM, we defer it to later and - temporarily accept this broken abstraction boundary. - - NOTE(cade): This will require a special check if the proposer worker - does not have a sampler (e.g. ngram speculation). - """ - (self.scorer_worker.model_runner.sampler.include_gpu_probs_tensor - ) = True - (self.scorer_worker.model_runner.sampler. - should_modify_greedy_probs_inplace) = True - self.proposer_worker.set_include_gpu_probs_tensor() - self.proposer_worker.set_should_modify_greedy_probs_inplace() - - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of cache blocks to use. - - This is done by profiling the scorer model (which is typically the - larger of the two). Then the total memory which would be used by the - scorer cache is divided evenly between the proposer and scorer model KV, - such that the number of blocks is equal in both KV caches. - """ - num_gpu_blocks, num_cpu_blocks = ( - self.scorer_worker.determine_num_available_blocks()) - - scorer_cache_block_size_bytes = ( - self.scorer_worker.get_cache_block_size_bytes()) - proposer_cache_block_size_bytes = ( - self.proposer_worker.get_cache_block_size_bytes()) - - new_num_gpu_blocks = split_num_cache_blocks_evenly( - scorer_cache_block_size_bytes, proposer_cache_block_size_bytes, - num_gpu_blocks) - return new_num_gpu_blocks, num_cpu_blocks - - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the cache engine of the scorer and proposer workers. 
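As a side note on the split above: split_num_cache_blocks_evenly (defined near the bottom of this file) gives both KV caches the same block count, scaled so the combined memory stays within what the scorer alone could hold. A minimal sketch of the arithmetic, using hypothetical block sizes and a hypothetical profiling result:

scorer_block_bytes = 2 * 1024 * 1024    # hypothetical: 2 MiB per scorer block
proposer_block_bytes = 1 * 1024 * 1024  # hypothetical: 1 MiB per draft block
profiled_scorer_blocks = 3000           # hypothetical result of scorer profiling

# Same formula as split_num_cache_blocks_evenly: pick n such that
# n * (scorer_bytes + proposer_bytes) <= profiled_blocks * scorer_bytes.
shared_blocks = int(
    profiled_scorer_blocks * scorer_block_bytes
    / (proposer_block_bytes + scorer_block_bytes))
assert shared_blocks == 2000  # 2000 * 3 MiB == 3000 * 2 MiB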
- """ - self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, - num_cpu_blocks=num_cpu_blocks) - - def get_model(self) -> nn.Module: - return self.scorer_worker.get_model() - - @torch.inference_mode() - def execute_model( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> List[SamplerOutput]: - """Perform speculative decoding on the input batch. - """ - if self.rank != self._driver_rank: - self._run_non_driver_rank() - return [] - - if execute_model_req is None: - # This signals that there's no more requests to process for now. - # All workers are running infinite loop with broadcast_tensor_dict, - # and it stops the loop when the driver broadcasts an empty input. - # Send an empty input to notify all other workers to stop their - # execution loop. - broadcast_tensor_dict({}, src=0) - return [] - - self._track_finished_requests(execute_model_req) - disable_all_speculation = self._should_disable_all_speculation( - execute_model_req) - num_lookahead_slots = execute_model_req.num_lookahead_slots - all_prompt = True - atleast_one_prompt = False - all_zero_spec_tokens = True - for sgm in execute_model_req.seq_group_metadata_list: - all_prompt = all_prompt and sgm.is_prompt - atleast_one_prompt = atleast_one_prompt or sgm.is_prompt - all_zero_spec_tokens = all_zero_spec_tokens and ( - sgm.num_speculative_tokens == 0) - - if all_prompt and execute_model_req.seq_group_metadata_list: - assert num_lookahead_slots == 0, ( - "Prompt only runs should have num_lookahead_slots equal to 0. " - "This should never happen, please file a bug at " - "https://github.com/vllm-project/vllm/issues") - # Speculative decoding is disabled in the following cases: - # 1. Prefill phase: Speculative decoding is not - # used during the prefill phase. - # 2. Auto-disable enabled: The running queue size exceeds - # the specified threshold. - # 3. No request: There are no requests in the batch, or - # none of the requests in the batch have spec decoding enabled. - # In any of these cases, the proposer and scorer workers - # are called normally. - # We expect `num_speculative_tokens` to be None for prefills. - no_spec = (num_lookahead_slots == 0 or disable_all_speculation - or all_zero_spec_tokens) - - # Broadcast how many lookahead slots are scheduled for this step, and - # whether all speculation is disabled, to all non-driver workers. - - # This is required as if the number of draft model runs changes - # dynamically, the non-driver workers won't know unless we perform a - # communication to inform them. - - # no_spec is used to signal non-driver worker about prefill vs decode - # stage. This is needed to ensure that order of execution of proposer - # and scorer is same in both driver and non-driver workers (i.e., - # scorer -> proposer for prefill and proposer -> scorer in decode). This - # order is needed to support models like EAGLE that take scorer states - # as inputs. - broadcast_dict = dict( - num_lookahead_slots=num_lookahead_slots, - no_spec=no_spec, - disable_all_speculation=disable_all_speculation, - # When both chunked prefill and speculative decoding are enabled - # it is possible that the same batch contains both prefill - # and decodes. If that happens in the scorer we run the batch - # as one single forward pass. However, in the proposer we - # run them as 2 different batches - one for prefill and - # the other for decodes. 
The variable indicates to the non-driver - # worker that there are prefills as part of the speculative batch - # and hence it needs to run an extra prefill forward pass. - run_spec_proposer_for_prefill=atleast_one_prompt, - ) - broadcast_tensor_dict(broadcast_dict, src=self._driver_rank) - - assert execute_model_req.seq_group_metadata_list is not None, ( - "speculative decoding requires non-None seq_group_metadata_list") - - self._maybe_disable_speculative_tokens( - disable_all_speculation, execute_model_req.seq_group_metadata_list) - - if no_spec: - return self._run_no_spec(execute_model_req, - skip_proposer=disable_all_speculation) - return self._run_speculative_decoding_step(execute_model_req, - num_lookahead_slots) - - @torch.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop to perform speculative decoding - in parallel worker.""" - while self._run_non_driver_rank(): - pass - - def _should_disable_all_speculation( - self, execute_model_req: ExecuteModelRequest) -> bool: - # When the batch size is too large, disable speculative decoding - # to stop trading off throughput for latency. - return (execute_model_req.running_queue_size - >= self.disable_by_batch_size) - - def _maybe_disable_speculative_tokens( - self, disable_all_speculation: bool, - seq_group_metadata_list: List[SequenceGroupMetadata]) -> None: - if not disable_all_speculation: - return - - for seq_group_metadata in seq_group_metadata_list: - # Once num_speculative_tokens is set to 0, the spec decode - # of this request will be disabled forever. - # TODO(comaniac): We currently store spec decoding specific - # state in the global data structure, but we should maintain - # this state within spec decode worker. - seq_group_metadata.num_speculative_tokens = 0 - - def _serialize_sampler_output_no_logprobs( - self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> List[SamplerOutput]: - """ - Creates and returns a `SamplerOutput` with only the token IDs being - serialized to CPU and populated in `CompletionSequenceGroupOutput`. - All other parameters in `CompletionSequenceGroupOutput` related to log - probabilities are skipped. - - Args: - execute_model_req (ExecuteModelRequest): The model request that - was executed. - sampler_output (SamplerOutput): The output from the sampler with - only GPU tensors populated. - - Returns: - SamplerOutput: A new `SamplerOutput` instance containing a list of - `CompletionSequenceGroupOutput` objects with only token IDs - populated. - """ - seq_output_prompt_logprobs = [ - seq.is_prompt and seq.sampling_params.prompt_logprobs is not None - and seq.sampling_params.prompt_logprobs > 0 - for seq in execute_model_req.seq_group_metadata_list - ] - # ignore slots for prompt tokens that are filled with INVALID_TOKEN_ID - sampled_token_ids_list = (sampler_output.sampled_token_ids[torch.where( - # subtracting is faster than testing for equality - sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID)[0]] \ - if any(seq_output_prompt_logprobs) else \ - sampler_output.sampled_token_ids).tolist() - - seq_data_entries = [ - (seq_id, seq_data) for sg in \ - execute_model_req.seq_group_metadata_list \ - for seq_id, seq_data in sg.seq_data.items() - ] - completion_seq_group_output_list: List[ - CompletionSequenceGroupOutput] = [] - output_index = 0 - # Make sure the non-terminal prefill chunks are still aligned with - # their own empty output. 
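The subtraction-based filter above keeps only the rows whose slot holds a real token: subtracting VLLM_INVALID_TOKEN_ID gives zero exactly at the invalid slots, and torch.where then returns the surviving row indices. A small sketch, using -1 as a stand-in for the invalid id (an assumption made here only for illustration):

import torch

INVALID = -1  # stand-in for VLLM_INVALID_TOKEN_ID in this sketch
sampled = torch.tensor([[7], [INVALID], [9]])

# A nonzero difference marks a slot that holds a real token; torch.where on a
# single tensor returns the indices of those nonzero entries.
kept = sampled[torch.where(sampled - INVALID)[0]]
assert kept.squeeze(-1).tolist() == [7, 9]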
- for idx, seq_group_meta in enumerate( - execute_model_req.seq_group_metadata_list): - needs_prompt_logprobs = seq_output_prompt_logprobs[idx] - seq_id, seq_data = seq_data_entries[idx] - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - - # Some of these sequences may belong to non-terminal chunks, - # which may still have to report logprobs for prompts. - start = 1 if seq_data._num_computed_tokens == 0 \ - else seq_data._num_computed_tokens - end = (seq_data._num_computed_tokens + \ - seq_group_meta.token_chunk_size) - prompt_token_ids = prompt_token_ids[start:end] - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) for p_token_id in prompt_token_ids - ] - else: - prompt_logprobs = None - - # Since we can get chunks here, we dont always have a sampled token - # (only on last chunk) but we still have to provide an output. - if not seq_group_meta.do_sample: - completion_seq_group_output_list.append( - CompletionSequenceGroupOutput( - samples=[], prompt_logprobs=prompt_logprobs)) - continue - - # Sequence with output. - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[output_index][0], - token_id_logprob_rank=-1, - token_id_logprob=0.0, - seq_id=seq_id, - topk_token_ids=[], - topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - output_index += 1 - - return [SamplerOutput(outputs=completion_seq_group_output_list)] - - @nvtx_range("spec_decode_worker._run_no_spec") - def _run_no_spec(self, execute_model_req: ExecuteModelRequest, - skip_proposer: bool) -> List[SamplerOutput]: - """Run a single generation step without any speculation. The input is - sent to the proposer and scorer model so that the KV cache is consistent - between the two. When skip_proposer is True, the proposer model is - not called, meaning that the kv-cache in proposer for requests is not - updated, so they cannot enable spec decode in the rest decoding. - """ - - sampler_output = self.scorer_worker.execute_model(execute_model_req) - assert len(sampler_output) == 1 - sampler_output = sampler_output[0] - - # Store hidden states from target model execution, BxD. - hidden_states = sampler_output.hidden_states - if hidden_states is not None: - # Only decodes and prefill terminal chunks need a hidden state. - seq_group_meta_with_hidden = [ - sg for sg in execute_model_req.seq_group_metadata_list - if sg.do_sample - ] - if any(seq.is_prompt for seq in seq_group_meta_with_hidden): - # Drop hidden_states with no prediction (eg non-terminal chunks) - hidden_states = hidden_states[ - torch.where(sampler_output.sampled_token_ids - - VLLM_INVALID_TOKEN_ID)[0]] - if self.previous_hidden_states is None and len( - seq_group_meta_with_hidden): - self.previous_hidden_states = HiddenStates( - hidden_states, seq_group_meta_with_hidden) - elif self.previous_hidden_states and len( - seq_group_meta_with_hidden): - self.previous_hidden_states.update(hidden_states, - seq_group_meta_with_hidden) - self.previous_hidden_states.prune(seq_group_meta_with_hidden) - - if not skip_proposer: - # We prepare the prefill hidden states here so that there no - # additional complexity in worker for spec_decode vs non_spec_decode - # flow and execute_model doesn't need additional modifications. 
- execute_model_req.previous_hidden_states = \ - prepare_prefill_hidden_states( - sampler_output.prefill_hidden_states) - for i in range(self._num_spec_prefill_steps): - execute_model_req.spec_step_idx = i - self.proposer_worker.execute_model(execute_model_req) - - sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( - execute_model_req=execute_model_req, sampler_output=sampler_output) - if self._disable_logprobs else - [sampler_output]) - - # Clear device tensors from sampler output. This reduces communication - # overhead when the engine runs in a different process than the workers. - sampler_output.sampled_token_probs = None - sampler_output.sampled_token_ids = None - sampler_output.logprobs = None - return sampler_output_to_return - - def _run_non_driver_rank(self) -> bool: - """Run proposer and verifier model in non-driver workers. This is used - for both speculation cases (num_lookahead_slots>0) and non-speculation - cases (e.g. prefill). - - Returns True if there are remaining sequences to process. - """ - assert self.rank != self._driver_rank - - data = broadcast_tensor_dict(src=self._driver_rank) - if not data: - return False - num_lookahead_slots = data["num_lookahead_slots"] - - # In case of prefill, scorer_worker has to be run before proposer so - # that the hidden states can be propagated to proposer when needed. - if data["no_spec"]: - self.scorer_worker.execute_model() - - if not data["disable_all_speculation"]: - # Even if num_lookahead_slots is zero, we want to run the - # proposer model as it may have KV. - # - # We run the proposer once per lookahead slot. In the future we - # should delegate how many times it runs to the proposer. - for _ in range(max(num_lookahead_slots, 1)): - self.proposer_worker.execute_model() - - if not data["no_spec"]: - self.scorer_worker.execute_model() - if data["run_spec_proposer_for_prefill"]: - self.proposer_worker.execute_model() - - return True - - @nvtx_range("spec_decode_worker._run_speculative_decoding_step") - def _run_speculative_decoding_step( - self, execute_model_req: ExecuteModelRequest, - num_lookahead_slots: int) -> List[SamplerOutput]: - """Execute a single step of speculative decoding. - - This invokes the proposer worker to get k speculative tokens for each - sequence, then scores each speculative token using the scoring worker. - - When `enable_chunked_prefill` is set, scorer will batch decodes and - prefills, while proposer will sync its KV-cache by running an extra - forward on prefills. - - Returns a list of SamplerOutput, each containing a single token per - sequence. - """ - # With prefill chunking, expect requests to have prompts first - # so that backend gets prefill|decode. - assert num_lookahead_slots == execute_model_req.num_lookahead_slots - - # Pass last hidden states from target model to proposer - execute_model_req.previous_hidden_states = self.previous_hidden_states - self.previous_hidden_states = None - - with Timer() as proposal_timer: - # Generate proposals using draft worker. 
- proposals = self.proposer_worker.get_spec_proposals( - execute_model_req, self._seq_with_bonus_token_in_last_step) - - if not self._allow_zero_draft_token_step and proposals.no_proposals: - #TODO: Fix it #5814 - raise RuntimeError("Cannot handle cases where distributed draft " - "workers generate no tokens") - - execute_model_req.previous_hidden_states = None - - with Timer() as scoring_timer: - proposal_scores = self.scorer.score_proposals( - execute_model_req, - proposals, - ) - - _, (non_spec_seqs, non_spec_indices) = split_batch_by_proposal_len( - execute_model_req.seq_group_metadata_list, proposals.proposal_lens) - # With prefill chunking enabled, `non_spec_seqs` contains prefills too: - # discard decodes that have already been processed by proposer. - non_spec_indices = [ - idx for idx in non_spec_indices - if execute_model_req.seq_group_metadata_list[idx].is_prompt - ] - if len(non_spec_indices): - all_hidden_states = proposal_scores.hidden_states - if all_hidden_states is not None: - prefill_hidden_states = all_hidden_states[non_spec_indices] - execute_model_req.previous_hidden_states = \ - prepare_prefill_hidden_states(prefill_hidden_states) - # Sync proposer KV cache for prefills. - prefill_req = execute_model_req.clone(non_spec_seqs) - # TODO avoid sampling here? - self.proposer_worker.execute_model(prefill_req) - - with Timer() as verification_timer: - accepted_token_ids, target_logprobs = self._verify_tokens( - execute_model_req.seq_group_metadata_list, proposal_scores, - proposals, execute_model_req.num_lookahead_slots) - - stage_times = (proposal_timer.elapsed_time_ms / num_lookahead_slots, - scoring_timer.elapsed_time_ms, - verification_timer.elapsed_time_ms) - - return self._create_output_sampler_list( - execute_model_req.seq_group_metadata_list, - accepted_token_ids, - target_logprobs=target_logprobs, - prompt_logprobs=proposal_scores.prompt_logprobs - if not self._disable_logprobs else None, - k=execute_model_req.num_lookahead_slots, - stage_times=stage_times) - - @nvtx_range("spec_decode_worker._verify_tokens") - def _verify_tokens( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_scores: SpeculativeScores, - proposals: SpeculativeProposals, - max_proposal_len: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Determine which speculative tokens are accepted using the - probabilities of each token according to the proposer and scorer models. - - Returns a tuple of Tensors, one for the accepted token ids and one for - the logprobs according to the scoring model. - """ - proposal_lens_list = proposals.proposal_lens.tolist() - - # vLLM currently only supports proposal lens equal to zero or the batch - # proposal len. This adds some complexity (splitting the batch into spec - # and non spec sequences) and should be removed in the future. It can be - # done by supporting per-sequence proposal lens. - (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len( - seq_group_metadata_list, proposal_lens_list) - original_indices = spec_indices + non_spec_indices - - # Get probabilities of target model, including bonus tokens. - proposal_verifier_probs = proposal_scores.probs[spec_indices] - - # Get non-speculative sampled tokens from target model. - non_spec_token_ids = proposal_scores.token_ids[non_spec_indices] - - # Get bonus tokens from target model. - bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:] - - # Get probabilities according to proposal method. 
- proposal_probs = proposals.proposal_probs[spec_indices] - - # Get proposed tokens. - proposal_token_ids = proposals.proposal_token_ids[spec_indices] - - # Sampler arguments - sampler_extra_kwargs: Dict[str, Any] = {} - if self.generators and isinstance(self.spec_decode_sampler, - SpecDecodeStochasticBaseSampler): - sampler_extra_kwargs["seeded_seqs"] = { - idx: self.generators[sgm.request_id] - for idx, sgm in enumerate(seq_group_metadata_list) - if sgm.sampling_params.seed is not None - } - - accepted_token_ids = self.spec_decode_sampler( - target_with_bonus_probs=proposal_verifier_probs, - bonus_token_ids=bonus_token_ids, - draft_probs=proposal_probs, - draft_token_ids=proposal_token_ids, - **sampler_extra_kwargs, - ) - # Append output tokens from non-speculative sequences to - # the accepted token ids tensor. - non_spec_token_ids = non_spec_token_ids.expand(-1, max_proposal_len + - 1).clone() - non_spec_token_ids[:, 1:] = -1 - accepted_token_ids = torch.cat( - [accepted_token_ids, non_spec_token_ids]) - logprobs = proposal_scores.logprobs - # Rearrange so that results are in the order of the original seq group - # metadata. - accepted_token_ids[original_indices] = accepted_token_ids.clone() - - # B x K+1 x D - hidden_states = proposal_scores.hidden_states - if hidden_states is not None: - # Only get terminal hidden states for next step - terminal_metadata = [ - sg for sg in seq_group_metadata_list if sg.do_sample - ] - - # Contract hidden states based on accepted tokens - hs_size = hidden_states.shape[-1] - accepted_index = accepted_token_ids + 1 # Convert -1 to 0 - accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b - # Drop non-terminal prefill chunks hidden states. - hidden_states = hidden_states[accepted_index != - VLLM_INVALID_TOKEN_ID] - accepted_index = accepted_index[accepted_index != - VLLM_INVALID_TOKEN_ID] - assert len(accepted_index) == hidden_states.shape[0] == len( - terminal_metadata) - index = accepted_index[:, None, None].expand(-1, 1, - hs_size) # b x 1 x d - second_last_token_hidden_states = hidden_states[:, -2] # b x d - hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d - # Store hidden states from target model for subsequent decode step - self.previous_hidden_states = HiddenStates( - hidden_states, terminal_metadata, - second_last_token_hidden_states) - return accepted_token_ids, logprobs - - def _create_output_sampler_list( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - accepted_token_ids: torch.Tensor, # shape: [batch_size, k+1] - target_logprobs: torch.Tensor, # shape: [batch_size, k+1, vocab_size] - prompt_logprobs: Optional[ - torch.Tensor], # shape: [nprompt_tokens, vocab_size] - k: int, - stage_times: Tuple[float, float, float], - ) -> List[SamplerOutput]: - """Given the accepted token ids, create a list of SamplerOutput. - - The output is padded with -1 tokens such that each sequence has - the same number of outputs. - """ - batch_size, num_steps = accepted_token_ids.shape - accepted_token_ids_by_step = accepted_token_ids.transpose(0, 1) - if self._disable_logprobs: - # We are skipping the logprobs. Hence don't serialize the - # logprobs related tensors from the GPU. Instead create - # empty/dummy lists. - (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, - topk_logprobs_by_step, topk_indices_by_step) =\ - self._create_dummy_logprob_lists( - batch_size, num_steps, - self.scorer_worker.model_config.max_logprobs) - else: - # Organize input tensors by step instead of by sequence. 
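The index assignment in _verify_tokens above undoes the spec/non-spec regrouping: rows were stacked as [spec sequences..., non-spec sequences...], and original_indices records where each stacked row belongs in the scheduler's ordering. A minimal sketch with made-up values:

import torch

original_indices = torch.tensor([1, 3, 0, 2])      # spec seqs 1, 3 then non-spec 0, 2
grouped = torch.tensor([[11], [33], [100], [22]])  # results in grouped order
restored = torch.empty_like(grouped)
restored[original_indices] = grouped               # scatter back to batch order
assert restored.squeeze(-1).tolist() == [100, 11, 22, 33]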
- target_logprobs_by_step = target_logprobs.transpose(0, 1) - # Serialize all tensors into Python lists. - (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, - topk_logprobs_by_step, topk_indices_by_step) =\ - self._create_logprob_lists_from_tensors( - target_logprobs_by_step, accepted_token_ids_by_step, - self.scorer_worker.model_config.max_logprobs) - - # Get the sequence ids and num_logprobs (sampling parameter) in the - # batch. - seq_ids, request_ids_seq_ids_mapping = get_all_seq_ids_and_request_ids( - seq_group_metadata_list) - - num_logprobs_per_seq = get_all_num_logprobs(seq_group_metadata_list) - - # Serialize tensor to CPU Python list. - accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() - - # Construct the output on a per-step, per-sequence basis. - # Non-terminal prefill chunks will end up here as rows with just -1s - # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] while - # terminal chunks will only have one generated token at time 0. - sampler_output_list: List[SamplerOutput] = [] - - # Prefills are not multi-step (return at most 1 token), in order to - # avoid padding or repetition to fit decodes, we separate them. - for i, sg in enumerate(seq_group_metadata_list): - if not sg.is_prompt: - # Requests are ordered as prefills|decodes=>no more prefills. - break - num_logprobs = num_logprobs_per_seq[i] - seq_kwargs = dict(token_id=-1, - token_id_logprob_rank=0, - token_id_logprob=-float('inf'), - topk_token_ids=[-1] * num_logprobs, - topk_logprobs=[-float('inf')] * num_logprobs, - seq_id=seq_ids[i]) - # Terminal chunk, has token. - if sg.do_sample: - seq_kwargs.update( - dict( - token_id=accepted_token_ids[i][0].item(), - token_id_logprob_rank=accepted_token_id_ranks_by_step[ - 0][i], - token_id_logprob=accepted_token_id_logprobs_by_step[0] - [i], - topk_token_ids=topk_indices_by_step[0][i] - [:num_logprobs], - # output only so step is 0 - topk_logprobs=topk_logprobs_by_step[0][i] - [:num_logprobs], - )) - needs_plogs = (sg.sampling_params.prompt_logprobs - and sg.sampling_params.prompt_logprobs > 0) - plogs = None - if prompt_logprobs is not None: - # Even non-terminal prompt chunks can have logprobs here. - plogs = prompt_logprobs[i] - elif needs_plogs: - # Prompt logprobs are requested but `_disable_logprobs` is set. - seq_data = next(iter(sg.seq_data.values())) - # Get only the tokens in this chunk! - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_token_ids = prompt_token_ids[ - seq_data. - _num_computed_tokens:seq_data._num_computed_tokens + - sg.token_chunk_size] - - is_first_chunk = seq_data._num_computed_tokens == 0 - # There's no prob generated for the first token in a sequence. - if is_first_chunk: - prompt_token_ids = prompt_token_ids[1:] - plogs = [ - create_logprobs_output( - token_id=p_token_id, - token_id_logprob_rank=-1, - token_id_logprob=0.0, - topk_token_ids=[], - topk_logprobs=[], - ) for p_token_id in prompt_token_ids - ] - seq_kwargs.update(dict(prompt_logprobs=plogs)) - - sampler_output_list.append( - SamplerOutput( - outputs=[create_sequence_group_output( - **seq_kwargs)])) # type: ignore - - # Decodes, create one SamplerOutput per-step (at most K+1). 
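For the decode path that follows, accepted_token_ids is a [batch_size, k+1] tensor padded with -1 where a sequence stopped accepting; transposing it gives one row per step, and emission stops at the first step where no sequence produced a token. A small sketch with hypothetical token ids:

import torch

accepted = torch.tensor([
    [11, 12, 13],   # all k+1 tokens accepted
    [21, -1, -1],   # only the first token accepted
    [31, 32, -1],
])
emitted = []
for step in accepted.transpose(0, 1).tolist():
    if all(tok == -1 for tok in step):
        break                       # no sequence produced a token at this step
    emitted.append(step)            # one SamplerOutput per surviving step
assert emitted == [[11, 21, 31], [12, -1, 32], [13, -1, -1]]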
- for step_index in range(num_steps): - if all(token_id == -1 for sg, token_id in zip( - seq_group_metadata_list, - accepted_token_ids_by_step[step_index]) - if not sg.is_prompt): - break - - step_output_token_ids: List[CompletionSequenceGroupOutput] = [] - for sequence_index in range(batch_size): - seq_meta = seq_group_metadata_list[sequence_index] - # Prompts already processed above. - if seq_meta.is_prompt: - continue - - # Each sequence may have a different num_logprobs; retrieve it. - num_logprobs = num_logprobs_per_seq[sequence_index] - step_output_token_ids.append( - create_sequence_group_output( - token_id=accepted_token_ids_by_step[step_index] - [sequence_index], - token_id_logprob_rank=accepted_token_id_ranks_by_step[ - step_index][sequence_index], - token_id_logprob=accepted_token_id_logprobs_by_step[ - step_index][sequence_index], - seq_id=seq_ids[sequence_index], - topk_token_ids=topk_indices_by_step[step_index] - [sequence_index][:num_logprobs], - topk_logprobs=topk_logprobs_by_step[step_index] - [sequence_index][:num_logprobs], - step_index=step_index)) - sampler_output_list.append( - SamplerOutput(outputs=step_output_token_ids)) - - # Populate the data structures needed to keep track of sequences with - # bonus tokens. - self._track_sequences_with_bonus_tokens(seq_ids, - request_ids_seq_ids_mapping, - accepted_token_ids_by_step) - maybe_rejsample_metrics = ( - self._metrics.maybe_collect_rejsample_metrics(k)) - if maybe_rejsample_metrics is not None: - sampler_output_list[ - 0].spec_decode_worker_metrics = maybe_rejsample_metrics - - # Log time spent in each stage periodically. - # This is periodic because the rejection sampler emits metrics - # periodically. - self._maybe_log_stage_times(*stage_times) - # First `n_prefills` entries will contain prefills SamplerOutput when - # chunked prefill is enabled, the rest is decodes in multi-step format. - return sampler_output_list - - def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, - scoring_time_ms: float, - verification_time_ms: float) -> None: - """Log the speculative stage times. If stat logging is disabled, do - nothing. - """ - if self._disable_log_stats: - return - - logger.info( - "SpecDecodeWorker stage times: " - "average_time_per_proposal_tok_ms=%.02f " - "scoring_time_ms=%.02f verification_time_ms=%.02f", - average_time_per_proposal_tok_ms, scoring_time_ms, - verification_time_ms) - - def _create_dummy_logprob_lists( - self, - batch_size: int, - num_steps: int, - num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: - """ - Creates and returns four dummy lists representing token probabilities - and their ranks. - - This method initializes and returns: - - The ranks of the accepted tokens, shaped (num_steps, batch_size) - - The log probabilities of the accepted tokens, - shaped (num_steps, batch_size) - - The log probabilities of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - The token IDs of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - Args: - batch_size (int): The size of the batch. - num_steps (int): The number of steps in the sequence. - num_top_k (int): The number of top-k token log probabilities to - return. - - Returns: - A tuple containing four dummy lists as described above. 
- """ - accepted_token_id_ranks_by_step = [[-1] * batch_size - for _ in range(num_steps)] - accepted_token_id_logprobs_by_step = [[0.0] * batch_size - for _ in range(num_steps)] - topk_logprobs_by_step: List[List[List[Optional[float]]]] = [[ - [None] * num_top_k for _ in range(batch_size) - ] for _ in range(num_steps)] - topk_indices_by_step: List[List[List[Optional[int]]]] = [[ - [None] * num_top_k for _ in range(batch_size) - ] for _ in range(num_steps)] - return (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, topk_logprobs_by_step, - topk_indices_by_step) - - def _create_logprob_lists_from_tensors( - self, - target_logprobs_by_step: torch.Tensor, - accepted_token_ids_by_step: torch.Tensor, - num_top_k: int, - ) -> Tuple[List[List[int]], List[List[float]], - List[List[List[Optional[float]]]], - List[List[List[Optional[int]]]]]: - """ - Creates and returns four lists representing token probabilities and - their ranks. - - This method initializes and returns four lists containing: - - The ranks of the accepted tokens, shaped (num_steps, batch_size) - - The log probabilities of the accepted tokens, - shaped (num_steps, batch_size) - - The log probabilities of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - The token IDs of the top k tokens, - shaped (num_steps, batch_size, num_top_k) - - Args: - target_logprobs_by_step (torch.Tensor): Tensor representing the - log probabilities of the target model, - shaped (num_steps, batch_size, vocab_size) - accepted_token_ids_by_step (torch.Tensor): Tensor representing - the accepted token_ids, shaped (num_steps, batch_size) - num_top_k (int): The number of top-k token log probabilities to - return. - - Returns: - A tuple containing the lists as described above. - """ - # Serialize all tensors to CPU Python lists. - # Get the logprobs/rank of the accepted tokens. - (accepted_token_id_ranks_by_step_tensor, - accepted_token_id_logprobs_by_step_tensor - ) = get_sampled_token_logprobs( - logprob_tensor=target_logprobs_by_step, - sampled_token_ids=accepted_token_ids_by_step, - ) - # Get the top-k logprobs (which may or may not include the - # logprob of the accepted token). - (topk_logprobs_by_step_tensor, - topk_indices_by_step_tensor) = target_logprobs_by_step.topk( - k=num_top_k, - dim=-1, - ) - accepted_token_id_ranks_by_step = ( - accepted_token_id_ranks_by_step_tensor.tolist()) - accepted_token_id_logprobs_by_step = ( - accepted_token_id_logprobs_by_step_tensor.tolist()) - topk_logprobs_by_step = topk_logprobs_by_step_tensor.tolist() - topk_indices_by_step = topk_indices_by_step_tensor.tolist() - return (accepted_token_id_ranks_by_step, - accepted_token_id_logprobs_by_step, topk_logprobs_by_step, - topk_indices_by_step) - - def _track_finished_requests(self, execute_model_req: ExecuteModelRequest): - """ - Removes the finished requests and their associated sequence ids from - internal book keeping data structures. - """ - for finished_request in execute_model_req.finished_requests_ids: - for seq_id in self._request_id_seq_id_mapping[finished_request]: - self._seq_with_bonus_token_in_last_step.discard(seq_id) - del self._request_id_seq_id_mapping[finished_request] - - def _track_sequences_with_bonus_tokens( - self, seq_ids: List[int], - request_ids_seq_ids_mapping: Dict[str, Set[int]], - accepted_token_ids_by_step: List[List[int]]): - """ - Updates the internal data structures which keep track of sequences - which have been assigned bonus tokens in their last forward pass. 
- """ - for seq_index, seq_id in enumerate(seq_ids): - last_token_id = accepted_token_ids_by_step[-1][seq_index] - if last_token_id == -1: - self._seq_with_bonus_token_in_last_step.discard(seq_id) - else: - self._seq_with_bonus_token_in_last_step.add(seq_id) - for request_id, sequences in request_ids_seq_ids_mapping.items(): - self._request_id_seq_id_mapping[request_id].update(sequences) - - @cached_property - def _vocab_size(self) -> int: - """Get the vocab size of the model and make sure it's consistent between - draft and target workers. - """ - vocab_sizes = [ - worker.vocab_size - for worker in [self.proposer_worker, self.scorer_worker] - ] - assert all(vocab_sizes[0] == vocab_size for vocab_size in vocab_sizes) - return vocab_sizes[0] - - @property - def rank(self): - return self.scorer_worker.rank - - @property - def device(self): - return self.scorer_worker.device - - @property - def _driver_rank(self) -> int: - return 0 - - def get_cache_block_size_bytes(self): - """Return the size of a cache block in bytes. - - This function is only used to compose workers within a SpecDecodeWorker. - We leave composing a SpecDecodeWorker within a SpecDecodeWorker - undefined for now, although it could be implemented in the future. - See https://arxiv.org/abs/2308.04623. - """ - raise NotImplementedError - - def start_profile(self): - if isinstance(self.scorer_worker, WorkerBase): - self.scorer_worker.start_profile() - - def stop_profile(self): - if isinstance(self.scorer_worker, WorkerBase): - self.scorer_worker.stop_profile() - - -def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, - proposer_cache_block_size_bytes: int, - total_num_gpu_blocks: int) -> int: - """Given total_num_gpu_blocks, the number of GPU blocks that could be - allocate to the target model, this function calculates how many blocks - should be given to the draft and target model. - - Note that usually the block size, in bytes, of each model is different, - as it's a function of number of KV/layer, number of heads, and hidden - dimension size. - - Since the target and draft models allocate the same number of blocks, we - simply calculate the number of blocks where if allocated by both models, - the total memory usage from KV cache is no larger than the number of - blocks allocatable by the target model alone. - """ - new_num_gpu_blocks = int( - total_num_gpu_blocks * scorer_cache_block_size_bytes / - (proposer_cache_block_size_bytes + scorer_cache_block_size_bytes)) - - return new_num_gpu_blocks - - -def prepare_prefill_hidden_states( - prefill_hidden_states: torch.Tensor) -> HiddenStates: - # For prefill step in proposer, we run the model for N-1 tokens - # because Nth token will be processed in the first decode step. For - # N-1 tokens, the input should be 0:N-1 hidden states which should - # be concatanated with 1:N token (since output of scorer has to be - # the input for proposer). Therefore, we shift the hidden states to - # align n-1th hidden state with nth token. 
- return HiddenStates(prefill_hidden_states.roll( - shifts=1, dims=0)) if prefill_hidden_states is not None else None diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py deleted file mode 100644 index ca89eb60ac58..000000000000 --- a/vllm/spec_decode/target_model_runner.py +++ /dev/null @@ -1,45 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional - -from vllm.sequence import SequenceGroupMetadata -from vllm.worker.model_runner_base import (ModelRunnerBase, - ModelRunnerInputBase, - ModelRunnerWrapperBase) - - -class TargetModelRunner(ModelRunnerWrapperBase): - """Specialized model runner for speculative decoding target model. - In speculative decoding, the log probabilities selected finally may not - be the same ones as selected by the target model sampling. This means - that the time spent in the log probability calculation of the target model - is time wasted, since we calculate log probabilities after deciding which - tokens are accepted. For this reason disabling log probabilities in the - target model will make decode faster. The model runner sets the - SamplingMetadata parameters according to whether log probabilities are - requested or not. - """ - - def __init__(self, model_runner: ModelRunnerBase): - # An internal boolean member variable to indicate if token log - # probabilities are needed or not. - super().__init__(model_runner) - self.disable_logprobs = True - - def prepare_model_input( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - virtual_engine: int = 0, - finished_requests_ids: Optional[List[str]] = None, - ) -> ModelRunnerInputBase: - model_input: ModelRunnerInputBase =\ - self.model_runner.prepare_model_input( - seq_group_metadata_list, virtual_engine, finished_requests_ids) - # If token log probabilities is disabled then skip generating sampler - # CPU output. We directly serialize the GPU sampled_token_id tensors - # as needed. If log probabilities is enabled then synchronize all the - # sampling related tensors which includes the logprobs tensors. - model_input.sampling_metadata.skip_sampler_cpu_output = ( - self.disable_logprobs) - return model_input diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py deleted file mode 100644 index afd91b42b943..000000000000 --- a/vllm/spec_decode/top1_proposer.py +++ /dev/null @@ -1,275 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import List, Optional, Set, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata -from vllm.spec_decode.interfaces import (SpeculativeProposals, - SpeculativeProposer) -from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase -from vllm.spec_decode.util import sampler_output_to_torch - - -class Top1Proposer(SpeculativeProposer): - """Helper class which separates out sequences which would exceed the max - model length when speculated upon. - - This allows combinations of models such as JackFram/llama-68m draft with - meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of - 2048 while Llama2-13b has max_position_embeddings of 4096. - - We treat the sequences which exceed the proposal draft model length as - "non-spec sequences". 
Essentially they skip the draft model and go through - normal decoding in the target model. - - Currently, only proposal_lens of 0 and k are supported, where k is a global - batch proposal length. In the future vLLM should support per-sequence - proposal lengths. - """ - - def __init__( - self, - worker: ProposerWorkerBase, - device: str, - vocab_size: int, - max_proposal_len: Optional[int] = None, - ): - self._worker = worker - self._device = device - self.max_proposal_len = max_proposal_len - self._vocab_size = vocab_size - - def get_spec_proposals( - self, - execute_model_req: ExecuteModelRequest, - seq_ids_with_bonus_token_in_last_step: Set[int], - ) -> SpeculativeProposals: - """Get speculative proposals given the input batch. - - Sequences which would exceed the max model length are skipped during - speculation. - """ - proposal_len = execute_model_req.num_lookahead_slots - seq_group_metadata_list = execute_model_req.seq_group_metadata_list - - # Split speculative- and non-speculative- sequences. - ( - proposal_lens, - nonzero_proposal_len_seqs, - nonzero_proposal_len_indices, - ) = self._split_by_proposal_len(seq_group_metadata_list, proposal_len) - - if nonzero_proposal_len_seqs: - # Speculate tokens using the draft worker for the speculative - # sequences. - # If sampler_transposed is true, then maybe_sampler_output's - # token_ids is like [batch] format in proposal_len size list, - # while if it is false, the format would be [proposal_len] - # in batch size list - hidden_states = execute_model_req.previous_hidden_states - if hidden_states is not None: - hidden_states.prune(nonzero_proposal_len_seqs) - nonzero_execute_model_req = ExecuteModelRequest( - seq_group_metadata_list=nonzero_proposal_len_seqs, - num_lookahead_slots=proposal_len, - previous_hidden_states=hidden_states, - ) - maybe_sampler_output, transposed = self._worker.sampler_output( - execute_model_req=nonzero_execute_model_req, - sample_len=proposal_len, - seq_ids_with_bonus_token_in_last_step=\ - seq_ids_with_bonus_token_in_last_step, - ) - ( - proposal_lens, - maybe_sampler_output, - nonzero_proposal_len_indices, - ) = self._remove_no_proposal_seqs(proposal_lens, - maybe_sampler_output, - nonzero_proposal_len_indices, - transposed) - else: - # If no sequences can be speculated, set sampler output to None. - maybe_sampler_output = None - transposed = False - - # Combine speculative- and non-speculative sequences into the same - # representation. - proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs( - batch_size=len(seq_group_metadata_list), - proposal_len=proposal_len, - maybe_sampler_output=maybe_sampler_output, - proposal_lens=proposal_lens, - nonzero_proposal_len_indices=nonzero_proposal_len_indices, - sampler_transposed=transposed, - ) - - proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens, - proposal_probs=proposal_probs, - proposal_lens=proposal_lens, - no_proposals=maybe_sampler_output - is None) - return proposals - - def _split_by_proposal_len( - self, - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_len: int, - ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]: - """Split sequences by two groups: - 1. Sequences with non-zero proposal length. - 2. Sequences with zero proposal length (due to disabled speculation - or exceed the maximum model length). 
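Concretely, the max-length part of case 2 mirrors the check in the body below: a sequence keeps the global proposal length k only if its current length plus k stays under the draft model's limit. A minimal sketch with hypothetical lengths:

k, max_proposal_len = 4, 2048          # hypothetical draft limit
seq_lens = [100, 2047, 500]            # current lengths of three sequences
proposal_lens = [k if s + k < max_proposal_len else 0 for s in seq_lens]
assert proposal_lens == [4, 0, 4]      # the long sequence falls back to non-spec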
- """ - - proposal_lens: List[int] = [] - nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] - nonzero_proposal_len_indices: List[int] = [] - for i, seq_group_metadata in enumerate(seq_group_metadata_list): - # The speculative decoding for this request has either been disabled - # (e.g. due to high traffic) or this is a prompt request. - if (seq_group_metadata.is_prompt - or seq_group_metadata.num_speculative_tokens == 0): - proposal_lens.append(0) - continue - - seq_data = next(iter(seq_group_metadata.seq_data.values())) - seq_len = seq_data.get_len() - - # Currently only proposal lens of 0 or the global batch proposal len - # are supported. - # If max_proposal_len is defined, then we shall not exceed this - # quota for nonzero_proposal - new_k = 0 - if (self.max_proposal_len is None - or seq_len + proposal_len < self.max_proposal_len): - new_k = proposal_len - nonzero_proposal_len_seqs.append(seq_group_metadata) - nonzero_proposal_len_indices.append(i) - proposal_lens.append(new_k) - seq_group_metadata.num_speculative_tokens = new_k - - return ( - proposal_lens, - nonzero_proposal_len_seqs, - nonzero_proposal_len_indices, - ) - - @staticmethod - def _remove_no_proposal_seqs(proposal_lens, maybe_sampler_output, - nonzero_proposal_len_indices, transposed): - """Remove sequences from nonzero_proposal_len_indices and reset - their proposal_len to 0 the draft worker does not provide a proposal - (maybe_sampler_output=None). This can avoid scoring overheads. - """ - - # If maybe_sampler_output is None, then the draft worker did not - # provide a proposal for any sequence and thus no action needed. - # Also we do not support transposed maybe_sampler_output for now - # because it seems not straightforward for draft workers outputting - # transposed sampler outputs to handle the case of no proposal. - if maybe_sampler_output is None or transposed: - return (proposal_lens, maybe_sampler_output, - nonzero_proposal_len_indices) - - new_proposal_lens: List[int] = [] - new_nonzero_proposal_len_indices: List[int] = [] - new_maybe_sampler_output: List[SamplerOutput] = [] - nonzero_proposal_len_idx_ptr = 0 - seq_idx = 0 - while seq_idx < len( - proposal_lens) and nonzero_proposal_len_idx_ptr < len( - nonzero_proposal_len_indices): - if seq_idx < nonzero_proposal_len_indices[ - nonzero_proposal_len_idx_ptr]: - # Sequence is not in the original nonzero_proposal_len_indices, - # meaning that it has a proposal length of 0 before sending to - # the draft worker. - assert proposal_lens[seq_idx] == 0 - new_proposal_lens.append(0) - else: - # Sequence is in the original nonzero_proposal_len_indices - if maybe_sampler_output[nonzero_proposal_len_idx_ptr] is None: - # but does not have a proposal from the draft worker. - new_proposal_lens.append(0) - else: - # and has a proposal from the draft worker. Add it to the - # new nonzero proposal list and keep the sampler output. - new_proposal_lens.append(proposal_lens[seq_idx]) - new_nonzero_proposal_len_indices.append(seq_idx) - new_maybe_sampler_output.append( - maybe_sampler_output[nonzero_proposal_len_idx_ptr]) - nonzero_proposal_len_idx_ptr += 1 - seq_idx += 1 - - # The remaining sequences should have proposal length of 0. - new_proposal_lens.extend(proposal_lens[seq_idx:]) - - # We assume sampler_output will not be a list of all Nones. - # In this case this function should not be called. 
- assert new_maybe_sampler_output - return (new_proposal_lens, new_maybe_sampler_output, - new_nonzero_proposal_len_indices) - - def _merge_outputs( - self, - batch_size: int, - proposal_len: int, - maybe_sampler_output: Optional[List[SamplerOutput]], - proposal_lens: List[int], - nonzero_proposal_len_indices: List[int], - sampler_transposed: bool, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - """After speculations are produced, merge the speculation results with - the skipped sequences. - """ - if maybe_sampler_output is None: - # If no speculative tokens, the sampler output will be None. - # In this case we return empty proposals. - proposal_tokens = torch.tensor(-1, - dtype=torch.long, - device=self._device).expand( - batch_size, proposal_len) - proposal_probs = torch.tensor(0, - dtype=torch.float32, - device=self._device).expand( - batch_size, proposal_len, - self._vocab_size) - proposal_lens_tensor = torch.tensor(0, - dtype=torch.long, - device=self._device).expand( - len(proposal_lens)) - return proposal_tokens, proposal_probs, proposal_lens_tensor - - sampler_output = maybe_sampler_output - proposal_tokens, proposal_probs, *_ = sampler_output_to_torch( - sampler_output, sampler_transposed) - - # Now, reformat the output GPU tensors such that each sequence has - # a proposal. the proposal can be empty, e.g. [-1, -1, -1] - - entire_proposal_tokens = proposal_tokens.new_full( - size=(batch_size, *proposal_tokens.shape[1:]), - fill_value=-1, - ) - entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens - entire_proposal_probs = proposal_probs.new_zeros( - batch_size, - *proposal_probs.shape[1:], - ) - entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs - - proposal_tokens, proposal_probs = ( - entire_proposal_tokens, - entire_proposal_probs, - ) - - proposal_lens_tensor = torch.zeros(batch_size, - dtype=torch.long, - device=self._device) - proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len - - return proposal_tokens, proposal_probs, proposal_lens_tensor diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py deleted file mode 100644 index 22d2a4833acf..000000000000 --- a/vllm/spec_decode/util.py +++ /dev/null @@ -1,277 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import time -from contextlib import contextmanager -from typing import Dict, List, Optional, Sequence, Tuple - -import torch - -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform -from vllm.sequence import (CompletionSequenceGroupOutput, Logprob, - PromptLogprobs, SequenceGroupMetadata, - SequenceOutput) - -SeqId = int - - -def get_all_num_logprobs( - seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]: - """Given a list of SequenceGroupMetadata, create a list of all num_logprobs. - - If the sampling params do not call for any logprobs, return 0 for that - sequence. - """ - - all_num_logprobs: List[int] = [] - for seq_group_metadata in seq_group_metadata_list: - num_logprobs = seq_group_metadata.sampling_params.logprobs - if num_logprobs is None: - num_logprobs = 0 - all_num_logprobs.append(num_logprobs) - - return all_num_logprobs - - -def get_sampled_token_logprobs( - # shape [num_steps, batch_size, vocab_size] - logprob_tensor: torch.Tensor, - sampled_token_ids: torch.Tensor, # shape [num_steps, batch_size] -) -> Tuple[torch.Tensor, torch.Tensor]: - """Get the logprobs for the sampled tokens. 
Returns the ranks and logprobs. - """ - num_steps, batch_size, vocab_size = logprob_tensor.shape - - selected_logprobs = logprob_tensor[ - torch.arange(num_steps).unsqueeze(1), - torch.arange(batch_size), - sampled_token_ids, - ] - expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand( - -1, -1, vocab_size) - sampled_token_ids_ranks = (logprob_tensor - > expanded_selected_logprobs).sum(-1).add_(1) - - return sampled_token_ids_ranks, selected_logprobs - - -def create_logprobs_output( - token_id: int, - token_id_logprob_rank: int, - token_id_logprob: float, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], -) -> Dict[int, Logprob]: - """Create a Logprob Dict for a token given the sampling results. - - Args: - token_id (int): The sampled token for the sequence. - token_id_logprob_rank (int): The logprob rank of the sampled token. - token_id_logprob (float): The logprob value of the sampled token. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. - """ - # vLLM logprobs always include the sampled token. In addition, the user may - # request topk-logprobs (where top-k varies per user up to max_logprobs). - logprobs: Dict[int, Logprob] = { - token_id: Logprob( - logprob=token_id_logprob, - rank=token_id_logprob_rank, - ), - } - logprobs.update({ - topk_token_id: Logprob( - logprob=topk_logprob if topk_logprob is not None else 0.0, - rank=topk_index + 1, - ) - for topk_index, (topk_token_id, topk_logprob) \ - in enumerate(zip(topk_token_ids, topk_logprobs)) \ - if topk_token_id is not None - }) - - return logprobs - - -def create_sequence_group_output( - token_id: int, - token_id_logprob_rank: int, - token_id_logprob: float, - seq_id: SeqId, - topk_token_ids: List[Optional[int]], - topk_logprobs: List[Optional[float]], - prompt_logprobs: Optional[PromptLogprobs] = None, - step_index: Optional[int] = 0) -> CompletionSequenceGroupOutput: - """Create a SequenceGroupOutput given the sampling results. - - Args: - token_id (int): The sampled token for the sequence. - token_id_logprob_rank (int): The logprob rank of the sampled token. - token_id_logprob (float): The logprob value of the sampled token. - seq_id (int): The sequence id. - topk_token_ids (List[Optional[int]]): The list of top-k token ids. - topk_logprobs (List[Optional[float]]): The list of top-k logprobs. - step_index: (Optional[int]): The index of the speculative token. - """ - - logprobs = create_logprobs_output( - token_id, - token_id_logprob_rank, - token_id_logprob, - topk_token_ids, - topk_logprobs, - ) - - return CompletionSequenceGroupOutput(samples=[ - SequenceOutput(parent_seq_id=seq_id, - output_token=token_id, - logprobs=logprobs) - ], - prompt_logprobs=prompt_logprobs, - step_index=step_index) - - -def split_batch_by_proposal_len( - seq_group_metadata_list: List[SequenceGroupMetadata], - proposal_lens: List[int], -) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[ - List[SequenceGroupMetadata], List[int]]]: - """Utility function that splits a batch based on whether the proposal len is - zero or not. We should remove this once vLLM supports per-sequence proposal - lens in a batch. 
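One detail worth spelling out from get_sampled_token_logprobs above: the rank of a sampled token is computed without a sort, as one plus the number of vocabulary entries whose logprob is strictly larger. A minimal sketch with a single step, a single sequence, and a made-up four-token vocabulary, using gather for brevity in place of the advanced indexing above:

import torch

logprobs = torch.tensor([[[-2.0, -0.5, -1.0, -3.0]]])  # [steps=1, batch=1, vocab=4]
sampled = torch.tensor([[2]])                           # token 2 has logprob -1.0

selected = logprobs.gather(-1, sampled.unsqueeze(-1))   # shape [1, 1, 1]
rank = (logprobs > selected).sum(-1) + 1
assert rank.item() == 2                                 # only -0.5 beats -1.0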
- """ - - nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) - zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], []) - for i, (seq_group, proposal_len) in enumerate( - zip(seq_group_metadata_list, proposal_lens)): - seq_groups, indices = nonzero_lists if proposal_len else zero_lists - seq_groups.append(seq_group) - indices.append(i) - return nonzero_lists, zero_lists - - -def sampler_output_to_torch( - sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool -) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: - """Utility function which converts a list of SamplerOutput to tensors. - - sampler_transposed here is used as the indicator for whether - we need do additional tensor transpose logic here. - - Returns: - sampled_token_ids: torch.Tensor - shape: [batch_size, len(sampler_output_list)] - - sampled_token_probs: torch.Tensor - shape: [batch_size, len(sampler_output_list), vocab_size] - """ - - # shape: [batch_size, num_sampler_output, vocab_size] - sampled_token_probs = torch.stack( - [ - sampler_output.sampled_token_probs - for sampler_output in sampler_output_list - ], - dim=0, - ) - - # shape: [batch_size, num_sampler_output, vocab_size] - sampled_token_logprobs = torch.stack( - [sampler_output.logprobs for sampler_output in sampler_output_list], - dim=0, - ) - - # shape: [batch_size, num_sampler_output] - sampled_token_ids = torch.stack( - [ - sampler_output.sampled_token_ids.flatten() - for sampler_output in sampler_output_list - ], - dim=0, - ) - - if sampler_transposed: - sampled_token_probs = sampled_token_probs.transpose(0, 1) - sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1) - sampled_token_ids = sampled_token_ids.transpose(0, 1) - - if sampler_output_list[0].hidden_states is not None: - # shape: [batch_size, num_sampler_output, hidden_dim] - sampled_hidden_states = torch.stack( - [ - sampler_output.hidden_states - for sampler_output in sampler_output_list - ], - dim=0, - ) - - if sampler_transposed: - sampled_hidden_states = sampled_hidden_states.transpose(0, 1) - else: - sampled_hidden_states = None - - return (sampled_token_ids, sampled_token_probs, sampled_token_logprobs, - sampled_hidden_states) - - -def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int, - vocab_size: int, device: str) -> None: - """Helper method which mocks out the GPU tensors in SamplerOutput with dummy - values. This will be removed in PR 7/9. - https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer - """ - values = [ - sampler_output.sampled_token_probs, sampler_output.sampled_token_ids - ] - assert all(v is None for v in values) or not any(v is None for v in values) - if not any(v is None for v in values): - # Do nothing if the tensors are already created (usually in unit tests). - return - - # Softmax to ensure valid probs. - sampler_output.sampled_token_probs = torch.nn.functional.softmax( - torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device), - dim=-1) - - sampler_output.sampled_token_ids = torch.randint(low=10, - high=100, - size=(batch_size, ), - dtype=torch.long, - device=device) - - -@contextmanager -def nvtx_range(msg, *args, **kwargs): - """ - Context manager / decorator that pushes an NVTX range at the beginning - of its scope, and pops it at the end. If extra arguments are given, - they are passed as arguments to msg.format(). - - If running with cuda graphs, you must enable nsys cuda graph profiling. 
- - Arguments: - msg (string): message to associate with the range - """ - if current_platform.is_cuda_alike(): - torch.cuda.nvtx.range_push(msg.format(*args, **kwargs)) - try: - yield - finally: - torch.cuda.nvtx.range_pop() - else: - yield - - -class Timer: - """Basic timer context manager for measuring CPU time. - """ - - def __enter__(self): - self.start_time = time.time() - return self - - def __exit__(self, exc_type, exc_value, traceback): - self.end_time = time.time() - self.elapsed_time_s = self.end_time - self.start_time - self.elapsed_time_ms = self.elapsed_time_s * 1000 diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py index fb2e8a1df705..5445a333c493 100644 --- a/vllm/transformers_utils/configs/eagle.py +++ b/vllm/transformers_utils/configs/eagle.py @@ -6,7 +6,6 @@ from transformers import AutoConfig, PretrainedConfig -import vllm.envs as envs from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config @@ -44,28 +43,25 @@ def __init__(self, self.truncated_vocab_size = self.model.vocab_size if \ truncated_vocab_size is None else truncated_vocab_size - if not envs.VLLM_USE_V1: - kwargs["architectures"] = ["EAGLEModel"] + # Eagle model name should follow naming convention of + # LlamaForCausalLM -> EagleLlamaForCausalLM + if method == "eagle": + assert self.model is not None, \ + "model should not be None when method is eagle" + kwargs["architectures"] = [ + f"Eagle{arch}" if not arch.startswith("Eagle") \ + else arch for arch in self.model.architectures + ] + elif method == "eagle3": + assert self.model is not None, \ + "model should not be None when method is eagle3" + kwargs["architectures"] = [ + f"Eagle3{arch}" if not arch.startswith("Eagle3") \ + else arch for arch in self.model.architectures + ] else: - # Eagle model name should follow naming convention of - # LlamaForCausalLM -> EagleLlamaForCausalLM - if method == "eagle": - assert self.model is not None, \ - "model should not be None when method is eagle" - kwargs["architectures"] = [ - f"Eagle{arch}" if not arch.startswith("Eagle") \ - else arch for arch in self.model.architectures - ] - elif method == "eagle3": - assert self.model is not None, \ - "model should not be None when method is eagle3" - kwargs["architectures"] = [ - f"Eagle3{arch}" if not arch.startswith("Eagle3") \ - else arch for arch in self.model.architectures - ] - else: - raise ValueError(f"Invalid method {method}. \ - Supported methods are eagle and eagle3.") + raise ValueError(f"Invalid method {method}. 
\ + Supported methods are eagle and eagle3.") super().__init__(**kwargs) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index c382b29ad199..55705062d396 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -397,8 +397,6 @@ def execute_model( model_input, worker_input, kwargs = inputs num_steps = worker_input.num_steps - if execute_model_req is not None and execute_model_req.spec_step_idx: - kwargs["spec_step_idx"] = execute_model_req.spec_step_idx self.execute_worker(worker_input) From dcc6cfb991cd76369aad96e04424f29c8fecdbd8 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 19 Jul 2025 11:39:51 +0530 Subject: [PATCH 09/57] [Kernel][Performance] Tweak MoE Batched silu_mul_fp8_quant_deep_gemm kernel (#21193) Signed-off-by: Varun Sundar Rabindranath Co-authored-by: Varun Sundar Rabindranath --- .../layers/fused_moe/batched_deep_gemm_moe.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py index 628aa5c7bb06..3ccddb52998b 100644 --- a/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py @@ -55,6 +55,7 @@ def _silu_mul_fp8_quant_deep_gemm( # Meta --------------------------------------------------------------- BLOCK: tl.constexpr, + NUM_STAGES: tl.constexpr, ): G = H // GROUP_SIZE @@ -73,8 +74,7 @@ def _silu_mul_fp8_quant_deep_gemm( cols = cols.to(tl.int64) mask_h = cols < BLOCK - t = tl.zeros([], tl.int64) - while t < n_tokens: + for t in tl.range(0, n_tokens, num_stages=NUM_STAGES): base_i_offset = (e * stride_i_e + t * stride_i_t + g * GROUP_SIZE * stride_i_h) base_yq_offset = (e * stride_yq_e + t * stride_yq_t + @@ -102,8 +102,6 @@ def _silu_mul_fp8_quant_deep_gemm( tl.store(y_q_ptr + base_yq_offset + cols * stride_yq_h, y_q, mask=mask) tl.store(y_s_ptr + base_ys_offset, y_s) - t += 1 - def silu_mul_fp8_quant_deep_gemm( y: torch.Tensor, # (E, T, 2*H) float32 @@ -180,7 +178,8 @@ def silu_mul_fp8_quant_deep_gemm( fp8_max, is_blackwell_deep_gemm_used(), BLOCK=group_size, - num_warps=4, + NUM_STAGES=8, + num_warps=1, ) return y_q, y_s From 468e2400feff561a7e8b5d4c455612662448fe72 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 19 Jul 2025 02:18:48 -0400 Subject: [PATCH 10/57] [BugFix][CPU] Fix `TorchSDPABackendImpl` doesn't have `use_irope` (#21200) Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9620bf6a7957..47b14d076ea6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2668,7 +2668,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: # TODO: Support other attention modules, e.g., cross-attention if attn_module.attn_type == AttentionType.DECODER: use_local_attention = (self.attention_chunk_size is not None - and attn_module.impl.use_irope) + and getattr(attn_module.impl, + "use_irope", False)) if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, From 37bd8d6e4c6e37e11ac69cc8844c57ab45dcee3c Mon Sep 17 00:00:00 2001 From: Wentao Ye <44945378+yewentao256@users.noreply.github.com> Date: Sat, 19 Jul 2025 02:25:22 -0400 Subject: [PATCH 11/57] [Bug] DeepGemm: Fix TypeError: per_block_cast_to_fp8() missing 1 required positional 
argument: 'use_ue8m0' for SM100 (#21187) Signed-off-by: yewentao256 --- vllm/utils/deep_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 56326c9315ba..8b5713e02c95 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -99,7 +99,7 @@ def fp8_m_grouped_gemm_nt_masked(*args, **kwargs): def per_block_cast_to_fp8(x, *args, **kwargs): if _per_block_cast_impl is not None and is_blackwell_deep_gemm_used(): - return _per_block_cast_impl(x) + return _per_block_cast_impl(x, use_ue8m0=True) # TODO: refactor the `per_block_cast_to_fp8` from tests to vllm utils from tests.kernels.quant_utils import per_block_cast_to_fp8 as _pbcf return _pbcf(x, *args, **kwargs) From 3e04107d97aeb6360fcfb684665b66c94135079b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=A2=85=EA=B3=A4?= <149566442+Deepfocused@users.noreply.github.com> Date: Sat, 19 Jul 2025 15:25:44 +0900 Subject: [PATCH 12/57] [Model] EXAONE 4.0 model support (#21060) Signed-off-by: Deepfocused Signed-off-by: woongsik --- docs/models/supported_models.md | 1 + tests/models/registry.py | 1 + vllm/model_executor/models/exaone4.py | 547 ++++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + vllm/transformers_utils/config.py | 8 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/exaone4.py | 252 +++++++++ 7 files changed, 809 insertions(+), 3 deletions(-) create mode 100644 vllm/model_executor/models/exaone4.py create mode 100644 vllm/transformers_utils/configs/exaone4.py diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index cfd525ab9314..887f754a3d1c 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -331,6 +331,7 @@ Specified using `--task generate`. | `Ernie4_5_ForCausalLM` | Ernie4.5 | `baidu/ERNIE-4.5-0.3B-PT`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Ernie4_5_MoeForCausalLM` | Ernie4.5MoE | `baidu/ERNIE-4.5-21B-A3B-PT`, `baidu/ERNIE-4.5-300B-A47B-PT`, etc. |✅︎| ✅︎ | ✅︎ | | `ExaoneForCausalLM` | EXAONE-3 | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Exaone4ForCausalLM` | EXAONE-4 | `LGAI-EXAONE/EXAONE-4.0-32B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Fairseq2LlamaForCausalLM` | Llama (fairseq2 format) | `mgleize/fairseq2-dummy-Llama-3.2-1B`, etc. | ✅︎ | ✅︎ | ✅︎ | | `FalconForCausalLM` | Falcon | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. | | ✅︎ | ✅︎ | | `FalconMambaForCausalLM` | FalconMamba | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. 
| | ✅︎ | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 3ffa7f81a1ad..095e6f590119 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -169,6 +169,7 @@ def check_available_online( "Ernie4_5_MoeForCausalLM": _HfExamplesInfo("baidu/ERNIE-4.5-21B-A3B-PT", trust_remote_code=True), "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "Exaone4ForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-4.0-32B"), # noqa: E501 "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"), # noqa: E501 "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-0.5B-Base", diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py new file mode 100644 index 000000000000..97aeb6fd7b17 --- /dev/null +++ b/vllm/model_executor/models/exaone4.py @@ -0,0 +1,547 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +# Adapted from +# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/modeling_exaone4.py +# Copyright 2025 The LG CNS Gen AI Solution Delivery Team. +# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Exaone model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn + +from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.exaone4 import Exaone4Config + +from .interfaces import SupportsLoRA, SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + + +class Exaone4GatedMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + input_size=hidden_size, + output_sizes=[intermediate_size] * 2, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", + ) + self.down_proj = RowParallelLinear( + input_size=intermediate_size, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.down_proj", + ) + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Exaone4Attention(nn.Module): + + def __init__( + self, + config: Exaone4Config, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 1000000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 8192, + quant_config: Optional[QuantizationConfig] = None, + bias: bool = False, + cache_config: Optional[CacheConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. 
+ assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + # MistralConfig has an optional head_dim introduced by Mistral-Nemo + self.head_dim = getattr(config, "head_dim", None) + if self.head_dim is None: + self.head_dim = self.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.qkv_proj = QKVParallelLinear( + hidden_size=hidden_size, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_kv_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + input_size=self.total_num_heads * self.head_dim, + output_size=hidden_size, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.q_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=config.rms_norm_eps) + + is_neox_style = True + if quant_config is not None and quant_config.get_name() == "gguf": + is_neox_style = False + + self.apply_all_layers = False # apply rotary embeddings to every layer. + layer_idx = extract_layer_index(prefix) + interleaved_sliding_window = getattr(config, + "interleaved_sliding_window", + 4096) + sliding_window_pattern = getattr(config, "sliding_window_pattern", + "LLLG") + + if sliding_window_pattern: + layer_has_sliding_window = ( + layer_idx + 1) % sliding_window_pattern.__len__() != 0 + else: + layer_has_sliding_window = False + self.apply_all_layers = True + + if layer_has_sliding_window: + self.sliding_window = interleaved_sliding_window + else: + self.sliding_window = None + + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + is_neox_style=is_neox_style, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + per_layer_sliding_window=self.sliding_window, + prefix=f"{prefix}.attn", + ) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + q = q.unflatten(-1, (self.num_heads, self.head_dim)) + q = self.q_norm(q) + q = q.flatten(-2, -1) + k = k.unflatten(-1, (self.num_kv_heads, self.head_dim)) + k = self.k_norm(k) + k = k.flatten(-2, -1) + + if self.sliding_window or self.apply_all_layers: + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Exaone4DecoderLayer(nn.Module): + + def __init__( + self, + config: Exaone4Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 1000000) + rope_scaling = getattr(config, "rope_scaling", None) + if rope_scaling is not None and getattr( + config, "original_max_position_embeddings", None): + rope_scaling["original_max_position_embeddings"] = ( + config.original_max_position_embeddings) + max_position_embeddings = getattr(config, "max_position_embeddings", + 
8192) + # Support abacusai/Smaug-72B-v0.1 with attention_bias + # Support internlm/internlm-7b with bias + attention_bias = getattr(config, "attention_bias", False) or getattr( + config, "bias", False) + + self.self_attn = Exaone4Attention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=getattr(config, "num_key_value_heads", + config.num_attention_heads), + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + quant_config=quant_config, + bias=attention_bias, + cache_config=cache_config, + prefix=f"{prefix}.self_attn", + ) + self.mlp = Exaone4GatedMLP( + hidden_size=self.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + bias=getattr(config, "mlp_bias", False), + prefix=f"{prefix}.mlp", + ) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_feedforward_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + residual = hidden_states + + # Self Attention + hidden_states = self.self_attn( + positions=positions, + hidden_states=hidden_states, + ) + + # Use post-LN + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = residual + hidden_states + + residual = hidden_states + + # Fully Connected + hidden_states = self.mlp(hidden_states) + + # Use post-LN + hidden_states = self.post_feedforward_layernorm(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states, residual + + +@support_torch_compile +class Exaone4Model(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.quant_config = quant_config + lora_vocab = ((lora_config.lora_extra_vocab_size * + (lora_config.max_loras or 1)) if lora_config else 0) + self.vocab_size = config.vocab_size + lora_vocab + if get_pp_group().is_first_rank or (config.tie_word_embeddings + and get_pp_group().is_last_rank): + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + quant_config=quant_config, + ) + else: + self.embed_tokens = PPMissingLayer() + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Exaone4DecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ), + prefix=f"{prefix}.layers", + ) + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + 
hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for layer in self.layers[self.start_layer:self.end_layer]: + hidden_states, residual = layer( + positions, + hidden_states, + residual, + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states = self.norm(hidden_states) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if ("rotary_emb.cos_cached" in name + or "rotary_emb.sin_cached" in name): + # Models trained using ColossalAI may include these tensors in + # the checkpoint. Skip them. + continue + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Remapping the name of FP8 kv-scale. 
+ name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Exaone4ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + embedding_modules = { + "embed_tokens": "input_embeddings", + "lm_head": "output_embeddings", + } + embedding_padding_modules = ["lm_head"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + + self.config = config + self.lora_config = lora_config + self.quant_config = quant_config + + self.model = Exaone4Model( + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), + ) + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = config.vocab_size + if lora_config: + self.unpadded_vocab_size += lora_config.lora_extra_vocab_size + self.lm_head = ParallelLMHead( + self.unpadded_vocab_size, + config.hidden_size, + org_num_embeddings=config.vocab_size, + padding_size=DEFAULT_VOCAB_PADDING_SIZE + # We need bigger padding if using lora for kernel + # compatibility + if not lora_config else lora_config.lora_vocab_padding_size, + quant_config=quant_config, + ) + if config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, + logit_scale) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + model_output = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + # With tie_word_embeddings, we can skip lm_head.weight + # The weight might appear unnecessarily in the files if the model is + # processed with quantization, LoRA, fine-tuning, etc. 
+ skip_prefixes=(["lm_head."] + if self.config.tie_word_embeddings else None), + ) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index d5233c28b19b..2ca37867b88c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -57,6 +57,7 @@ "Ernie4_5_ForCausalLM": ("ernie45", "Ernie4_5_ForCausalLM"), "Ernie4_5_MoeForCausalLM": ("ernie45_moe", "Ernie4_5_MoeForCausalLM"), "ExaoneForCausalLM": ("exaone", "ExaoneForCausalLM"), + "Exaone4ForCausalLM": ("exaone4", "Exaone4ForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), "Fairseq2LlamaForCausalLM": ("fairseq2_llama", "Fairseq2LlamaForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index dc35d212766c..2e66dc16b47a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -31,9 +31,10 @@ # yapf: disable from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, DbrxConfig, DeepseekVLV2Config, - EAGLEConfig, ExaoneConfig, - JAISConfig, KimiVLConfig, - MedusaConfig, MiniMaxText01Config, + EAGLEConfig, Exaone4Config, + ExaoneConfig, JAISConfig, + KimiVLConfig, MedusaConfig, + MiniMaxText01Config, MiniMaxVL01Config, MllamaConfig, MLPSpeculatorConfig, MPTConfig, NemotronConfig, NVLM_D_Config, @@ -87,6 +88,7 @@ def _get_hf_token() -> Optional[str]: "medusa": MedusaConfig, "eagle": EAGLEConfig, "exaone": ExaoneConfig, + "exaone4": Exaone4Config, "minimax_text_01": MiniMaxText01Config, "minimax_vl_01": MiniMaxVL01Config, "nemotron": NemotronConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 734f1e09d0fd..5d84d648f1c5 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -7,6 +7,7 @@ from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.exaone import ExaoneConfig +from vllm.transformers_utils.configs.exaone4 import Exaone4Config # RWConfig is for the original tiiuae/falcon-40b(-instruct) and # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. @@ -40,6 +41,7 @@ "MedusaConfig", "EAGLEConfig", "ExaoneConfig", + "Exaone4Config", "MiniMaxText01Config", "MiniMaxVL01Config", "MllamaConfig", diff --git a/vllm/transformers_utils/configs/exaone4.py b/vllm/transformers_utils/configs/exaone4.py new file mode 100644 index 000000000000..a22ebaa6bd6b --- /dev/null +++ b/vllm/transformers_utils/configs/exaone4.py @@ -0,0 +1,252 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +# Copied from +# https://github.com/lgai-exaone/transformers/blob/add-exaone4/src/transformers/models/exaone4/configuration_exaone4.py +# Copyright 2025 The LG CNS Gen AI Solution Delivery Team. +# Copyright 2025 The LG AI Research and HuggingFace Inc. team. All rights reserved. +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from transformers.configuration_utils import (PretrainedConfig, + layer_type_validation) +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +def check_is_sliding(config, layer_idx): + """ + Check if the current layer is a sliding window attention (local attention) layer. + """ + if config.sliding_window is None: + return False + if config.layer_types is not None: + return config.layer_types[layer_idx] == "sliding_attention" + if isinstance(config.sliding_window_pattern, int): + return ((layer_idx + 1) % config.sliding_window_pattern) != 0 + elif isinstance(config.sliding_window_pattern, str): + assert isinstance(config.sliding_window, int), ( + f"Sliding window must be positive integer, but got {config.sliding_window}" + ) + return (layer_idx != config.num_hidden_layers - 1 + and config.sliding_window_pattern[layer_idx % len( + config.sliding_window_pattern)] == "L") + else: + logger.warning_once( + "Sliding window is set, but none of `sliding_window_pattern` or `layer_types` is set. " + "Defaulting to use 'full_attention' for all layers.") + return False + + +class Exaone4Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to + instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the EXAONE-4.0-Instruct [LGAI-EXAONE/EXAONE-4.0-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-Instruct) + NOTE: `EXAONE-4.0-Instruct` is a placeholder model ID. The exact model ID will be updated in the future. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model + outputs. Read the documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 102400): + Vocabulary size of the EXAONE 4.0 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Exaone4Model`]. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to `hidden_size * 4`): + Dimensionality of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). 
If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 32768 for EXAONE 3.5). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``config.is_decoder=True``. + bos_token_id (`int`, *optional*, defaults to 0): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. 
Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + sliding_window (`int`, *optional*): + The size of the sliding window for the sliding window attention. + sliding_window_pattern (`str`, *optional*): + The pattern to use for sliding window attention. Can be one of: + - `None`: No sliding window attention is used + - `int`: Every `sliding_window` layers, use global attention, else use local attention. + - `str`: A sequence of "L" (local attention) and "G" (global attention) characters that defines the + attention pattern. The pattern starts from layer 0 and repeats every `sliding_window` layers. The + final layer always uses global attention regardless of the pattern. + For instance, sliding_window_pattern="LLLG" same as sliding_window=4, which means: + - Layer 0, 1, 2: local attention, + - Layer 3: global attention, + ...(repeated) + layer_types (`list`, *optional*): + Attention pattern for each layer. Prioritized over `sliding_window_pattern`. + + Example: + + ```python + >>> from transformers import Exaone4Model, Exaone4Config + + >>> # Initializing a EXAONE configuration + >>> configuration = Exaone4Config() + + >>> # Initializing a model from configuration + >>> model = Exaone4Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "exaone4" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `LlamaModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=102400, + hidden_size=4096, + intermediate_size=None, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + bos_token_id=0, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_dropout=0.0, + sliding_window=None, + sliding_window_pattern=None, + layer_types=None, + **kwargs, + ): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + if intermediate_size: + self.intermediate_size = intermediate_size + else: + self.intermediate_size = hidden_size * 4 + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_dropout = attention_dropout + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.sliding_window = sliding_window + self.sliding_window_pattern = sliding_window_pattern + + 
self.layer_types = layer_types + if self.layer_types is None: + self.layer_types = [ + "sliding_attention" + if check_is_sliding(self, i) else "full_attention" + for i in range(self.num_hidden_layers) + ] + layer_type_validation(self.layer_types) + + super().__init__(bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs) + + +__all__ = ["Exaone4Config"] From 3a2cb2649d15021f48901acbddb872671478a1f2 Mon Sep 17 00:00:00 2001 From: Chenyaaang <42742451+Chenyaaang@users.noreply.github.com> Date: Sat, 19 Jul 2025 02:06:59 -0700 Subject: [PATCH 13/57] [Misc][Tools][Benchmark] Add readme file for auto_tune script (#20779) Signed-off-by: Chenyaaang --- benchmarks/auto_tune/README.md | 137 ++++++++++++++++++++++++ benchmarks/{ => auto_tune}/auto_tune.sh | 31 +----- 2 files changed, 138 insertions(+), 30 deletions(-) create mode 100644 benchmarks/auto_tune/README.md rename benchmarks/{ => auto_tune}/auto_tune.sh (81%) diff --git a/benchmarks/auto_tune/README.md b/benchmarks/auto_tune/README.md new file mode 100644 index 000000000000..7732f50b1d22 --- /dev/null +++ b/benchmarks/auto_tune/README.md @@ -0,0 +1,137 @@ +# Automated vLLM Server Parameter Tuning + +This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate. + +## Table of Contents +- [Prerequisites](#prerequisites) +- [Configuration](#configuration) +- [How to Run](#how-to-run) +- [Example Use Cases](#example-use-cases) +- [Output](#output) +- [How It Works](#how-it-works) + +## Prerequisites + +Before running the script, please ensure the following steps are completed: + +1. **Clone vLLM & Set Up Branch**: Clone the vLLM repository and check out to your desired branch. + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +# git checkout +``` + +1. **Install Environment**: Install or update the correct running environment. For TPU usage, activate your `conda` environment and install the corresponding `torch` and `torch_xla` versions. + +2. **Model Configuration**: If you are using a customized model, ensure its configuration files are correctly placed and accessible. + +## Configuration + +You must set the following variables at the top of the script before execution. + +| Variable | Description | Example Value | +| --- | --- | --- | +| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` | +| `MODEL` | **Required.** The Hugging Face model identifier to be served by vllm. | `"meta-llama/Llama-3.1-8B-Instruct"` | +| `SYSTEM`| **Required.** The hardware you are running on. Choices: `TPU` or `GPU`. (For other systems, it might not support saving profiles) | `"TPU"` | +| `TP` | **Required.** The tensor-parallelism size. | `1` | +| `DOWNLOAD_DIR` | **Required.** Directory to download and load model weights from. | `""` (default download path) | +| `INPUT_LEN` | **Required.** Request input length. | `4000` | +| `OUTPUT_LEN` | **Required.** Request output length. | `16` | +| `MIN_CACHE_HIT_PCT` | Prefix cache hit rate in percentage (0-100). Set to `0` to disable. | `60` | +| `MAX_LATENCY_ALLOWED_MS` | The maximum allowed P99 end-to-end latency in milliseconds. Set to a very large number (e.g., `100000000000`) to effectively ignore the latency constraint. 
| `500` | +| `NUM_SEQS_LIST` | A space-separated string of `max-num-seqs` values to test. | `"128 256"` | +| `NUM_BATCHED_TOKENS_LIST` | A space-separated string of `max-num-batched-tokens` values to test. | `"1024 2048 4096"` | + +**Note**: The default `NUM_SEQS_LIST` and `NUM_BATCHED_TOKENS_LIST` are set for medium-sized inputs/outputs. For very short contexts (e.g., 20 input, 20 output tokens), you may need to test larger values for `max-num-seqs`. + +## How to Run + +1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section. +2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost. + +``` +cd +bash auto_tune.sh +``` + + Please note that the `bash auto_tune.sh` command cannot contain full or partial path with keyword `vllm`, otherwise `pkill -f vllm` command will also kill this script itself. + +## Example Use Cases + +Here are a few examples of how to configure the script for different goals: + +### 1. Maximize Throughput (No Latency Constraint) +- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number +``` + +#### 2. Maximize Throughput with a Latency Requirement +- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=0 +MAX_LATENCY_ALLOWED_MS=500 +``` + +#### 3. Maximize Throughput with Prefix Caching and Latency Requirements +- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms. +- **Configuration**: + +```bash +INPUT_LEN=1800 +OUTPUT_LEN=20 +MIN_CACHE_HIT_PCT=60 +MAX_LATENCY_ALLOWED_MS=500 +``` + +## Output + +After the script finishes, you will find the results in a new, timestamped directory created inside `$BASE/auto-benchmark/`. + +- **Log Files**: The directory (`$BASE/auto-benchmark/YYYY_MM_DD_HH_MM/`) contains detailed logs for each run: + - `vllm_log_...txt`: The log output from the vLLM server for each parameter combination. + - `bm_log_...txt`: The log output from the `benchmark_serving.py` script for each benchmark run. + +- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found. + +``` +# Example result.txt content +hash:a1b2c3d4... +max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8 +max_num_seqs: 128, max_num_batched_tokens: 4096 does not meet latency requirement 500 +... +best_max_num_seqs: 256, best_num_batched_tokens: 2048, best_throughput: 12.5, profile saved in: /home/user/vllm/auto-benchmark/2024_08_01_10_30/profile +``` + + If it cannot find the best parameters, the final row will be `best_max_num_seqs: 0, best_num_batched_tokens: 0, best_throughput: 0`. This can be due to either the server not starting properly, or the latency requirement being too strict. + +- **Profiler Trace**: A directory named `profile` is created inside the log directory. 
It contains the profiler trace file (e.g., `.xplane.pb` for TPU or a `.json` trace for GPU) from the single best-performing run. + +## How It Works + +The script follows a systematic process to find the optimal parameters: + +1. **Find Max GPU Memory Utilization**: The script first determines the highest safe `gpu-memory-utilization` (starting from 0.98 and decreasing) that does not cause an Out-Of-Memory (OOM) error when launching the server. This ensures the benchmark runs use the maximum available memory without crashing. + +2. **Iterate and Benchmark**: It then enters a nested loop, iterating through every combination of `max-num-seqs` and `max-num-batched-tokens` provided in the configuration lists. + +3. **Latency-Aware Throughput Search**: For each parameter combination: + - The vLLM server is started. + - A benchmark is first run with an infinite request rate (`--request-rate inf`). + - If the resulting P99 E2E latency is within the `MAX_LATENCY_ALLOWED_MS` limit, this throughput is considered the maximum for this configuration. + - If the latency is too high, the script performs a search by iteratively decreasing the request rate until the latency constraint is met. This finds the highest sustainable throughput for the given parameters and latency requirement. + +4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far. + +5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard. diff --git a/benchmarks/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh similarity index 81% rename from benchmarks/auto_tune.sh rename to benchmarks/auto_tune/auto_tune.sh index b257b57ce06f..159ee1421475 100644 --- a/benchmarks/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -1,36 +1,7 @@ #!/bin/bash # This script aims to tune the best server parameter combinations to maximize throughput for given requirement. -# The current server parameter combination is max_num_seqs and max_num_batched_tokens -# It also supports additional requirement: e2e latency and prefix cache. - -# Pre-requisite: -# 1. Checkout to your branch, install/ update the correct running env. For TPU, activate conda env and install the corresponding torch, xla version. -# 2. If the model is customized, replace the MODEL's config with the customized config. -# 3. Set variables (ALL REQUIRED) -# BASE: your directory for vllm repo -# MODEL: the model served by vllm -# SYSTEM: the hardware, choice TPU or GPU, for other systems, "get best profile" might not support. -# TP: ways of tensor parallelism -# DOWNLOAD_DIR: directory to download and load model weights. -# INPUT_LEN: request input len -# OUTPUT_LEN: request output len -# MIN_CACHE_HIT_PCT: prefix cache rate -# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000 -# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with. -# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with. -# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium size input/output len, for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST. -# 4. Run the script, it might take a long time, you can use tmux to avoid the script stop if disconnection happens. -# 5. The final result will be saved in RESULT file. 
- - -# Example use cases -# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get highest throughput? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000 -# 2. If we have latency requirement to be lower than 500ms, what's the best server parameter? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500 -# 3. If we want to reach 60% prefix cache, what's the best server parameter? -# Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500 +# See details in README (benchmarks/auto_tune/README.md). TAG=$(date +"%Y_%m_%d_%H_%M") BASE="" From cf8cc32674f30cc091b551ceb4decd79718ac9e5 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sat, 19 Jul 2025 02:13:41 -0700 Subject: [PATCH 14/57] Fix a couple of Voxtral tests (#21218) Signed-off-by: Huy Do --- tests/models/registry.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/models/registry.py b/tests/models/registry.py index 095e6f590119..5c546a6c86da 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -449,7 +449,11 @@ def check_available_online( tokenizer="Isotr0py/Florence-2-tokenizer", # noqa: E501 trust_remote_code=True), # noqa: E501 "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 - "VoxtralForConditionalGeneration": _HfExamplesInfo("mistralai/Voxtral-Mini-3B-2507", tokenizer_mode="mistral"), # noqa: E501 + "VoxtralForConditionalGeneration": _HfExamplesInfo( + "mistralai/Voxtral-Mini-3B-2507", + tokenizer_mode="mistral", + min_transformers_version="4.54" + ), "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 # [Cross-encoder] From 1eaff2781585ce17b4353059146591acd65719f9 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 19 Jul 2025 17:15:41 +0800 Subject: [PATCH 15/57] [V0 deprecation] Remove long context LoRA (#21169) Signed-off-by: Jee Jee Li --- tests/lora/conftest.py | 5 -- tests/lora/test_peft_helper.py | 11 ++- vllm/config.py | 14 +--- vllm/engine/arg_utils.py | 5 -- vllm/lora/layers.py | 90 ------------------------- vllm/lora/models.py | 80 +++------------------- vllm/lora/peft_helper.py | 9 --- vllm/lora/punica_wrapper/punica_base.py | 45 +++---------- vllm/lora/punica_wrapper/punica_gpu.py | 21 ++---- vllm/lora/punica_wrapper/punica_tpu.py | 14 ---- vllm/lora/punica_wrapper/utils.py | 38 ++--------- vllm/lora/utils.py | 2 - vllm/lora/worker_manager.py | 2 +- 13 files changed, 35 insertions(+), 301 deletions(-) diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 881d5efa6919..909b73933139 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -221,11 +221,6 @@ def phi2_lora_files(): return snapshot_download(repo_id="isotr0py/phi-2-test-sql-lora") -@pytest.fixture(scope="session") -def long_context_lora_files_16k_1(): - return snapshot_download(repo_id="SangBinCho/long_context_16k_testing_1") - - @pytest.fixture def llama_2_7b_engine_extra_embeddings(): cleanup_dist_env_and_memory(shutdown_ray=True) diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py index f16589e06b2d..df8696cf58e0 100644 --- a/tests/lora/test_peft_helper.py +++ b/tests/lora/test_peft_helper.py @@ -38,8 +38,8 @@ ] -def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): - peft_helper = PEFTHelper.from_local_dir(long_context_lora_files_16k_1, +def test_peft_helper_pass(sql_lora_files, tmp_path): + peft_helper = 
PEFTHelper.from_local_dir(sql_lora_files, max_position_embeddings=4096) lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2) peft_helper.validate_legal(lora_config) @@ -56,15 +56,12 @@ def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path): "embed_tokens", "lm_head", ] - assert peft_helper.context_length == 16384 assert peft_helper.vllm_max_position_embeddings == 4096 - assert peft_helper.vllm_long_context_scaling_factor == float( - math.ceil(peft_helper.context_length / - peft_helper.vllm_max_position_embeddings)) + # test RSLoRA rslora_config = dict(use_rslora=True) test_dir = tmp_path / "test_rslora" - shutil.copytree(long_context_lora_files_16k_1, test_dir) + shutil.copytree(sql_lora_files, test_dir) # Load and modify configuration config_path = test_dir / "adapter_config.json" diff --git a/vllm/config.py b/vllm/config.py index 8383a663c75e..384cb584fa9a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3014,12 +3014,7 @@ class LoRAConfig: (added to the base model vocabulary).""" lora_vocab_padding_size: ClassVar[int] = current_platform\ .get_lora_vocab_padding_size() - long_lora_scaling_factors: Optional[tuple[float, ...]] = None - """Specify multiple scaling factors (which can be different from base model - scaling factor - see eg. Long LoRA) to allow for multiple LoRA adapters - trained with those scaling factors to be used at the same time. If not - specified, only adapters trained with the base model scaling factor are - allowed.""" + default_mm_loras: Optional[dict[str, str]] = None """Dictionary mapping specific modalities to LoRA model paths; this field is only applicable to multimodal models and should be leveraged when a @@ -3052,7 +3047,6 @@ def compute_hash(self) -> str: factors.append(self.lora_dtype) factors.append(self.lora_extra_vocab_size) factors.append(self.lora_vocab_padding_size) - factors.append(self.long_lora_scaling_factors) factors.append(self.bias_enabled) hash_str = hashlib.md5(str(factors).encode(), usedforsecurity=False).hexdigest() @@ -3091,11 +3085,6 @@ def verify_with_model_config(self, model_config: ModelConfig): elif isinstance(self.lora_dtype, str): self.lora_dtype = getattr(torch, self.lora_dtype) - def verify_lora_support(self): - if self.long_lora_scaling_factors is not None and envs.VLLM_USE_V1: - raise ValueError( - "V1 LoRA does not support long LoRA, please use V0.") - @config @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) @@ -4564,7 +4553,6 @@ def __post_init__(self): if self.lora_config is not None: self.lora_config.verify_with_cache_config(self.cache_config) self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_lora_support() if self.prompt_adapter_config is not None: self.prompt_adapter_config.verify_with_model_config( self.model_config) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a7fcf6c354e8..d352a22a6d91 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -358,8 +358,6 @@ class EngineArgs: max_cpu_loras: Optional[int] = LoRAConfig.max_cpu_loras lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size - long_lora_scaling_factors: Optional[tuple[float, ...]] = \ - LoRAConfig.long_lora_scaling_factors # PromptAdapter fields enable_prompt_adapter: bool = False max_prompt_adapters: int = PromptAdapterConfig.max_prompt_adapters @@ -723,8 +721,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--lora-dtype", 
**lora_kwargs["lora_dtype"], ) - lora_group.add_argument("--long-lora-scaling-factors", - **lora_kwargs["long_lora_scaling_factors"]) lora_group.add_argument("--max-cpu-loras", **lora_kwargs["max_cpu_loras"]) lora_group.add_argument("--fully-sharded-loras", @@ -1245,7 +1241,6 @@ def create_engine_config( default_mm_loras=self.default_mm_loras, fully_sharded_loras=self.fully_sharded_loras, lora_extra_vocab_size=self.lora_extra_vocab_size, - long_lora_scaling_factors=self.long_lora_scaling_factors, lora_dtype=self.lora_dtype, max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else None) if self.enable_lora else None diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 779f02646843..c3512ec3dbd4 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -28,8 +28,6 @@ RowParallelLinear) # yapf: enable from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.rotary_embedding import ( - LinearScalingRotaryEmbedding, RotaryEmbedding) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.platforms import current_platform @@ -1193,91 +1191,3 @@ def can_replace_layer( ) -> bool: # Special handling for the LogitsProcessor. return False - - -class LinearScalingRotaryEmbeddingWithLoRA(BaseLayerWithLoRA): - """Implements RoPE-scaled embeddings with linear scaling for - multiple LoRA adapters with a specialized kernel. - - Replace LinearScalingRotaryEmbedding with MultiLinearScalingRotaryEmbedding - which can handle multi lora adapters in a specialized kernel. - """ - - def __init__(self, base_layer: RotaryEmbedding) -> None: - super().__init__() - self.base_layer = base_layer - - @property - def scaling_factors(self): - return self.base_layer.scaling_factors - - @property - def rotary_dim(self): - return self.base_layer.rotary_dim - - def create_lora_weights( - self, - max_loras: int, - lora_config: LoRAConfig, - model_config: Optional[PretrainedConfig] = None, - ) -> None: - scaling_factors = (list(lora_config.long_lora_scaling_factors) - if lora_config.long_lora_scaling_factors else []) - base_scaling_factor = (self.base_layer.scaling_factor if isinstance( - self.base_layer, LinearScalingRotaryEmbedding) else 1.0) - scaling_factors = sorted( - list(set([base_scaling_factor] + scaling_factors))) - self.base_layer = LinearScalingRotaryEmbedding( - self.base_layer.head_size, - self.base_layer.rotary_dim, - self.base_layer.max_position_embeddings, - self.base_layer.base, - self.base_layer.is_neox_style, - scaling_factors, - self.base_layer.dtype, - ) - - def reset_lora(self, index: int): - ... - - def set_lora( - self, - index: int, - lora_a: torch.Tensor, - lora_b: torch.Tensor, - embeddings_tensor: Optional[torch.Tensor], - bias: Optional[torch.Tensor] = None, - ): - ... 
- - def forward( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - ) -> tuple[torch.Tensor, torch.Tensor]: - return self.base_layer( - positions, - query, - key, - offsets=self.punica_wrapper.long_lora_indices, - ) - - @property - def scaling_factor_to_offset(self) -> dict[float, int]: - return self.base_layer.scaling_factor_to_offset - - @classmethod - def can_replace_layer( - cls, - source_layer: nn.Module, - lora_config: LoRAConfig, - packed_modules_list: list, - model_config: Optional[PretrainedConfig], - ) -> bool: - """Returns True if the layer can be replaced by this LoRA layer.""" - return (type(source_layer) is LinearScalingRotaryEmbedding - or type(source_layer) is RotaryEmbedding) - - def extra_repr(self) -> str: - return self.base_layer.extra_repr() diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 633674d5fb29..e6b19d4748f4 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,6 @@ import math import os from collections.abc import Sequence -from dataclasses import dataclass, field from typing import Any, Callable, Optional, Union import regex as re @@ -19,9 +18,7 @@ remove_adapter, set_adapter_mapping) from vllm.config import LoRAConfig from vllm.logger import init_logger -from vllm.lora.layers import (BaseLayerWithLoRA, - LinearScalingRotaryEmbeddingWithLoRA, - LoRAMapping) +from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.peft_helper import PEFTHelper from vllm.lora.punica_wrapper import get_punica_wrapper @@ -43,18 +40,6 @@ _GLOBAL_LORA_ID = 0 -@dataclass -class LongContextLoRAContext: - """Context for lora adapters that support long context.""" - # The scaling factors to support long context lora fine tuned models. - scaling_factors: list[float] - # dimension to apply rotary embedding. - rot_dim: int - # offsets to the sin_cos_cache for each lora_id loaded. - # This value is dynamically modified. - offsets_by_lora_id: dict[int, int] = field(default_factory=dict) - - def get_lora_id(): global _GLOBAL_LORA_ID _GLOBAL_LORA_ID += 1 @@ -80,20 +65,16 @@ def __init__( lora_model_id: int, rank: int, loras: dict[str, LoRALayerWeights], - scaling_factor: Optional[float] = None, ) -> None: """ Args: lora_model_id: The integer id for the lora model. rank: lora rank. loras: module name -> weights for lora-replaced layers. - scaling_factor: Scaling factor to support long context lora model. - None if the lora is not tuned for long context support. + """ self.id = lora_model_id - # Scaling factor for long context lora model. None if it is not - # fine tuned for the long context. 
- self.scaling_factor = scaling_factor + assert ( lora_model_id > 0), f"a valid lora id should be greater than 0, got {self.id}" @@ -192,10 +173,7 @@ def from_lora_tensors( for lora in loras.values(): lora.optimize() - return cls(lora_model_id, - peft_helper.r, - loras, - scaling_factor=peft_helper.vllm_long_context_scaling_factor) + return cls(lora_model_id, peft_helper.r, loras) @classmethod def from_local_checkpoint( @@ -360,24 +338,17 @@ def __init__( self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 self.lora_index_to_id: list[Optional[int]] = [None] * self.lora_slots self.vocab_size = vocab_size - self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = get_punica_wrapper( max_num_batched_tokens, max_batches=self.max_num_seqs, device=self.device, max_loras=self.lora_config.max_loras) - # Scaling factor -> offset to the sin_cos_cache to it. - # Used for long context lora. - self.scaling_factor_to_offset: dict[float, int] = {} + super().__init__(model) self.supported_lora_modules = get_supported_lora_modules(self.model) assert self.supported_lora_modules, "No supported LoRA modules found in" f" {self.model.__class__.__name__}." - if lora_config.long_lora_scaling_factors: - # We need to replace rotary emb layer to do batch computation - # for long lora. - self.supported_lora_modules.append("rotary_emb") self.packed_modules_mapping = get_packed_modules_mapping(self.model) # Used to indicate whether the model is a multimodal model @@ -454,25 +425,9 @@ def _deactivate_adapter(self, lora_id: int): except ValueError: pass - def _set_long_lora_context(self, lora: LoRAModel): - if self.long_lora_context is None: - return - - if lora.scaling_factor is None: - return - - if (lora.scaling_factor not in self.scaling_factor_to_offset): - raise ValueError(f"Long LoRA scaling factor {lora.scaling_factor}" - " has not been initialized.") - - offsets = self.scaling_factor_to_offset.get(lora.scaling_factor) - if offsets: - self.long_lora_context.offsets_by_lora_id[lora.id] = offsets - def _add_adapter(self, lora: LoRAModel): self._create_merged_loras_inplace(lora) self._registered_adapters[lora.id] = lora - self._set_long_lora_context(lora) def pin_adapter(self, lora_id: int) -> bool: """Pin a LoRAModel in the manager cache.""" @@ -488,7 +443,6 @@ def _set_adapter_mapping(self, mapping: LoRAMapping) -> None: self.lora_slots + 1, self.vocab_size, self.lora_config.lora_extra_vocab_size, - self.long_lora_context, ) def remove_all_adapters(self): @@ -528,13 +482,6 @@ def _parent_module(module_name: str) -> str: from_layer(module, self.lora_slots, self.lora_config, packed_moduled_lst, self.model.config)) - # LinearScalingRotaryEmbeddingWithLoRA is used to handle - # long context lora. Register relevant metadata. 
- if isinstance(new_module, LinearScalingRotaryEmbeddingWithLoRA): - self.long_lora_context = LongContextLoRAContext( - new_module.scaling_factors, new_module.rotary_dim) - self.scaling_factor_to_offset = \ - new_module.scaling_factor_to_offset # (yard1): TODO make this more robust if "lm_head" in module_name: logits_processor_module_name = 'logits_processor' @@ -574,15 +521,13 @@ def create_dummy_lora( self, lora_id: int, rank: int, - scaling_factor: Optional[float], embedding_modules: Optional[dict[str, str]] = None) -> LoRAModel: """Create zero-initialized LoRAModel for warmup.""" - model = LoRAModel(lora_id, rank, {}, scaling_factor) + model = LoRAModel(lora_id, rank, {}) for module_name, module in self.model.named_modules(): bias_enabled = self.lora_config.bias_enabled if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) - or isinstance(module, LinearScalingRotaryEmbeddingWithLoRA) or self._filter_unsupported_mm_module(module_name)): continue parts = module_name.split(".") @@ -723,11 +668,8 @@ def deactivate_adapter(self, adapter_id: int) -> bool: self._deactivate_adapter) def add_adapter(self, adapter: LoRAModel) -> bool: - logger.debug( - "Adding lora. Model id: %d, " - "int id: %d, " - "scaling factor: %s", adapter.id, adapter.id, - adapter.scaling_factor) + logger.debug("Adding lora. Model id: %d, " + "int id: %d", adapter.id, adapter.id) return add_adapter(adapter, self._registered_adapters, self.capacity, self._add_adapter) @@ -772,10 +714,8 @@ def list_adapters(self) -> dict[int, LoRAModel]: def add_adapter(self, lora: LoRAModel) -> bool: """Add a LoRAModel to the manager.""" - logger.debug( - "Adding lora. Model id: %d, " - "int id: %d, " - "scaling factor: %s", lora.id, lora.id, lora.scaling_factor) + logger.debug("Adding lora. 
Model id: %d, " + "int id: %d", lora.id, lora.id) if lora.id not in self._registered_adapters: self._add_adapter(lora) was_added = True diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index 24099bf479de..8b8e5cb7d5fa 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -35,12 +35,9 @@ class PEFTHelper: use_rslora: bool = field(default=False) # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) use_dora: bool = field(default=False) - # long context lora field - context_length: int = field(default=0) # Extra vllm field, start with 'vllm_' to avoid conflict vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) - vllm_long_context_scaling_factor: Optional[float] = field(default=None) def _validate_features(self) -> list[str]: """ @@ -59,12 +56,6 @@ def __post_init__(self): self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) else: self.vllm_lora_scaling_factor = self.lora_alpha / self.r - if self.context_length: - if self.vllm_max_position_embeddings is None: - self.vllm_max_position_embeddings = self.context_length - self.vllm_long_context_scaling_factor = float( - math.ceil(self.context_length / - self.vllm_max_position_embeddings)) @classmethod def from_dict(cls, config_dict: dict) -> "PEFTHelper": diff --git a/vllm/lora/punica_wrapper/punica_base.py b/vllm/lora/punica_wrapper/punica_base.py index 5b4902dcbeb3..b3413de1c816 100644 --- a/vllm/lora/punica_wrapper/punica_base.py +++ b/vllm/lora/punica_wrapper/punica_base.py @@ -17,7 +17,6 @@ if TYPE_CHECKING: # avoid circuit import from vllm.lora.layers import LoRAMapping - from vllm.lora.models import LongContextLoRAContext class PunicaWrapperABC(ABC): @@ -33,7 +32,6 @@ def update_metadata( max_loras: int, vocab_size: int, extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, **kwargs, ) -> None: """ @@ -144,14 +142,11 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, max_num_batched_tokens, dtype=torch.long, device=device) - self._long_lora_indices = torch.empty(max_num_batched_tokens, - dtype=torch.long, - device=device) - # 5 is the number of indices tensors. + # 4 is the number of indices tensors. # base_indices, sampler_indices, sampler_indices_padded, - # embeddings_indices,long_lora_indices - self.indices_len: list[Optional[int]] = [None] * 5 + # embeddings_indices + self.indices_len: list[Optional[int]] = [None] * 4 # these attributes are the information required for sgmv kernel self._seq_start_locs = torch.empty(max_batches, dtype=torch.long, @@ -176,14 +171,12 @@ def _update_base_metadata( max_loras: int, vocab_size: int, extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, ): ( base_indices, sampler_indices, sampler_indices_padded, embeddings_indices, - long_lora_offsets_tensor, indices_len, ) = convert_mapping( mapping, @@ -192,7 +185,6 @@ def _update_base_metadata( vocab_size, extra_vocab_size, self.device, - long_lora_context, ) self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices) @@ -201,11 +193,7 @@ def _update_base_metadata( self._embeddings_indices[:embeddings_indices. 
shape[0], :embeddings_indices.shape[1]].copy_( embeddings_indices) - if long_lora_offsets_tensor is not None: - self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_( - long_lora_offsets_tensor) - else: - self._long_lora_indices.zero_() + self.indices_len[:] = indices_len def _update_prefill_metadata(self, @@ -312,28 +300,13 @@ def embeddings_indices(self) -> torch.Tensor: embeddings_indices_len = self.indices_len[3] return self._embeddings_indices[:, :embeddings_indices_len] - @property - def long_lora_indices(self) -> torch.Tensor: - """ - This property provides access to the indices used for long context - lora, specifically for LinearScalingRotaryEmbeddingWithLoRA. - """ - long_lora_len = self.indices_len[4] - return self._long_lora_indices[:long_lora_len] - - def update_metadata( - self, - mapping: "LoRAMapping", - lora_index_to_id: list[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, - **kwargs): + def update_metadata(self, mapping: "LoRAMapping", + lora_index_to_id: list[Optional[int]], max_loras: int, + vocab_size: int, extra_vocab_size: int, **kwargs): self._update_base_metadata(mapping, lora_index_to_id, max_loras, - vocab_size, extra_vocab_size, - long_lora_context) + vocab_size, extra_vocab_size) + if mapping.is_prefill: # Update metadata required for prefill-related operators. self._update_prefill_metadata(self.token_lora_indices) diff --git a/vllm/lora/punica_wrapper/punica_gpu.py b/vllm/lora/punica_wrapper/punica_gpu.py index 6b038309d55d..2db0e9fee142 100644 --- a/vllm/lora/punica_wrapper/punica_gpu.py +++ b/vllm/lora/punica_wrapper/punica_gpu.py @@ -7,7 +7,7 @@ https://arxiv.org/abs/2310.18547 """ -from typing import TYPE_CHECKING, Optional, Union, final +from typing import Optional, Union, final import torch @@ -21,10 +21,6 @@ from .punica_base import PunicaWrapperBase -if TYPE_CHECKING: - # avoid circuit import - from vllm.lora.models import LongContextLoRAContext - @final class PunicaWrapperGPU(PunicaWrapperBase): @@ -55,20 +51,13 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, max_num_prompts, device=device) - def update_metadata( - self, - mapping: LoRAMapping, - lora_index_to_id: list[Optional[int]], - max_loras: int, - vocab_size: int, - extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, - **kwargs): + def update_metadata(self, mapping: LoRAMapping, + lora_index_to_id: list[Optional[int]], max_loras: int, + vocab_size: int, extra_vocab_size: int, **kwargs): self.is_prefill = mapping.is_prefill self._update_base_metadata(mapping, lora_index_to_id, max_loras, - vocab_size, extra_vocab_size, - long_lora_context) + vocab_size, extra_vocab_size) # Prepare cuda kernel metadata tensors self.token_mapping_meta.prepare_tensors(self.token_lora_indices) diff --git a/vllm/lora/punica_wrapper/punica_tpu.py b/vllm/lora/punica_wrapper/punica_tpu.py index 6b48268c5006..07dc337a1cc8 100644 --- a/vllm/lora/punica_wrapper/punica_tpu.py +++ b/vllm/lora/punica_wrapper/punica_tpu.py @@ -14,7 +14,6 @@ if TYPE_CHECKING: # avoid circuit import from vllm.lora.layers import LoRAMapping - from vllm.lora.models import LongContextLoRAContext from .punica_base import PunicaWrapperBase @@ -45,7 +44,6 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, torch.ops.xla.dynamo_set_buffer_donor_(self._sampler_indices_padded, True) torch.ops.xla.dynamo_set_buffer_donor_(self._embeddings_indices, True) - 
torch.ops.xla.dynamo_set_buffer_donor_(self._long_lora_indices, True) torch.ops.xla.dynamo_set_buffer_donor_(self._lora_indices_per_batch, True) @@ -323,7 +321,6 @@ def _update_base_metadata( max_loras: int, vocab_size: int, extra_vocab_size: int, - long_lora_context: Optional["LongContextLoRAContext"] = None, ): # Make sure we don't accidentally collect outside operations xm.mark_step() @@ -339,7 +336,6 @@ def _update_base_metadata( sampler_indices, sampler_indices_padded, embeddings_indices, - long_lora_offsets_tensor, indices_len, ) = convert_mapping( mapping, @@ -348,7 +344,6 @@ def _update_base_metadata( vocab_size, extra_vocab_size, "cpu", - long_lora_context, ) self._token_lora_indices = self._pad_to_shape( base_indices, self._token_lora_indices.shape, @@ -362,15 +357,6 @@ def _update_base_metadata( self._embeddings_indices = self._pad_to_shape( embeddings_indices, self._embeddings_indices.shape, dims=2).to(self.device) - if long_lora_offsets_tensor is not None: - self._long_lora_indices = self._pad_to_shape( - long_lora_offsets_tensor, - self._long_lora_indices.shape, - dims=1).to(self.device) - else: - zeroed = torch.zeros_like(self._long_lora_indices.cpu(), - dtype=torch.int32) - self._long_lora_indices = zeroed.to(self.device) self.indices_len[:] = indices_len def _update_prefill_metadata(self, diff --git a/vllm/lora/punica_wrapper/utils.py b/vllm/lora/punica_wrapper/utils.py index 8430cb91865f..d22c29da1c61 100644 --- a/vllm/lora/punica_wrapper/utils.py +++ b/vllm/lora/punica_wrapper/utils.py @@ -8,7 +8,6 @@ if TYPE_CHECKING: # avoid circuit import from vllm.lora.layers import LoRAMapping - from vllm.lora.models import LongContextLoRAContext def compute_meta( @@ -49,9 +48,7 @@ def convert_mapping( vocab_size: int, extra_vocab_size: int, device: torch.device, - long_lora_context: Optional["LongContextLoRAContext"] = None, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, - Optional[torch.Tensor], list[int]]: +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, list[int]]: """Converts LoRAMapping to index tensors. Args: @@ -60,7 +57,6 @@ def convert_mapping( max_loras: Maximum number of LoRAs. vocab_size: Model vocab size. extra_vocab_size: Extra vocab size each LoRA can have. - long_lora_context: Passed if there are long context lora in a batch. Returns: A tuple of tensors: @@ -78,21 +74,14 @@ def convert_mapping( requests to embedding indices. First row is for embeddings added by the LoRAs, second row is for the LoRA.lora_a embeddings. - long_lora_indices: Tensor of shape [batch_size] mapping - requests to RoPE offsets and rot dims for long LoRAs. - None if long context lora doesn't exist. indices_len: List of lengths of the above tensors. It contains (base_indices, sampler_indices, sampler_indices_padded, - embeddings_indices, long_lora_indices). + embeddings_indices). 
""" index_mapping_indices: list[int] = list(mapping.index_mapping).copy() embedding_indices = index_mapping_indices.copy() lora_indices = index_mapping_indices.copy() - long_lora_offsets: Optional[torch.Tensor] = None - if long_lora_context: - long_lora_offsets = torch.zeros(len(index_mapping_indices), - device=device, - dtype=torch.long) + prompt_mapping: list[int] = [ lora_index_to_id.index(x) if x > 0 else -1 for x in mapping.prompt_mapping @@ -104,20 +93,13 @@ def convert_mapping( if index_mapping_indices[i] > 0 else -1) embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0 lora_indices[i] = lora_idx - if long_lora_context: - assert long_lora_offsets is not None - lora_offset: int = long_lora_context.offsets_by_lora_id.get( - index_mapping_indices[i], 0) - long_lora_offsets[i] = lora_offset indices_list: list[Union[list[int], torch.Tensor]] = [ index_mapping_indices, lora_indices, embedding_indices, ] - if long_lora_context: - assert long_lora_offsets is not None - indices_list.append(long_lora_offsets) + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, dtype=torch.long, @@ -136,11 +118,7 @@ def convert_mapping( sampler_indices_padded = torch.arange( 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) - long_lora_indices = None - long_lora_indices_len: Optional[int] = None - if long_lora_context: - long_lora_indices = indices[3] - long_lora_indices_len = long_lora_indices.shape[-1] + # Contain length of indices tensors. Used to index into each tensor. indices_len = [ base_indices.shape[-1], @@ -148,17 +126,11 @@ def convert_mapping( sampler_indices_padded.shape[-1], embeddings_indices.shape[-1], ] - if long_lora_indices_len is not None: - indices_len.append(long_lora_indices_len) - else: - # If long_lora doesn't exist,append None - indices_len.append(None) return ( base_indices, sampler_indices, sampler_indices_padded, embeddings_indices, - long_lora_indices, indices_len, ) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index 7148ffe14948..ab0a9fbd255d 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -22,7 +22,6 @@ # yapf conflicts with isort for this block # yapf: disable from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, - LinearScalingRotaryEmbeddingWithLoRA, LogitsProcessorWithLoRA, MergedColumnParallelLinearWithLoRA, MergedQKVParallelLinearWithLoRA, @@ -56,7 +55,6 @@ MergedColumnParallelLinearWithShardedLoRA, MergedQKVParallelLinearWithShardedLoRA, RowParallelLinearWithShardedLoRA, - LinearScalingRotaryEmbeddingWithLoRA, } diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 7a4af74cbeb1..248d2954f1ef 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -154,7 +154,7 @@ def add_dummy_lora(self, lora_request: LoRARequest, rank: int) -> bool: lora_request.lora_int_id) else: dummy_lora = self._adapter_manager.create_dummy_lora( - lora_request.lora_int_id, rank, 1, self.embedding_modules) + lora_request.lora_int_id, rank, self.embedding_modules) if self._cached_dummy_lora is None: self._cached_dummy_lora = dummy_lora return self._adapter_manager.add_adapter(dummy_lora) From 18e519ec8640ef66b70bb1b3ceb23e0bb883de0b Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 19 Jul 2025 17:17:16 +0800 Subject: [PATCH 16/57] [Bugfix] Fix ndarray video color from VideoAsset (#21064) Signed-off-by: Isotr0py <2037008807@qq.com> --- 
tests/multimodal/test_video.py | 103 +++++++++++++++++++++++++-------- tests/multimodal/utils.py | 46 +++++++++++++++ vllm/assets/video.py | 9 ++- 3 files changed, 130 insertions(+), 28 deletions(-) diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 897c9c33461a..05b7b84be7f3 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -1,14 +1,22 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import tempfile +from pathlib import Path + import numpy as np import numpy.typing as npt import pytest +from PIL import Image -from vllm import envs +from vllm.assets.base import get_vllm_public_assets +from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list from vllm.multimodal.image import ImageMediaIO from vllm.multimodal.video import (VIDEO_LOADER_REGISTRY, VideoLoader, VideoMediaIO) +from .utils import cosine_similarity, create_video_from_image, normalize_image + NUM_FRAMES = 10 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) @@ -59,30 +67,79 @@ def load_bytes(cls, return FAKE_OUTPUT_2 -def test_video_media_io_kwargs(): - envs.VLLM_VIDEO_LOADER_BACKEND = "assert_10_frames_1_fps" - imageio = ImageMediaIO() +def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch): + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps") + imageio = ImageMediaIO() - # Verify that different args pass/fail assertions as expected. - videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0}) - _ = videoio.load_bytes(b"test") - - videoio = VideoMediaIO( - imageio, **{ - "num_frames": 10, - "fps": 1.0, - "not_used": "not_used" - }) - _ = videoio.load_bytes(b"test") - - with pytest.raises(AssertionError, match="bad num_frames"): - videoio = VideoMediaIO(imageio, **{}) + # Verify that different args pass/fail assertions as expected. + videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0}) _ = videoio.load_bytes(b"test") - with pytest.raises(AssertionError, match="bad num_frames"): - videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0}) + videoio = VideoMediaIO( + imageio, **{ + "num_frames": 10, + "fps": 1.0, + "not_used": "not_used" + }) _ = videoio.load_bytes(b"test") - with pytest.raises(AssertionError, match="bad fps"): - videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0}) - _ = videoio.load_bytes(b"test") + with pytest.raises(AssertionError, match="bad num_frames"): + videoio = VideoMediaIO(imageio, **{}) + _ = videoio.load_bytes(b"test") + + with pytest.raises(AssertionError, match="bad num_frames"): + videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0}) + _ = videoio.load_bytes(b"test") + + with pytest.raises(AssertionError, match="bad fps"): + videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0}) + _ = videoio.load_bytes(b"test") + + +@pytest.mark.parametrize("is_color", [True, False]) +@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")]) +def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str): + """ + Test all functions that use OpenCV for video I/O return RGB format. + Both RGB and grayscale videos are tested. 
+ """ + image_path = get_vllm_public_assets(filename="stop_sign.jpg", + s3_prefix="vision_model_images") + image = Image.open(image_path) + with tempfile.TemporaryDirectory() as tmpdir: + if not is_color: + image_path = f"{tmpdir}/test_grayscale_image.png" + image = image.convert("L") + image.save(image_path) + # Convert to gray RGB for comparison + image = image.convert("RGB") + video_path = f"{tmpdir}/test_RGB_video.{ext}" + create_video_from_image( + image_path, + video_path, + num_frames=2, + is_color=is_color, + fourcc=fourcc, + ) + + frames = video_to_ndarrays(video_path) + for frame in frames: + sim = cosine_similarity(normalize_image(np.array(frame)), + normalize_image(np.array(image))) + assert np.sum(np.isnan(sim)) / sim.size < 0.001 + assert np.nanmean(sim) > 0.99 + + pil_frames = video_to_pil_images_list(video_path) + for frame in pil_frames: + sim = cosine_similarity(normalize_image(np.array(frame)), + normalize_image(np.array(image))) + assert np.sum(np.isnan(sim)) / sim.size < 0.001 + assert np.nanmean(sim) > 0.99 + + io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path)) + for frame in io_frames: + sim = cosine_similarity(normalize_image(np.array(frame)), + normalize_image(np.array(image))) + assert np.sum(np.isnan(sim)) / sim.size < 0.001 + assert np.nanmean(sim) > 0.99 diff --git a/tests/multimodal/utils.py b/tests/multimodal/utils.py index 23346509a06f..9a58292f9f4a 100644 --- a/tests/multimodal/utils.py +++ b/tests/multimodal/utils.py @@ -1,7 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import cv2 import numpy as np +import numpy.typing as npt from PIL import Image @@ -31,3 +33,47 @@ def random_audio( ): audio_len = rng.randint(min_len, max_len) return rng.rand(audio_len), sr + + +def create_video_from_image( + image_path: str, + video_path: str, + num_frames: int = 10, + fps: float = 1.0, + is_color: bool = True, + fourcc: str = "mp4v", +): + image = cv2.imread(image_path) + if not is_color: + # Convert to grayscale if is_color is False + image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + height, width = image.shape + else: + height, width, _ = image.shape + + video_writer = cv2.VideoWriter( + video_path, + cv2.VideoWriter_fourcc(*fourcc), + fps, + (width, height), + isColor=is_color, + ) + + for _ in range(num_frames): + video_writer.write(image) + + video_writer.release() + return video_path + + +def cosine_similarity(A: npt.NDArray, + B: npt.NDArray, + axis: int = -1) -> npt.NDArray: + """Compute cosine similarity between two vectors.""" + return (np.sum(A * B, axis=axis) / + (np.linalg.norm(A, axis=axis) * np.linalg.norm(B, axis=axis))) + + +def normalize_image(image: npt.NDArray) -> npt.NDArray: + """Normalize image to [0, 1] range.""" + return image.astype(np.float32) / 255.0 \ No newline at end of file diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 16412121cf0a..8ab0e9760be8 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -59,7 +59,9 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: if idx in frame_indices: # only decompress needed ret, frame = cap.retrieve() if ret: - frames.append(frame) + # OpenCV uses BGR format, we need to convert it to RGB + # for PIL and transformers compatibility + frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) frames = np.stack(frames) if len(frames) < num_frames: @@ -71,10 +73,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: 
str, num_frames: int = -1) -> list[Image.Image]: frames = video_to_ndarrays(path, num_frames) - return [ - Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) - for frame in frames - ] + return [Image.fromarray(frame) for frame in frames] def video_get_metadata(path: str) -> dict[str, Any]: From 59f935300c4818cb10db8a0efadb431a2f169506 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Sat, 19 Jul 2025 05:18:47 -0400 Subject: [PATCH 17/57] [BugFix] Fix potential cuda-graph IMA (#21196) Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/utils.py | 5 ----- vllm/v1/worker/gpu_model_runner.py | 7 ++++++- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 65c3baa6784f..fc8649d587ee 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -59,11 +59,6 @@ class CommonAttentionMetadata: block_table_tensor: torch.Tensor slot_mapping: torch.Tensor - def __post_init__(self): - # Fill unused with -1. Needed for reshape_and_cache in full cuda graph - # mode. - self.slot_mapping[self.num_actual_tokens:].fill_(-1) - M = TypeVar("M") diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 47b14d076ea6..a5c446731144 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -684,7 +684,7 @@ def _prepare_inputs( self.seq_lens[:num_reqs].copy_(self.seq_lens_cpu[:num_reqs], non_blocking=True) - # Fill unused with -1. Needed for reshape_and_cache + # Fill unused with 0 for full cuda graph mode. self.seq_lens[num_reqs:].fill_(0) # Note: pad query_start_loc to be non-decreasing, as kernels # like FlashAttention requires that @@ -704,6 +704,11 @@ def _prepare_inputs( blk_table = self.input_batch.block_table[kv_cache_group_id] blk_table_tensor = blk_table.get_device_tensor()[:num_reqs] slot_mapping = blk_table.slot_mapping[:total_num_scheduled_tokens] + + # Fill unused with -1. Needed for reshape_and_cache in full cuda + # graph mode. + blk_table.slot_mapping[total_num_scheduled_tokens:].fill_(-1) + common_attn_metadata = CommonAttentionMetadata( query_start_loc=self.query_start_loc[:num_reqs + 1], query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1], From 7d94577138e3d4c7bcfd781337ee1e5a2befa685 Mon Sep 17 00:00:00 2001 From: shixianc <49539556+shixianc@users.noreply.github.com> Date: Sat, 19 Jul 2025 02:32:36 -0700 Subject: [PATCH 18/57] Add torch golden impl for moe_align_block_size kernel test (#20653) Signed-off-by: Shixian Cui Co-authored-by: Shixian Cui --- .../kernels/moe/test_moe_align_block_size.py | 367 ++++++++++++++---- 1 file changed, 296 insertions(+), 71 deletions(-) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index e980422a7b97..12ef9e776c3a 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ -1,90 +1,315 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import itertools +"""Tests for the MOE align block size function. + +Run `pytest tests/kernels/moe/test_moe_align_block_size.py`. 
+""" + +from typing import Optional import pytest import torch -from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size_triton) - - -@pytest.mark.parametrize( - "block_size,num_tokens,topk,num_experts", - list( - itertools.product( - [32, 64, 128, 256], # block_size - [ - 1, - 3, - 7, - 16, - 256, - 2256, - 4096, - ], # num_tokens - [1, 4, 16, 64], # topk - [64, 160, 256, 257, 260, 264], # num_experts - )), -) -def test_moe_align_block_size_compare_implementations(block_size, num_tokens, - topk, num_experts): - topk_ids = torch.stack([ - torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk] - for _ in range(num_tokens) - ]) + moe_align_block_size) +from vllm.platforms import current_platform +from vllm.utils import round_up + +NUM_TOKENS = [1, 3, 7, 16, 256, 2256, 4096] +NUM_EXPERTS = [32, 160, 256, 257, 512] +TOP_KS = [1, 2, 16, 32] +BLOCK_SIZES = [32, 64, 128, 256] +current_platform.seed_everything(0) + + +def _group_tokens_by_expert( + sorted_ids: torch.Tensor, + expert_ids: torch.Tensor, + block_size: int, + valid_length: int, + total_tokens: int, +) -> dict: + num_blocks = valid_length // block_size + expert_tokens: dict[int, list[int]] = {} + + for block_idx in range(num_blocks): + expert_id = expert_ids[block_idx].item() + block_start = block_idx * block_size + block_end = min(block_start + block_size, valid_length) + + block_tokens = sorted_ids[block_start:block_end] + valid_tokens = block_tokens[block_tokens < total_tokens] + + if expert_id not in expert_tokens: + expert_tokens[expert_id] = [] + expert_tokens[expert_id].extend(valid_tokens.tolist()) + return expert_tokens + +def _verify_expert_level_sorting( + actual_sorted_ids: torch.Tensor, + golden_sorted_ids: torch.Tensor, + expert_ids: torch.Tensor, + block_size: int, + valid_length: int, + total_tokens: int, +): + """ + Verify that actual_sorted_ids follows the correct expert-level sorting. + The kerne limplementation may or may not preserve original token order + in topk_ids in the final sorted_ids however this does not impact quality. + """ + # Group tokens by expert from the golden implementation + golden_expert_tokens = _group_tokens_by_expert(golden_sorted_ids, + expert_ids, block_size, + valid_length, total_tokens) + + actual_expert_tokens = _group_tokens_by_expert(actual_sorted_ids, + expert_ids, block_size, + valid_length, total_tokens) + + assert set(golden_expert_tokens.keys()) == set( + actual_expert_tokens.keys()), ( + f"Expert IDs mismatch: golden={set(golden_expert_tokens.keys())}, " + f"actual={set(actual_expert_tokens.keys())}") + + for expert_id in golden_expert_tokens: + golden_tokens = torch.tensor(golden_expert_tokens[expert_id], + device=actual_sorted_ids.device) + actual_tokens = torch.tensor(actual_expert_tokens[expert_id], + device=actual_sorted_ids.device) + assert torch.equal( + torch.sort(golden_tokens)[0], + torch.sort(actual_tokens)[0]), ( + f"Expert {expert_id} token mismatch: " + f"golden={golden_expert_tokens[expert_id]}, " + f"actual={actual_expert_tokens[expert_id]}") + + +def torch_moe_align_block_size( + topk_ids: torch.Tensor, + block_size: int, + num_experts: int, + expert_map: Optional[torch.Tensor] = None, + pad_sorted_ids: bool = False, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Golden torch implementation of moe_align_block_size. 
+ + This function aligns the token distribution across experts to be compatible + with block size for matrix multiplication by sorting tokens by expert and + padding to block boundaries. + """ max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1) + if pad_sorted_ids: + max_num_tokens_padded = round_up(max_num_tokens_padded, block_size) + + flattened_token_indices = torch.arange(topk_ids.numel(), + device=topk_ids.device, + dtype=torch.int32) + flattened_expert_ids = topk_ids.flatten() + sorted_expert_ids, sort_indices = torch.sort(flattened_expert_ids, + stable=True) + sorted_token_indices = flattened_token_indices[sort_indices] + + expert_token_counts = torch.zeros(num_experts, + dtype=torch.int64, + device=topk_ids.device) + for expert_id in range(num_experts): + mask = sorted_expert_ids == expert_id + expert_token_counts[expert_id] = mask.sum() + + expert_padded_counts = torch.zeros(num_experts, + dtype=torch.int64, + device=topk_ids.device) + for expert_id in range(num_experts): + original_count = expert_token_counts[expert_id] + if original_count > 0: + expert_padded_counts[expert_id] = ( + (original_count + block_size - 1) // block_size) * block_size - sorted_ids_cuda = torch.empty((max_num_tokens_padded, ), - dtype=torch.int32, - device=topk_ids.device) - sorted_ids_cuda.fill_(topk_ids.numel()) - max_num_m_blocks = max_num_tokens_padded // block_size - expert_ids_cuda = torch.zeros((max_num_m_blocks, ), - dtype=torch.int32, - device=topk_ids.device) - num_tokens_post_pad_cuda = torch.empty((1), - dtype=torch.int32, - device=topk_ids.device) - - sorted_ids_triton = torch.empty_like(sorted_ids_cuda) - sorted_ids_triton.fill_(topk_ids.numel()) - expert_ids_triton = torch.zeros_like(expert_ids_cuda) - num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_cuda) - - ops.moe_align_block_size( - topk_ids, - num_experts, + sorted_token_ids = torch.full( + (max_num_tokens_padded, ), + topk_ids.numel(), + dtype=torch.int32, + device=topk_ids.device, + ) + max_num_blocks = (max_num_tokens_padded + block_size - 1) // block_size + expert_ids = torch.zeros(max_num_blocks, + dtype=torch.int32, + device=topk_ids.device) + + current_pos = 0 + current_block = 0 + for expert_id in range(num_experts): + expert_mask = sorted_expert_ids == expert_id + expert_tokens = sorted_token_indices[expert_mask] + num_expert_tokens = expert_tokens.shape[0] + + if num_expert_tokens > 0: + sorted_token_ids[current_pos:current_pos + + num_expert_tokens] = (expert_tokens) + + expert_blocks_needed = expert_padded_counts[expert_id] // block_size + expert_ids[current_block:current_block + + expert_blocks_needed] = (expert_id) + + current_pos += expert_padded_counts[expert_id] + current_block += expert_blocks_needed + + total_padded_tokens = expert_padded_counts.sum() + num_tokens_post_pad = torch.tensor([total_padded_tokens], + dtype=torch.int32, + device=topk_ids.device) + + if expert_map is not None: + expert_ids = expert_map[expert_ids] + return sorted_token_ids, expert_ids, num_tokens_post_pad + + +@pytest.mark.parametrize("m", NUM_TOKENS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("num_experts", NUM_EXPERTS) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("pad_sorted_ids", [False, True]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") +def test_moe_align_block_size(m: int, topk: int, num_experts: int, + block_size: int, pad_sorted_ids: bool): + """Test moe_align_block_size without expert mapping""" + 
topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32) + for i in range(m): + experts = torch.randperm(num_experts, device="cuda")[:topk] + topk_ids[i] = experts + + actual_sorted_ids, actual_expert_ids, actual_num_tokens = ( + moe_align_block_size( + topk_ids=topk_ids, + block_size=block_size, + num_experts=num_experts, + pad_sorted_ids=pad_sorted_ids, + )) + golden_sorted_ids, golden_expert_ids, golden_num_tokens = ( + torch_moe_align_block_size( + topk_ids=topk_ids, + block_size=block_size, + num_experts=num_experts, + pad_sorted_ids=pad_sorted_ids, + )) + + torch.testing.assert_close(actual_num_tokens, + golden_num_tokens, + atol=0, + rtol=0) + torch.testing.assert_close(actual_expert_ids, + golden_expert_ids, + atol=0, + rtol=0) + + # For sorted_token_ids, verify block-level correctness rather than exact + # order Tokens within each expert's blocks can be in any order, but expert + # regions must be correct + _verify_expert_level_sorting( + actual_sorted_ids, + golden_sorted_ids, + actual_expert_ids, block_size, - sorted_ids_cuda, - expert_ids_cuda, - num_tokens_post_pad_cuda, + actual_num_tokens.item(), + m * topk, ) - moe_align_block_size_triton( - topk_ids, - num_experts, + total_tokens = m * topk + assert actual_num_tokens.item() % block_size == 0, ( + "num_tokens_post_pad should be divisible by block_size") + assert actual_num_tokens.item() >= total_tokens, ( + "num_tokens_post_pad should be at least total_tokens") + valid_tokens = actual_sorted_ids[actual_sorted_ids < total_tokens] + assert len(valid_tokens) == total_tokens, ( + f"Should have exactly {total_tokens} valid tokens, " + f"got {len(valid_tokens)}") + assert (actual_expert_ids >= 0).all() and ( + actual_expert_ids + < num_experts).all(), "expert_ids should contain valid expert indices" + + +@pytest.mark.parametrize("m", [16, 32]) +@pytest.mark.parametrize("topk", [2, 4]) +@pytest.mark.parametrize("num_experts", [8]) +@pytest.mark.parametrize("block_size", [64]) +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") +def test_moe_align_block_size_with_expert_map(m: int, topk: int, + num_experts: int, + block_size: int): + """Test moe_align_block_size with expert mapping (EP scenario)""" + topk_ids = torch.zeros((m, topk), device="cuda", dtype=torch.int32) + for i in range(m): + experts = torch.randperm(num_experts, device="cuda")[:topk] + topk_ids[i] = experts + + expert_map = torch.full((num_experts, ), + -1, + device="cuda", + dtype=torch.int32) + local_experts = list(range(0, num_experts, 2)) + for i, expert_id in enumerate(local_experts): + expert_map[expert_id] = i + + actual_sorted_ids, actual_expert_ids, actual_num_tokens = ( + moe_align_block_size( + topk_ids=topk_ids, + block_size=block_size, + num_experts=num_experts, + expert_map=expert_map, + )) + golden_sorted_ids, golden_expert_ids, golden_num_tokens = ( + torch_moe_align_block_size( + topk_ids=topk_ids, + block_size=block_size, + num_experts=num_experts, + expert_map=expert_map, + )) + + torch.testing.assert_close(actual_num_tokens, + golden_num_tokens, + atol=0, + rtol=0) + torch.testing.assert_close(actual_expert_ids, + golden_expert_ids, + atol=0, + rtol=0) + _verify_expert_level_sorting( + actual_sorted_ids, + golden_sorted_ids, + actual_expert_ids, block_size, - sorted_ids_triton, - expert_ids_triton, - num_tokens_post_pad_triton, + actual_num_tokens.item(), + m * topk, ) - assert torch.allclose(expert_ids_cuda, expert_ids_triton), ( - f"Expert IDs mismatch for block_size={block_size}, " - f"num_tokens={num_tokens}, 
topk={topk}\n" - f"CUDA expert_ids: {expert_ids_cuda}\n" - f"Triton expert_ids: {expert_ids_triton}") - assert torch.allclose( - num_tokens_post_pad_cuda, num_tokens_post_pad_triton), ( - f"Num tokens post pad mismatch for block_size={block_size}, " - f"num_tokens={num_tokens}, topk={topk}\n" - f"CUDA num_tokens_post_pad: {num_tokens_post_pad_cuda}\n" - f"Triton num_tokens_post_pad: {num_tokens_post_pad_triton}") +def test_moe_align_block_size_deterministic(): + m, topk, num_experts, block_size = 128, 2, 32, 64 + + torch.manual_seed(42) + topk_ids = torch.randint(0, + num_experts, (m, topk), + device="cuda", + dtype=torch.int32) + # expect the results to be reproducible + results = [] + for _ in range(5): + sorted_ids, expert_ids, num_tokens = moe_align_block_size( + topk_ids=topk_ids, block_size=block_size, num_experts=num_experts) + results.append( + (sorted_ids.clone(), expert_ids.clone(), num_tokens.clone())) -if __name__ == "__main__": - pytest.main([__file__]) + for i in range(1, len(results)): + assert torch.equal( + results[0][0], + results[i][0]), ("sorted_ids should be deterministic") + assert torch.equal( + results[0][1], + results[i][1]), ("expert_ids should be deterministic") + assert torch.equal( + results[0][2], + results[i][2]), ("num_tokens should be deterministic") From 6d0734c562e759fdb7076d762222b3881e62ab1f Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Sat, 19 Jul 2025 02:33:01 -0700 Subject: [PATCH 19/57] [NVIDIA] Add SM100 Flashinfer MoE blockscale fp8 backend for low latency (#20645) Signed-off-by: kaixih Signed-off-by: mgoin Co-authored-by: mgoin --- vllm/envs.py | 11 +- .../model_executor/layers/fused_moe/config.py | 2 +- .../layers/fused_moe/fused_moe.py | 100 +++++++++++++++++- .../model_executor/layers/quantization/fp8.py | 82 ++++++++++---- .../layers/quantization/modelopt.py | 9 +- vllm/utils/flashinfer.py | 14 ++- 6 files changed, 187 insertions(+), 31 deletions(-) diff --git a/vllm/envs.py b/vllm/envs.py index 261cc7855b70..0896ae3a96c7 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -119,7 +119,8 @@ VLLM_TPU_BUCKET_PADDING_GAP: int = 0 VLLM_TPU_MOST_MODEL_LEN: Optional[int] = None VLLM_USE_DEEP_GEMM: bool = False - VLLM_USE_FLASHINFER_MOE: bool = False + VLLM_USE_FLASHINFER_MOE_FP8: bool = False + VLLM_USE_FLASHINFER_MOE_FP4: bool = False VLLM_XGRAMMAR_CACHE_MB: int = 0 VLLM_MSGPACK_ZERO_COPY_THRESHOLD: int = 256 VLLM_ALLOW_INSECURE_SERIALIZATION: bool = False @@ -854,9 +855,13 @@ def get_vllm_port() -> Optional[int]: "VLLM_USE_DEEP_GEMM": lambda: bool(int(os.getenv("VLLM_USE_DEEP_GEMM", "0"))), + # Allow use of FlashInfer MoE kernels for fused moe ops. + "VLLM_USE_FLASHINFER_MOE_FP8": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP8", "0"))), + # Allow use of FlashInfer CUTLASS kernels for fused moe ops. - "VLLM_USE_FLASHINFER_MOE": - lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE", "0"))), + "VLLM_USE_FLASHINFER_MOE_FP4": + lambda: bool(int(os.getenv("VLLM_USE_FLASHINFER_MOE_FP4", "0"))), # Control the cache sized used by the xgrammar compiler. The default # of 512 MB should be enough for roughly 1000 JSON schemas. 
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 9bebb6a65fce..51c421bd228f 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -191,7 +191,7 @@ def use_deepep_ll_kernels(self): @property def use_flashinfer_cutlass_kernels(self): - return (envs.VLLM_USE_FLASHINFER_MOE + return (envs.VLLM_USE_FLASHINFER_MOE_FP4 and has_flashinfer_cutlass_fused_moe()) @staticmethod diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index aec5d7b252e3..c412f695ae76 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP) from vllm.model_executor.layers.fused_moe.utils import ( - _resize_cache, moe_kernel_quantize_input) + _resize_cache, moe_kernel_quantize_input, per_token_group_quant_fp8) from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( dequant_mxfp4) from vllm.platforms import current_platform @@ -1061,6 +1061,104 @@ def inplace_fused_experts_fake( ) +def next_positive_power_of_2(x: int) -> int: + if x < 1: + return 1 + return 1 << (x - 1).bit_length() + + +def _get_tile_tokens_dim(num_tokens, top_k, num_experts): + # Guess tokens per expert assuming perfect expert distribution first. + num_tokens_per_expert = (num_tokens * top_k) // num_experts + # And pad the number to the next power of 2. + tile_tokens_dim = next_positive_power_of_2(num_tokens_per_expert) + # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel. + tile_tokens_dim = min(max(tile_tokens_dim, 8), 64) + return tile_tokens_dim + + +def flashinfer_fused_moe_blockscale_fp8( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: list[int], + routed_scaling: float = 1.0) -> torch.Tensor: + from vllm.utils.flashinfer import flashinfer_trtllm_fp8_block_scale_moe + assert top_k <= global_num_experts + assert top_k <= 8 + assert topk_group <= 4 + assert global_num_experts > num_expert_group + assert global_num_experts % num_expert_group == 0 + assert global_num_experts % 4 == 0 + assert top_k < (topk_group * global_num_experts / num_expert_group) + assert block_shape == [128, 128] + + a_q, a_sf = per_token_group_quant_fp8(x, block_shape[1]) + # NOTE: scales of hidden states have to be transposed! 
+ a_sf_t = a_sf.t().contiguous() + return flashinfer_trtllm_fp8_block_scale_moe( + routing_logits=routing_logits, + routing_bias=routing_bias, + hidden_states=a_q, + hidden_states_scale=a_sf_t, + gemm1_weights=w13_weight, + gemm1_weights_scale=w13_weight_scale_inv, + gemm2_weights=w2_weight, + gemm2_weights_scale=w2_weight_scale_inv, + num_experts=global_num_experts, + top_k=top_k, + n_group=num_expert_group, + topk_group=topk_group, + intermediate_size=intermediate_size, + local_expert_offset=expert_offset, + local_num_experts=local_num_experts, + routed_scaling_factor=routed_scaling, + tile_tokens_dim=_get_tile_tokens_dim(x.shape[0], top_k, + global_num_experts), + routing_method_type=2, # DeepSeek-styled routing method + ) + + +def flashinfer_fused_moe_blockscale_fp8_fake( + routing_logits: torch.Tensor, + routing_bias: torch.Tensor, + x: torch.Tensor, + w13_weight: torch.Tensor, + w13_weight_scale_inv: torch.Tensor, + w2_weight: torch.Tensor, + w2_weight_scale_inv: torch.Tensor, + global_num_experts: int, + top_k: int, + num_expert_group: int, + topk_group: int, + intermediate_size: int, + expert_offset: int, + local_num_experts: int, + block_shape: list[int], + routed_scaling: float = 1.0) -> torch.Tensor: + return torch.empty_like(x) + + +direct_register_custom_op( + op_name="flashinfer_fused_moe_blockscale_fp8", + op_func=flashinfer_fused_moe_blockscale_fp8, + mutates_args=[], + fake_impl=flashinfer_fused_moe_blockscale_fp8_fake, + tags=(torch.Tag.needs_fixed_stride_order, ), +) + + def outplace_fused_experts( hidden_states: torch.Tensor, w1: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 824dfe15ae25..35d7545d8c6a 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -43,6 +43,7 @@ from vllm.scalar_type import scalar_types from vllm.utils import has_deep_gemm from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used +from vllm.utils.flashinfer import has_flashinfer_moe if TYPE_CHECKING: from vllm.model_executor.models.utils import WeightsMapper @@ -52,6 +53,11 @@ logger = init_logger(__name__) +def _swap_w13_to_w31(x: torch.Tensor) -> torch.Tensor: + return x.reshape(-1, 2, x.shape[-2] // 2, + x.shape[-1]).flip(dims=[1]).reshape(x.shape) + + def _is_col_major(x: torch.Tensor) -> bool: assert x.dim() == 3 b, m, n = x.shape @@ -473,6 +479,11 @@ def __init__(self, quant_config: Fp8Config): self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None + self.flashinfer_moe_enabled = False + if envs.VLLM_USE_FLASHINFER_MOE_FP8 and has_flashinfer_moe(): + logger.info_once( + "Using FlashInfer MoE FP8 kernels for Fp8MoEMethod.") + self.flashinfer_moe_enabled = True # For GPUs that lack FP8 hardware support, we can leverage the Marlin # kernel for fast weight-only FP8 quantization self.use_marlin = (not current_platform.has_device_capability(89) @@ -674,6 +685,14 @@ def process_weights_after_loading(self, layer: Module) -> None: normalize_e4m3fn_to_e4m3fnuz( layer.w2_weight, layer.w2_weight_scale_inv, layer.w2_input_scale) + elif self.flashinfer_moe_enabled: + # NOTE: weights have to be swapped since the activation is + # applied on different half for flashinfer vs vllm + w13_weight = _swap_w13_to_w31(layer.w13_weight.data) + w13_weight_scale_inv = _swap_w13_to_w31( + layer.w13_weight_scale_inv.data) + w2_weight = layer.w2_weight.data + w2_weight_scale_inv = layer.w2_weight_scale_inv.data else: w13_weight = 
layer.w13_weight.data w13_weight_scale_inv = layer.w13_weight_scale_inv.data @@ -915,25 +934,25 @@ def apply( assert logical_to_physical_map is not None assert logical_replica_count is not None assert isinstance(layer, FusedMoE) - - topk_weights, topk_ids = FusedMoE.select_experts( - hidden_states=x, - router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias, - indices_type=self.topk_indices_dtype, - enable_eplb=enable_eplb, - expert_map=expert_map, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, - ) + if not self.flashinfer_moe_enabled: + topk_weights, topk_ids = FusedMoE.select_experts( + hidden_states=x, + router_logits=router_logits, + use_grouped_topk=use_grouped_topk, + top_k=top_k, + renormalize=renormalize, + topk_group=topk_group, + num_expert_group=num_expert_group, + custom_routing_function=custom_routing_function, + scoring_func=scoring_func, + e_score_correction_bias=e_score_correction_bias, + indices_type=self.topk_indices_dtype, + enable_eplb=enable_eplb, + expert_map=expert_map, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) if self.rocm_aiter_moe_enabled: from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 @@ -971,6 +990,31 @@ def apply( apply_router_weight_on_input=apply_router_weight_on_input, global_num_experts=global_num_experts, expert_map=expert_map) + elif self.flashinfer_moe_enabled: + # Currently only work with DS models + assert self.block_quant + assert (renormalize and use_grouped_topk + and scoring_func == 'sigmoid' + and custom_routing_function is None) + assert activation == "silu" + return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8( + routing_logits=router_logits.to(torch.float32), + routing_bias=e_score_correction_bias, + x=x, + w13_weight=layer.w13_weight, + w13_weight_scale_inv=layer.w13_weight_scale_inv, + w2_weight=layer.w2_weight, + w2_weight_scale_inv=layer.w2_weight_scale_inv, + global_num_experts=global_num_experts, + top_k=top_k, + num_expert_group=num_expert_group, + topk_group=topk_group, + intermediate_size=layer.intermediate_size_per_partition, + expert_offset=layer.ep_rank * layer.local_num_experts, + local_num_experts=layer.local_num_experts, + block_shape=self.quant_config.weight_block_size, + routed_scaling=1.0, + ) else: return self.fused_experts( hidden_states=x, diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 3807899fc3e5..20def70d1976 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -721,7 +721,7 @@ def __init__(self, quant_config: ModelOptNvFp4Config): self.use_marlin = False self.allow_flashinfer_cutlass = False - if envs.VLLM_USE_FLASHINFER_MOE: + if envs.VLLM_USE_FLASHINFER_MOE_FP4: if self.cutlass_nvfp4_supported and current_platform.is_cuda() \ and current_platform.is_device_capability(100): logger.info_once( @@ -800,10 +800,9 @@ def select_gemm_impl(self, prepare_finalize, assert moe.dp_size > 1 logger.debug_once("Using CutlassExpertsFp4") # Currently CutlassExpertsFp4 doesn't support DP - raise ValueError( - 
"CutlassExpertsFp4 doesn't support DP. " - "Use flashinfer CUTLASS FusedMoE(VLLM_USE_FLASHINFER_MOE)" - " backend instead.") + raise ValueError("CutlassExpertsFp4 doesn't support DP. " + "Use flashinfer CUTLASS FusedMoE backend instead " + "(set VLLM_USE_FLASHINFER_MOE_FP4=1)") return experts diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index dbd2dc393046..fd8b384a616f 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -64,6 +64,8 @@ def wrapper(*args, **kwargs): # Create lazy wrappers for each function +flashinfer_trtllm_fp8_block_scale_moe = _lazy_import_wrapper( + "flashinfer.fused_moe", "trtllm_fp8_block_scale_moe") flashinfer_cutlass_fused_moe = _lazy_import_wrapper("flashinfer.fused_moe", "cutlass_fused_moe") fp4_quantize = _lazy_import_wrapper("flashinfer", "fp4_quantize") @@ -77,10 +79,16 @@ def wrapper(*args, **kwargs): fallback_fn=lambda *args, **kwargs: contextlib.nullcontext()) +@functools.cache +def has_flashinfer_moe() -> bool: + """Return ``True`` if FlashInfer MoE module is available.""" + return importlib.util.find_spec("flashinfer.fused_moe") is not None + + @functools.cache def has_flashinfer_cutlass_fused_moe() -> bool: """Return ``True`` if FlashInfer CUTLASS fused MoE is available.""" - if not has_flashinfer(): + if not has_flashinfer_moe(): return False # Check if all required functions are available @@ -99,9 +107,11 @@ def has_flashinfer_cutlass_fused_moe() -> bool: __all__ = [ "has_flashinfer", - "has_flashinfer_cutlass_fused_moe", + "flashinfer_trtllm_fp8_block_scale_moe", "flashinfer_cutlass_fused_moe", "fp4_quantize", "fp4_swizzle_blockscale", "autotune", + "has_flashinfer_moe", + "has_flashinfer_cutlass_fused_moe", ] From b3d82108e7fdd98c781e7330335e3b4b0c7c0de5 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Sat, 19 Jul 2025 02:40:38 -0700 Subject: [PATCH 20/57] [Bugfix][Frontend] Fix openai CLI arg `middleware` (#21220) Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/entrypoints/openai/test_cli_args.py | 10 ++++++++++ vllm/entrypoints/openai/cli_args.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 504fd72aa4ae..b20838956d72 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -153,3 +153,13 @@ def test_chat_template_validation_for_sad_paths(serve_parser): args = serve_parser.parse_args(args=["--chat-template", "does/not/exist"]) with pytest.raises(ValueError): validate_parsed_serve_args(args) + + +@pytest.mark.parametrize( + "cli_args, expected_middleware", + [(["--middleware", "middleware1", "--middleware", "middleware2" + ], ["middleware1", "middleware2"]), ([], [])]) +def test_middleware(serve_parser, cli_args, expected_middleware): + """Ensure multiple middleware args are parsed properly""" + args = serve_parser.parse_args(args=cli_args) + assert args.middleware == expected_middleware diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 6456d009b957..28857f8caef8 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -215,6 +215,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: # Special case: Middleware needs append action frontend_kwargs["middleware"]["action"] = "append" + frontend_kwargs["middleware"]["type"] = str + if "nargs" in frontend_kwargs["middleware"]: + del 
frontend_kwargs["middleware"]["nargs"] + frontend_kwargs["middleware"]["default"] = [] # Special case: Tool call parser shows built-in options. valid_tool_parsers = list(ToolParserManager.tool_parsers.keys()) From e3a0e43d7f98fdd9631e5129005473eb25b98d7b Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 19 Jul 2025 20:13:55 +0800 Subject: [PATCH 21/57] [bugfix] Fix auto thread-binding when world_size > 1 in CPU backend and refactor code (#21032) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 4 +- docs/getting_started/installation/cpu.md | 10 +- requirements/cpu.txt | 2 - vllm/envs.py | 5 +- vllm/platforms/cpu.py | 64 ++++++ vllm/v1/worker/cpu_model_runner.py | 7 +- vllm/v1/worker/cpu_worker.py | 202 ++++++------------ 7 files changed, 144 insertions(+), 150 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index afe3e4b7ef69..e3d47a0e6c16 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -24,8 +24,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md index 14c9984487f5..d77e7383650c 100644 --- a/docs/getting_started/installation/cpu.md +++ b/docs/getting_started/installation/cpu.md @@ -94,8 +94,8 @@ Currently, there are no pre-built CPU wheels. ## Related runtime environment variables - `VLLM_CPU_KVCACHE_SPACE`: specify the KV Cache size (e.g, `VLLM_CPU_KVCACHE_SPACE=40` means 40 GiB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. Default value is `0`. -- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads. 
For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node. By setting to `all`, the OpenMP threads of each rank uses all CPU cores available on the system. Default value is `auto`. -- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `0`. +- `VLLM_CPU_OMP_THREADS_BIND`: specify the CPU cores dedicated to the OpenMP threads, can be set as CPU id lists or `auto` (by default). For example, `VLLM_CPU_OMP_THREADS_BIND=0-31` means there will be 32 OpenMP threads bound on 0-31 CPU cores. `VLLM_CPU_OMP_THREADS_BIND=0-31|32-63` means there will be 2 tensor parallel processes, 32 OpenMP threads of rank0 are bound on 0-31 CPU cores, and the OpenMP threads of rank1 are bound on 32-63 CPU cores. By setting to `auto`, the OpenMP threads of each rank are bound to the CPU cores in each NUMA node respectively. +- `VLLM_CPU_NUM_OF_RESERVED_CPU`: specify the number of CPU cores which are not dedicated to the OpenMP threads for each rank. The variable only takes effect when VLLM_CPU_OMP_THREADS_BIND is set to `auto`. Default value is `None`. If the value is not set and use `auto` thread binding, no CPU will be reserved for `world_size == 1`, 1 CPU per rank will be reserved for `world_size > 1`. - `VLLM_CPU_MOE_PREPACK` (x86 only): whether to use prepack for MoE layer. This will be passed to `ipex.llm.modules.GatedMLPMOE`. Default is `1` (True). On unsupported CPUs, you might need to set this to `0` (False). - `VLLM_CPU_SGL_KERNEL` (x86 only, Experimental): whether to use small-batch optimized kernels for linear layer and MoE layer, especially for low-latency requirements like online serving. The kernels require AMX instruction set, BFloat16 weight type and weight shapes divisible by 32. Default is `0` (False). @@ -123,9 +123,13 @@ export VLLM_CPU_NUM_OF_RESERVED_CPU=1 vllm serve facebook/opt-125m --dtype=bfloat16 ``` +Note, it is recommended to manually reserve 1 CPU for vLLM front-end process when `world_size == 1`. + ### How to decide `VLLM_CPU_OMP_THREADS_BIND`? -- Bind each OpenMP thread to a dedicated physical CPU core respectively, or use auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: +- Default `auto` thread-binding is recommended for most cases. Ideally, each OpenMP thread will be bound to a dedicated physical core respectively, threads of each rank will be bound to a same NUMA node respectively, and 1 CPU per rank will be reserved for other vLLM components when `world_size > 1`. If have any performance problems or unexpected binding behaviours, please try to bind threads as following. + +- On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores: ??? 
console "Commands" diff --git a/requirements/cpu.txt b/requirements/cpu.txt index df3a3393563a..d80354342bc2 100644 --- a/requirements/cpu.txt +++ b/requirements/cpu.txt @@ -24,6 +24,4 @@ datasets # for benchmark scripts # Intel Extension for PyTorch, only for x86_64 CPUs intel-openmp==2024.2.1; platform_machine == "x86_64" intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218 -py-libnuma; platform_system != "Darwin" -psutil; platform_system != "Darwin" triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile. diff --git a/vllm/envs.py b/vllm/envs.py index 0896ae3a96c7..c5f97de807a7 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -44,7 +44,7 @@ VLLM_PP_LAYER_PARTITION: Optional[str] = None VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" - VLLM_CPU_NUM_OF_RESERVED_CPU: int = 0 + VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None VLLM_CPU_MOE_PREPACK: bool = True VLLM_CPU_SGL_KERNEL: bool = False VLLM_XLA_CACHE_PATH: str = os.path.join(VLLM_CACHE_ROOT, "xla_cache") @@ -442,7 +442,8 @@ def get_vllm_port() -> Optional[int]: # (CPU backend only) CPU cores not used by OMP threads . # Those CPU cores will not be used by OMP threads of a rank. "VLLM_CPU_NUM_OF_RESERVED_CPU": - lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")), + lambda: int(os.getenv("VLLM_CPU_NUM_OF_RESERVED_CPU", "0")) + if "VLLM_CPU_NUM_OF_RESERVED_CPU" in os.environ else None, # (CPU backend only) whether to use prepack for MoE layer. This will be # passed to ipex.llm.modules.GatedMLPMOE. On unsupported CPUs, you might diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index a0aa981f951a..70c339c9bc98 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json import os import platform +import subprocess import sys +from dataclasses import dataclass from importlib.util import find_spec from typing import TYPE_CHECKING, Optional @@ -31,6 +34,35 @@ def get_max_threads(pid=0): raise NotImplementedError("Unsupported OS") +@dataclass +class LogicalCPUInfo: + id: int = -1 + physical_core: int = -1 + numa_node: int = -1 + + @classmethod + def _int(cls, value: str) -> int: + try: + int_value = int(value) + except Exception: + int_value = -1 + return int_value + + @staticmethod + def json_decoder(obj_dict: dict): + id = obj_dict.get("cpu") + physical_core = obj_dict.get("core") + numa_node = obj_dict.get("node") + + if not (id is None or physical_core is None or numa_node is None): + return LogicalCPUInfo( + id=LogicalCPUInfo._int(id), + physical_core=LogicalCPUInfo._int(physical_core), + numa_node=LogicalCPUInfo._int(numa_node)) + else: + return obj_dict + + class CpuPlatform(Platform): _enum = PlatformEnum.CPU device_name: str = "cpu" @@ -240,6 +272,38 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: vllm_config.scheduler_config.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS) + @classmethod + def get_allowed_cpu_memory_node_list( + cls) -> tuple[list[int], list[LogicalCPUInfo]]: + assert platform.system() == "Linux" + + # Init LogicalCPUInfo from lscpu + lscpu_output = subprocess.check_output("lscpu -J -e=CPU,CORE,NODE", + shell=True, + text=True) + logical_cpu_list: list[LogicalCPUInfo] = json.loads( + lscpu_output, 
object_hook=LogicalCPUInfo.json_decoder)['cpus'] + + # Filter CPUs with invalid attributes + logical_cpu_list = [ + x for x in logical_cpu_list + if -1 not in (x.id, x.physical_core, x.numa_node) + ] + + # Filter allowed CPUs + allowed_cpu_id_list = os.sched_getaffinity(0) + logical_cpu_list = [ + x for x in logical_cpu_list if x.id in allowed_cpu_id_list + ] + + # Get allowed NUMA nodes + allowed_numa_nodes = set() + for x in logical_cpu_list: + allowed_numa_nodes.add(x.numa_node) # type: ignore + allowed_numa_nodes_list = sorted(allowed_numa_nodes) + + return allowed_numa_nodes_list, logical_cpu_list + @classmethod def is_pin_memory_available(cls) -> bool: logger.warning("Pin memory is not supported on CPU.") diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py index 136a9f08e829..ca94ac8c6054 100644 --- a/vllm/v1/worker/cpu_model_runner.py +++ b/vllm/v1/worker/cpu_model_runner.py @@ -45,9 +45,10 @@ def replace_tensor(obj: Any, cpu_attr_name: str, if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor): replace_tensor(self.input_batch, k, k[:-11]) - for k, v in vars(self.input_batch.block_table).items(): - if k.endswith("_cpu") and isinstance(v, torch.Tensor): - replace_tensor(self.input_batch.block_table, k, k[:-4]) + for block_table in self.input_batch.block_table.block_tables: + for k, v in vars(block_table).items(): + if k.endswith("_cpu") and isinstance(v, torch.Tensor): + replace_tensor(block_table, k, k[:-4]) def load_model(self, eep_scale_up: bool = False) -> None: logger.info("Starting to load model %s...", self.model_config.model) diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index d31991b5b363..2dc28d93049a 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from importlib import util -from typing import Optional +import platform +from typing import Callable, Optional import torch @@ -12,21 +12,14 @@ from vllm.logger import init_logger from vllm.model_executor.utils import set_random_seed from vllm.platforms import CpuArchEnum, current_platform +from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo from vllm.sequence import IntermediateTensors -from vllm.utils import PlaceholderModule from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import (Worker, init_worker_distributed_environment) -try: - import psutil - from numa import info -except ImportError: - psutil = PlaceholderModule("psutil") # type: ignore[assignment] - numa = PlaceholderModule("numa") # type: ignore[assignment] - logger = init_logger(__name__) @@ -45,20 +38,21 @@ def __init__(self, is_driver_worker=is_driver_worker) self.parallel_config.disable_custom_all_reduce = True - self.manually_bind_threads_suggestion = ( - "To get better performance, please try to manually bind threads.") def init_device(self): # Setup OpenMP threads affinity. 
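        # [Editor's illustration, not part of this patch]
        # VLLM_CPU_OMP_THREADS_BIND accepts "auto" (the default) or explicit
        # per-rank CPU id ranges separated by "|", for example:
        #     VLLM_CPU_OMP_THREADS_BIND="0-31|32-63"
        # The split("|")[self.rank] in the non-"auto" branch below binds rank 0
        # to CPUs 0-31 and rank 1 to CPUs 32-63, while "auto" derives the list
        # from lscpu via _get_autobind_cpu_ids() on Linux x86/POWER.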
omp_cpuids = envs.VLLM_CPU_OMP_THREADS_BIND - self.local_omp_cpuid = "all" - if omp_cpuids == "auto": + if omp_cpuids == "auto" and platform.system() == "Linux": if current_platform.get_cpu_architecture() == CpuArchEnum.POWERPC: - self.local_omp_cpuid = ( - self.get_cpus_id_binding_based_on_numa_nodes_ppc64le()) + # For POWERPC SMT-8/4/2 + self.local_omp_cpuid = self._get_autobind_cpu_ids( + lambda cpus: [cpu for cpu in cpus if cpu.id % 8 < 4]) + elif current_platform.get_cpu_architecture() == CpuArchEnum.X86: + # For x86 SMT-2, use 1 CPU per core + self.local_omp_cpuid = self._get_autobind_cpu_ids( + lambda cpus: cpus[-1:]) else: - self.local_omp_cpuid = ( - self.get_cpus_id_binding_based_on_numa_nodes()) + self.local_omp_cpuid = "all" else: self.local_omp_cpuid = omp_cpuids.split("|")[self.rank] @@ -122,126 +116,58 @@ def execute_model( assert isinstance(output, ModelRunnerOutput) return output if self.is_driver_worker else None - def warn_inability_to_detect_numa(self) -> None: - logger.warning( - "Auto thread-binding failed due to the " - "inability to detect numa nodes. %s", - self.manually_bind_threads_suggestion) - - def warn_lack_of_numa_and_psutil(self) -> None: - logger.warning( - "Auto thread-binding failed due to " - "the lack of package numa and psutil. %s", - self.manually_bind_threads_suggestion) - - def warn_world_size_too_large(self, world_size: int, - node_to_cpus_len: int) -> None: - logger.warning( - "Auto thread-binding failed due to " - "world size: %d being larger than " - "allowed NUMA nodes number: %d. %s", world_size, node_to_cpus_len, - self.manually_bind_threads_suggestion) - - def get_cpus_allow_list_and_numa_size(self): - cpus_allow_list = psutil.Process().cpu_affinity() - numa_size = info.get_num_configured_nodes() - return cpus_allow_list, numa_size - - def auto_thread_binding_based_on_numa_nodes(self, world_size: int, - rank_to_cpus: str) -> str: - cpu_count = psutil.cpu_count(logical=False) - cpus_allow_list, numa_size = self.get_cpus_allow_list_and_numa_size() - if not numa_size: - self.warn_inability_to_detect_numa() - return rank_to_cpus - - cpu_count_per_numa = cpu_count // numa_size - num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, - cpu_count_per_numa // 2) - - node_to_cpus = [] - for i in range(numa_size): - node_intersect = set( - info.node_to_cpus(i)).intersection(cpus_allow_list) - if bool(node_intersect): - node_to_cpus.append(list(node_intersect)) - - node_to_cpus_len = len(node_to_cpus) - if world_size > node_to_cpus_len: - self.warn_world_size_too_large(world_size, node_to_cpus_len) - else: - end = cpu_count_per_numa - num_of_reserved_cpu - rank_to_cpus_list = node_to_cpus[self.rank][:end] - rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) - logger.info("auto thread-binding list: %s", rank_to_cpus) - return rank_to_cpus - - def libnuma_and_psutil_found(self) -> bool: - libnuma_found = util.find_spec("numa") is not None - psutil_found = util.find_spec("psutil") is not None - - return libnuma_found and psutil_found - - def get_cpus_id_binding_based_on_numa_nodes(self) -> str: - """Return CPUs id binding based on NUMA nodes. 
+ def _get_autobind_cpu_ids( + self, cpu_selector: Callable[[list[LogicalCPUInfo]], + list[LogicalCPUInfo]] + ) -> str: """ - rank_to_cpus = self.local_omp_cpuid - # Setup OpenMP thread affinity based on NUMA nodes automatically - world_size = self.vllm_config.parallel_config.world_size - if self.libnuma_and_psutil_found(): - rank_to_cpus = self.auto_thread_binding_based_on_numa_nodes( - world_size, rank_to_cpus) - else: - self.warn_lack_of_numa_and_psutil() - return rank_to_cpus - - def select_threads_per_power_core(self, - node_cpu_ids: list[int]) -> list[int]: - return [cpu for cpu in node_cpu_ids if cpu % 8 < 4] - - def auto_thread_binding_based_on_numa_nodes_ppc64le( - self, world_size: int, rank_to_cpus: str) -> str: - cpus_allow_list, numa_size = self.get_cpus_allow_list_and_numa_size() - if not numa_size: - self.warn_inability_to_detect_numa() - return rank_to_cpus - - node_to_cpus = [] - for i in range(numa_size): - node_intersect = set( - info.node_to_cpus(i)).intersection(cpus_allow_list) - if bool(node_intersect): - node_to_cpus.append(sorted(list(node_intersect))) - - node_to_cpus_len = len(node_to_cpus) - if world_size > node_to_cpus_len: - self.warn_world_size_too_large(world_size, node_to_cpus_len) - else: - node_cpus_this_rank = node_to_cpus[self.rank] - node_cpus_this_rank = self.select_threads_per_power_core( - node_cpus_this_rank) - cpu_count_per_numa = len(node_cpus_this_rank) - num_of_reserved_cpu = min(envs.VLLM_CPU_NUM_OF_RESERVED_CPU, - cpu_count_per_numa // 2) - end = cpu_count_per_numa - num_of_reserved_cpu - rank_to_cpus_list = node_cpus_this_rank[:end] - rank_to_cpus = ','.join(str(x) for x in rank_to_cpus_list) - logger.info("ppc64le thread-binding list: %s", rank_to_cpus) - return rank_to_cpus - - def get_cpus_id_binding_based_on_numa_nodes_ppc64le(self) -> str: - """ - Power (ppc64le) specific: Selects a subset of threads per core for - each NUMA node.This is robust to SMT mode (SMT-8, SMT-4, etc) - because the OS only exposes available threads.This maximizes - performance by avoiding oversubscription of logical CPUs on Power. + Return CPU ids to bind based on NUMA nodes. + Currently for rank N, only CPU ids on the N-th node in available NUMA + node list will be selected. + Args: + cpu_selector: a callable object to select CPUs from a CPU list + of a physical core. The input is a LogicalCPUInfo list, sorted by + the LogicalCPUInfo.id. A selected LogicalCPUInfo list should be + returned. """ - rank_to_cpus = self.local_omp_cpuid - world_size = self.vllm_config.parallel_config.world_size - if self.libnuma_and_psutil_found(): - rank_to_cpus = self.auto_thread_binding_based_on_numa_nodes_ppc64le( - world_size, rank_to_cpus) - else: - self.warn_lack_of_numa_and_psutil() - return rank_to_cpus + allowed_numa_nodes, logical_cpu_list = \ + CpuPlatform.get_allowed_cpu_memory_node_list() + assert len(allowed_numa_nodes) >= self.parallel_config.world_size, ( + f"No enough allowed NUMA nodes to bind threads of " + f"{self.parallel_config.world_size} CPUWorkers. " + f"Allowed NUMA nodes are {allowed_numa_nodes}. 
" + "Please try to bind threads manually.") + + # Get CPUs on NUMA node `allowed_numa_nodes[local_rank]`` + selected_numa_node = allowed_numa_nodes[ + self.local_rank] # type: ignore + logical_cpu_list = [ + x for x in logical_cpu_list if x.numa_node == selected_numa_node + ] + + # Select CPUs from each physical core via cpu_selector + core_to_cpus: dict[int, list[LogicalCPUInfo]] = {} + for cpu_info in logical_cpu_list: + if cpu_info.physical_core not in core_to_cpus: + core_to_cpus[cpu_info.physical_core] = [] + core_to_cpus[cpu_info.physical_core].append(cpu_info) + logical_cpu_list = [] + for cpu_list in core_to_cpus.values(): + cpu_list = sorted(cpu_list, key=lambda x: x.id) + logical_cpu_list.extend(cpu_selector(cpu_list)) + logical_cpu_list = sorted(logical_cpu_list, key=lambda x: x.id) + + # Reserve CPUs for other processes + reserve_cpu_num = envs.VLLM_CPU_NUM_OF_RESERVED_CPU + if reserve_cpu_num is None: + reserve_cpu_num = 1 if self.parallel_config.world_size > 1 else 0 + assert len(logical_cpu_list) > reserve_cpu_num, ( + f"VLLM_CPU_NUM_OF_RESERVED_CPU ({reserve_cpu_num}) " + f"should less than {len(logical_cpu_list)}.") + if reserve_cpu_num != 0: + logical_cpu_list = logical_cpu_list[:-reserve_cpu_num] + + logger.info("auto thread-binding list (id, physical core): %s", + [(x.id, x.physical_core) for x in logical_cpu_list]) + return ",".join([str(x.id) for x in logical_cpu_list]) From c81259d33a77f657bce9bd8ab0e3548826df258d Mon Sep 17 00:00:00 2001 From: Rabi Mishra Date: Sat, 19 Jul 2025 17:45:07 +0530 Subject: [PATCH 22/57] Fix/remove some broken model executor tests (#21224) Signed-off-by: Rabi Mishra --- tests/model_executor/test_guided_processors.py | 13 ------------- tests/model_executor/test_model_load_with_params.py | 6 +++--- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/tests/model_executor/test_guided_processors.py b/tests/model_executor/test_guided_processors.py index f08c7f7efccb..721478f42442 100644 --- a/tests/model_executor/test_guided_processors.py +++ b/tests/model_executor/test_guided_processors.py @@ -189,19 +189,6 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex): GuidedDecodingParams(json=sample_json_schema, grammar="test grammar") -def test_guided_decoding_backend_options(): - """Test backend-specific options""" - with pytest.warns(DeprecationWarning): - guided_decoding_params = GuidedDecodingParams( - backend= - "xgrammar:no-fallback,disable-any-whitespace,no-additional-properties" - ) - assert guided_decoding_params.backend == "xgrammar" - assert guided_decoding_params.disable_fallback - assert guided_decoding_params.disable_any_whitespace - assert guided_decoding_params.disable_additional_properties - - def test_pickle_xgrammar_tokenizer_data(): try: import xgrammar as xgr diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 4bdb651e5170..1d2d9f9a65bb 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -49,7 +49,7 @@ def test_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, BertEmbeddingModel) - assert isinstance(model._pooler, CLSPool) + assert isinstance(model.pooler.pooling, CLSPool) vllm_model.apply_model(check_model) @@ -87,7 +87,7 @@ def test_roberta_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, RobertaEmbeddingModel) - assert isinstance(model._pooler, MeanPool) + assert 
isinstance(model.pooler.pooling, MeanPool) vllm_model.apply_model(check_model) @@ -114,7 +114,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): def check_model(model): assert isinstance(model, RobertaEmbeddingModel) assert not hasattr(model, "lm_head") - assert isinstance(model._pooler, CLSPool) + assert isinstance(model.pooler.pooling, CLSPool) vllm_model.apply_model(check_model) From da6579bf41754e442de8f0a3ffa9652e02613618 Mon Sep 17 00:00:00 2001 From: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Date: Sat, 19 Jul 2025 21:16:48 +0900 Subject: [PATCH 23/57] [CI/CD][bugfix]fix: error argument to loads has incompatible type (#21223) Signed-off-by: Sungjae Lee <33976427+llsj14@users.noreply.github.com> Signed-off-by: Sungjae Lee --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d352a22a6d91..1ca4917de26b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1266,8 +1266,8 @@ def create_engine_config( ) observability_config = ObservabilityConfig( - show_hidden_metrics_for_version=self. - show_hidden_metrics_for_version, + show_hidden_metrics_for_version=( + self.show_hidden_metrics_for_version), otlp_traces_endpoint=self.otlp_traces_endpoint, collect_detailed_traces=self.collect_detailed_traces, ) From 6a971ed692974b3d6309d556b15c8cc726b091f9 Mon Sep 17 00:00:00 2001 From: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Date: Sat, 19 Jul 2025 21:58:07 +0800 Subject: [PATCH 24/57] [Docs] Update the link to the 'Prometheus/Grafana' example (#21225) --- docs/design/v1/metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index 7156ee9dd3ec..eec42d79d820 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../.. ### Grafana Dashboard -vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. 
The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: From 9f414a12adb991d04d2adf0b80f1f115d6281fad Mon Sep 17 00:00:00 2001 From: kourosh hakhamaneshi <31483498+kouroshHakha@users.noreply.github.com> Date: Sat, 19 Jul 2025 08:46:50 -0700 Subject: [PATCH 25/57] [BugFix] Make PD work with Ray (#21072) Signed-off-by: Kourosh Hakhamaneshi --- .../kv_connector/unit/test_nixl_connector.py | 117 +++++++----------- .../unit/test_output_aggreagator.py} | 37 ++---- .../kv_transfer/kv_connector/utils.py | 90 ++++++++++++++ .../kv_transfer/kv_connector/v1/base.py | 2 +- vllm/mocks/__init__.py | 0 vllm/mocks/mock_nixl_connector.py | 76 ++++++++++++ vllm/sequence.py | 6 + vllm/v1/executor/multiproc_executor.py | 86 ++----------- vllm/v1/executor/ray_distributed_executor.py | 57 +++++++-- vllm/v1/worker/gpu_model_runner.py | 49 +++++++- vllm/v1/worker/gpu_worker.py | 30 ++--- 11 files changed, 329 insertions(+), 221 deletions(-) rename tests/v1/{executor/test_multiproc_executor.py => kv_connector/unit/test_output_aggreagator.py} (72%) create mode 100644 vllm/mocks/__init__.py create mode 100644 vllm/mocks/mock_nixl_connector.py diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index c4f558b7acdb..a0dfd54fb825 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -1,13 +1,14 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os +import tempfile +import textwrap import time -import uuid -from collections import defaultdict -from typing import Optional from unittest.mock import patch import pytest +import ray from vllm import LLM from vllm.config import KVTransferConfig @@ -15,11 +16,32 @@ KVConnectorRole, NixlAgentMetadata, NixlConnector, NixlConnectorMetadata, NixlConnectorWorker) from vllm.forward_context import ForwardContext +from vllm.mocks.mock_nixl_connector import FakeNixlWrapper from vllm.sampling_params import SamplingParams from .utils import create_request, create_scheduler, create_vllm_config +def _make_stub_pkg() -> str: + """Return a directory that makes + `from nixl._api import nixl_agent` resolve to our FakeNixlWrapper.""" + td = tempfile.mkdtemp() + pkg_root = os.path.join(td, "nixl", "_api") + os.makedirs(pkg_root, exist_ok=True) + + stub = textwrap.dedent("""\ + # Forward the real FakeNixlWrapper that the driver already defined. + print("In fake package") + from vllm.mocks.mock_nixl_connector import FakeNixlWrapper as nixl_agent + """) + with open(os.path.join(pkg_root, "__init__.py"), "w") as f: + f.write(stub) + + # touch parent package + open(os.path.join(td, "nixl", "__init__.py"), "w").close() + return td + + def test_basic_interface(): """Unit test for basic NixlConnector interface functionality.""" @@ -87,77 +109,6 @@ def test_prompt_less_than_block_size(): assert len(scheduler_output.scheduled_new_reqs) == 1 -class FakeNixlWrapper: - """Mock implementation of NixlWrapper for testing. - - We don't inherit from nixl._api.nixl_agent because nixl may not be - installed. 
- """ - - AGENT_METADATA = b"fake_agent_metadata" - REMOTE_AGENT_NAME = "remote_agent" - - def __init__(self, agent_name: str, *args, **kwargs): - self._cycles_before_xfer_done = 0 - self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict( - lambda: 0) - - def get_reg_descs(self, caches_data, memory_type: str) -> list: - return [str(uuid.uuid4()) for _ in caches_data] - - def register_memory(self, descs) -> None: - pass - - def get_xfer_descs(self, blocks_data, memory_type: str) -> list: - return [str(uuid.uuid4()) for _ in blocks_data] - - def prep_xfer_dlist(self, agent_name: str, descs: list) -> int: - return uuid.uuid4().int - - def get_agent_metadata(self) -> bytes: - return self.AGENT_METADATA - - def add_remote_agent(self, agent_metadata: bytes) -> str: - return self.REMOTE_AGENT_NAME - - def get_new_notifs(self) -> dict[str, list[bytes]]: - # Used to collect done_sending, which we don't test yet. - return {} - - def check_xfer_state(self, handle: int) -> str: - if self._check_xfer_state_cycles[ - handle] >= self._cycles_before_xfer_done: - return "DONE" - self._check_xfer_state_cycles[handle] += 1 - return "PROC" - - def release_xfer_handle(self, handle: int) -> None: - pass - - def send_notif(self, agent_name: str, notif_msg: bytes) -> None: - pass - - def make_prepped_xfer(self, - xfer_type: str, - local_xfer_side_handle: int, - local_block_descs_ids: list[int], - remote_xfer_side_handle: int, - remote_block_descs_ids: list[int], - notif_msg: Optional[bytes] = None) -> int: - return uuid.uuid4().int - - def transfer(self, handle: int) -> str: - return "PROC" - - ############################################################ - # Follow are for changing the behavior during testing. - ############################################################ - - def set_cycles_before_xfer_done(self, cycles: int): - """Set the number of cycles before a transfer is considered done.""" - self._cycles_before_xfer_done = cycles - - class FakeNixlConnectorWorker(NixlConnectorWorker): REMOTE_ENGINE_ID = "remote_engine" @@ -378,10 +329,14 @@ def test_concurrent_load_kv( raise TimeoutError("Took too long to complete async handshake.") +# NOTE: resource cleanup in mp backend is a bit finicky, so the order in which +# we put here is important. First run ray, it will clean up the resources, then +# the rest of the tests. +@pytest.mark.parametrize("distributed_executor_backend", ["ray", None]) @patch( "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper", FakeNixlWrapper) -def test_abort_timeout_on_prefiller(monkeypatch): +def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend): """ Test lifecycle of an aborted Remote Prefill request hitting the timeout. 
-----> P @@ -399,11 +354,23 @@ def test_abort_timeout_on_prefiller(monkeypatch): timeout = 6 monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") monkeypatch.setenv("VLLM_NIXL_ABORT_REQUEST_TIMEOUT", str(timeout)) + + # Build runtime_env only if we’re using Ray + if distributed_executor_backend == "ray": + runtime_env = { + "working_dir": _make_stub_pkg(), # ship stub package + "env_vars": { + "VLLM_NIXL_ABORT_REQUEST_TIMEOUT": str(timeout), + }, + } + ray.init(runtime_env=runtime_env) + llm = LLM( model=model_name, enforce_eager=True, gpu_memory_utilization=0.5, kv_transfer_config=kv_transfer_config, + distributed_executor_backend=distributed_executor_backend, ) remote_prefill_opts = { "do_remote_decode": True, diff --git a/tests/v1/executor/test_multiproc_executor.py b/tests/v1/kv_connector/unit/test_output_aggreagator.py similarity index 72% rename from tests/v1/executor/test_multiproc_executor.py rename to tests/v1/kv_connector/unit/test_output_aggreagator.py index c1425d82becf..cad73f68e9f1 100644 --- a/tests/v1/executor/test_multiproc_executor.py +++ b/tests/v1/kv_connector/unit/test_output_aggreagator.py @@ -1,28 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import threading -from collections import defaultdict from concurrent.futures import Future from typing import Optional -from vllm.v1.executor.multiproc_executor import MultiprocExecutor +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.v1.outputs import ModelRunnerOutput -class DummyMultiprocExecutor(MultiprocExecutor): - - def __init__(self, output_rank, world_size): - # Manually initialize minimal required fields - self.output_rank = output_rank - self.world_size = world_size - self._send_remaining_count = defaultdict[str, - int](lambda: self.world_size) - self._recv_remaining_count = defaultdict[str, - int](lambda: self.world_size) - self.io_thread_pool = None - self.shutdown_event = threading.Event() - - class DummyModelRunnerOutput(ModelRunnerOutput): def __init__(self, @@ -33,14 +17,14 @@ def __init__(self, def test_aggregate_workers_output(): - executor = DummyMultiprocExecutor(output_rank=0, world_size=2) + aggregator = KVOutputAggregator(world_size=2) output1 = DummyModelRunnerOutput(finished_sending={'req1'}, finished_recving={'req2'}) output2 = DummyModelRunnerOutput(finished_sending=None, finished_recving=None) - aggregated = executor._aggregate_workers_output([output1, output2]) + aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 assert aggregated.finished_sending is None @@ -51,7 +35,7 @@ def test_aggregate_workers_output(): output2 = DummyModelRunnerOutput(finished_sending={'req1'}, finished_recving=None) - aggregated = executor._aggregate_workers_output([output1, output2]) + aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 assert aggregated.finished_sending == {'req1'} @@ -62,7 +46,7 @@ def test_aggregate_workers_output(): output2 = DummyModelRunnerOutput(finished_sending={'req1'}, finished_recving={'req2'}) - aggregated = executor._aggregate_workers_output([output1, output2]) + aggregated = aggregator.aggregate([output1, output2]) assert aggregated is output1 assert aggregated.finished_sending is None @@ -70,12 +54,11 @@ def test_aggregate_workers_output(): def test_async_aggregate_workers_output(): - executor = DummyMultiprocExecutor(output_rank=0, world_size=2) + aggregator = KVOutputAggregator(world_size=2) future1: 
Future[DummyModelRunnerOutput] = Future() future2: Future[DummyModelRunnerOutput] = Future() - result_future = executor._async_aggregate_workers_output( - [future1, future2]) + result_future = aggregator.async_aggregate([future1, future2]) output1 = DummyModelRunnerOutput(finished_sending={'req1'}, finished_recving={'req2'}) @@ -92,8 +75,7 @@ def test_async_aggregate_workers_output(): future1 = Future() future2 = Future() - result_future = executor._async_aggregate_workers_output( - [future1, future2]) + result_future = aggregator.async_aggregate([future1, future2]) output1 = DummyModelRunnerOutput(finished_sending=None, finished_recving=None) @@ -110,8 +92,7 @@ def test_async_aggregate_workers_output(): future1 = Future() future2 = Future() - result_future = executor._async_aggregate_workers_output( - [future1, future2]) + result_future = aggregator.async_aggregate([future1, future2]) output1 = DummyModelRunnerOutput(finished_sending=None, finished_recving=None) diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 5cbc8ca31752..c179d6cc29b7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -3,12 +3,18 @@ """ KV cache helper for store. """ +from collections import defaultdict +from collections.abc import Sequence +from concurrent.futures import CancelledError, Future +from typing import Optional, cast + import torch import vllm.envs as envs from vllm import _custom_ops as ops from vllm.config import VllmConfig, get_current_vllm_config from vllm.logger import init_logger +from vllm.v1.outputs import ModelRunnerOutput logger = init_logger(__name__) @@ -107,3 +113,87 @@ def get_kv_connector_cache_layout(): "layout to HND for better xfer performance.") return "HND" return "NHD" + + +class KVOutputAggregator: + """Utility class to aggregate the output of all workers into a single + output corresponding to Rank 0 for scheduler.""" + + def __init__(self, world_size: int): + # Complete transfer tracker. 
Used by to track finished requests + # [req_id -> n_finished_workers] + self._recv_remaining_count = defaultdict[str, int](lambda: world_size) + self._send_remaining_count = defaultdict[str, int](lambda: world_size) + + def aggregate(self, + outputs: list[ModelRunnerOutput], + output_rank: int = 0) -> ModelRunnerOutput: + # aggregate finished_sending, finished_recving from all workers + + def update_finished_set(req_ids: Optional[set[str]], + remaining_count_dict: dict[str, int], + finished_set: set[str]) -> None: + for req_id in req_ids or (): + new_count = remaining_count_dict[req_id] - 1 + if new_count == 0: + finished_set.add(req_id) + del remaining_count_dict[req_id] + else: + remaining_count_dict[req_id] = new_count + + finished_sending = set[str]() + finished_recving = set[str]() + for output in outputs: + update_finished_set(output.finished_sending, + self._send_remaining_count, finished_sending) + update_finished_set(output.finished_recving, + self._recv_remaining_count, finished_recving) + + # select output of the worker specified by output_rank + output = outputs[output_rank] + + # set the aggregated finished_sending / finished_recving + # if output.finished_sending/recving is not empty, but the other ranks + # still have unfinished send/recv, we want to set the aggregated + # finished_sending/recving to None until all ranks have finished + # send/recv + output.finished_sending = finished_sending if finished_sending else None + output.finished_recving = finished_recving if finished_recving else None + + return output + + def async_aggregate(self, + output_futures: Sequence[Future[ModelRunnerOutput]], + output_rank: int = 0) -> Future[ModelRunnerOutput]: + """Takes a list of futures and returns a single future which resolves + to the respective list of outputs.""" + result_future: Future[ModelRunnerOutput] = Future() + + outputs: list[Optional[ModelRunnerOutput]] = [None + ] * len(output_futures) + + def make_callback(idx): + + def callback(fut): + if result_future.done(): + return + + try: + outputs[idx] = fut.result() + except CancelledError: + result_future.cancel() + except Exception as e: + result_future.set_exception(e) + + # this check assumes io_thread_pool uses a single thread + if all(outputs): + result_future.set_result( + self.aggregate(cast(list[ModelRunnerOutput], outputs), + output_rank)) + + return callback + + for i, output_future in enumerate(output_futures): + output_future.add_done_callback(make_callback(i)) + + return result_future diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 9459ab27aba3..e1245775bea3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -194,7 +194,7 @@ def get_finished( """ Notifies worker-side connector ids of requests that have finished generating tokens on the worker. - The scheduler process (via the MultiprocExecutor) will use this output + The scheduler process (via the Executors) will use this output to track which workers are done. 
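        [Editor's note, not part of this patch] When world_size > 1, these
        per-worker sets are combined by the KVOutputAggregator added above in
        kv_connector/utils.py, so the scheduler only sees a request id once
        every rank has reported it as finished sending/receiving.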
Returns: diff --git a/vllm/mocks/__init__.py b/vllm/mocks/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/mocks/mock_nixl_connector.py b/vllm/mocks/mock_nixl_connector.py new file mode 100644 index 000000000000..54e2c5ee3b0a --- /dev/null +++ b/vllm/mocks/mock_nixl_connector.py @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import uuid +from collections import defaultdict +from typing import Optional + + +class FakeNixlWrapper: + """Mock implementation of NixlWrapper for testing. + + We don't inherit from nixl._api.nixl_agent because nixl may not be + installed. + """ + + AGENT_METADATA = b"fake_agent_metadata" + REMOTE_AGENT_NAME = "remote_agent" + + def __init__(self, agent_name: str, *args, **kwargs): + self._cycles_before_xfer_done = 0 + self._check_xfer_state_cycles: defaultdict[int, int] = defaultdict( + lambda: 0) + + def get_reg_descs(self, caches_data, memory_type: str) -> list: + return [str(uuid.uuid4()) for _ in caches_data] + + def register_memory(self, descs) -> None: + pass + + def get_xfer_descs(self, blocks_data, memory_type: str) -> list: + return [str(uuid.uuid4()) for _ in blocks_data] + + def prep_xfer_dlist(self, agent_name: str, descs: list) -> int: + return uuid.uuid4().int + + def get_agent_metadata(self) -> bytes: + return self.AGENT_METADATA + + def add_remote_agent(self, agent_metadata: bytes) -> str: + return self.REMOTE_AGENT_NAME + + def get_new_notifs(self) -> dict[str, list[bytes]]: + # Used to collect done_sending, which we don't test yet. + return {} + + def check_xfer_state(self, handle: int) -> str: + if self._check_xfer_state_cycles[ + handle] >= self._cycles_before_xfer_done: + return "DONE" + self._check_xfer_state_cycles[handle] += 1 + return "PROC" + + def release_xfer_handle(self, handle: int) -> None: + pass + + def send_notif(self, agent_name: str, notif_msg: bytes) -> None: + pass + + def make_prepped_xfer(self, + xfer_type: str, + local_xfer_side_handle: int, + local_block_descs_ids: list[int], + remote_xfer_side_handle: int, + remote_block_descs_ids: list[int], + notif_msg: Optional[bytes] = None) -> int: + return uuid.uuid4().int + + def transfer(self, handle: int) -> str: + return "PROC" + + ############################################################ + # Follow are for changing the behavior during testing. + ############################################################ + + def set_cycles_before_xfer_done(self, cycles: int): + """Set the number of cycles before a transfer is considered done.""" + self._cycles_before_xfer_done = cycles diff --git a/vllm/sequence.py b/vllm/sequence.py index 87ba74c68536..99208fbad65f 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1188,9 +1188,15 @@ class IntermediateTensors: """For all pipeline stages except the last, we need to return the hidden states and residuals to be sent to the next stage. This data structure contains the hidden states and residuals for a request. + + Each stage also needs to handle its own finished_sending and + finished_recving in case of kv transfer. 
""" tensors: dict[str, torch.Tensor] + # [req_ids] + finished_sending: Optional[set[str]] = None + finished_recving: Optional[set[str]] = None def __init__(self, tensors): # manually define this function, so that diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 4a4144c4860a..11ddade3eb70 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -9,8 +9,7 @@ import time import traceback import weakref -from collections import defaultdict -from concurrent.futures import CancelledError, Future, ThreadPoolExecutor +from concurrent.futures import Future, ThreadPoolExecutor from dataclasses import dataclass from enum import Enum, auto from functools import partial @@ -27,6 +26,7 @@ destroy_model_parallel) from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.multiproc_worker_utils import ( _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger @@ -118,13 +118,8 @@ def _init_executor(self) -> None: self.output_rank = self._get_output_rank() self.has_connector = self.vllm_config.kv_transfer_config is not None - - # Complete transfer tracker. Used by to track finished requests - # [req_id -> n_finished_workers] - self._recv_remaining_count = defaultdict[str, - int](lambda: self.world_size) - self._send_remaining_count = defaultdict[str, - int](lambda: self.world_size) + self.kv_output_aggregator = KVOutputAggregator( + self.parallel_config.world_size) def start_worker_monitor(self): workers = self.workers @@ -186,8 +181,9 @@ def execute_model( # aggregate all workers output to a single output if non_block: - return self._async_aggregate_workers_output(outputs) - return self._aggregate_workers_output(outputs) + return self.kv_output_aggregator.async_aggregate( + outputs, self.output_rank) + return self.kv_output_aggregator.aggregate(outputs, self.output_rank) def collective_rpc(self, method: Union[str, Callable], @@ -246,74 +242,6 @@ def get_response(w: WorkerProcHandle, except TimeoutError as e: raise TimeoutError(f"RPC call to {method} timed out.") from e - def _aggregate_workers_output( - self, outputs: list[ModelRunnerOutput]) -> ModelRunnerOutput: - # aggregate finished_sending, finished_recving from all workers - - def update_finished_set(req_ids: Optional[set[str]], - remaining_count_dict: dict[str, int], - finished_set: set[str]) -> None: - for req_id in req_ids or (): - new_count = remaining_count_dict[req_id] - 1 - if new_count == 0: - finished_set.add(req_id) - del remaining_count_dict[req_id] - else: - remaining_count_dict[req_id] = new_count - - finished_sending = set[str]() - finished_recving = set[str]() - for output in outputs: - update_finished_set(output.finished_sending, - self._send_remaining_count, finished_sending) - update_finished_set(output.finished_recving, - self._recv_remaining_count, finished_recving) - - # select output of the worker specified by output_rank - output = outputs[self.output_rank] - - # set the aggregated finished_sending / finished_recving - output.finished_sending = finished_sending if finished_sending else None - output.finished_recving = finished_recving if finished_recving else None - - return output - - def _async_aggregate_workers_output( - self, output_futures: list[Future[ModelRunnerOutput]] - ) -> (Future[ModelRunnerOutput]): - """Takes a list of futures and returns a single future which resolves - 
to the respective list of outputs.""" - result_future: Future[ModelRunnerOutput] = Future() - - outputs: list[Optional[ModelRunnerOutput]] = [None - ] * len(output_futures) - - def make_callback(idx): - - def callback(fut): - if result_future.done(): - return - - try: - outputs[idx] = fut.result() - except CancelledError: - result_future.cancel() - except Exception as e: - result_future.set_exception(e) - - # this check assumes io_thread_pool uses a single thread - if all(outputs): - result_future.set_result( - self._aggregate_workers_output( - cast(list[ModelRunnerOutput], outputs))) - - return callback - - for i, output_future in enumerate(output_futures): - output_future.add_done_callback(make_callback(i)) - - return result_future - @staticmethod def _ensure_worker_termination(worker_procs: list[BaseProcess]): """Ensure that all worker processes are terminated. Assumes workers have diff --git a/vllm/v1/executor/ray_distributed_executor.py b/vllm/v1/executor/ray_distributed_executor.py index eb659e4f9e47..b86ac048f520 100644 --- a/vllm/v1/executor/ray_distributed_executor.py +++ b/vllm/v1/executor/ray_distributed_executor.py @@ -2,33 +2,55 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from concurrent.futures import Future -from typing import Union +from typing import Optional, Union +from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator from vllm.executor.ray_distributed_executor import ( # noqa RayDistributedExecutor as RayDistributedExecutorV0) +from vllm.logger import init_logger from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput +logger = init_logger(__name__) + class FutureWrapper(Future): - """A wrapper around a Ray output reference to meet the interface - of .execute_model(). + """A wrapper around Ray output reference to meet the interface + of .execute_model(): The top level (core busy loop) expects .result() api + to block and return a single output. + + If aggregator is provided, the outputs from all workers are aggregated upon + the result() call. If not only the first worker's output is returned. """ - def __init__(self, ref): + def __init__(self, refs, aggregator: Optional[KVOutputAggregator] = None): super().__init__() - self.ref = ref + self.refs = refs + self.aggregator = aggregator def result(self, timeout=None): if timeout is not None: raise NotImplementedError("timeout is not supported") - return self.ref.get() + + if self.aggregator is None: + return self.refs[0].get() + + outputs = [ref.get() for ref in self.refs] + return self.aggregator.aggregate(outputs, output_rank=0) class RayDistributedExecutor(RayDistributedExecutorV0, Executor): """Ray distributed executor using Ray Compiled Graphs.""" + def _init_executor(self) -> None: + super()._init_executor() + + # KV connector setup + self.has_connector = self.vllm_config.kv_transfer_config is not None + self.kv_output_aggregator = KVOutputAggregator( + self.parallel_config.world_size) + @property def max_concurrent_batches(self) -> int: """Ray distributed executor supports pipeline parallelism, @@ -56,13 +78,24 @@ def execute_model( refs = self.forward_dag.execute(scheduler_output) # type: ignore - # When PP is not used, we block here until the result is available. + if not self.has_connector: + # Get output only from a single worker (output_rank) + # When PP is not used, we block here until the result is available. 
+ if self.max_concurrent_batches == 1: + return refs[0].get() + + # When PP is used, we return a FutureWrapper immediately so that + # the scheduler can yield to the next batch. + return FutureWrapper(refs) + + # Get output from all workers when connector is present if self.max_concurrent_batches == 1: - return refs[0].get() + # Block and get results from all workers + outputs = [ref.get() for ref in refs] + return self.kv_output_aggregator.aggregate(outputs) - # When PP is used, we return a FutureWrapper immediately so that - # the scheduler can yield to the next batch. - return FutureWrapper(refs[0]) + # Return a future that will aggregate outputs from all workers + return FutureWrapper(refs, self.kv_output_aggregator) def reinitialize_distributed( self, reconfig_request: ReconfigureDistributedRequest) -> None: @@ -70,4 +103,4 @@ def reinitialize_distributed( if reconfig_request.new_data_parallel_rank == \ ReconfigureRankType.SHUTDOWN_CURRENT_RANK: self.shutdown() - return + return \ No newline at end of file diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a5c446731144..d5449a68bc28 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import copy import gc import time from contextlib import contextmanager @@ -1270,6 +1271,8 @@ def _pool( hidden_states: torch.Tensor, num_scheduled_tokens: int, num_scheduled_tokens_np: np.ndarray, + finished_sending: Optional[set[str]], + finished_recving: Optional[set[str]], ) -> ModelRunnerOutput: assert self.input_batch.num_reqs ==\ len(self.input_batch.pooling_params), \ @@ -1304,6 +1307,8 @@ def _pool( logprobs=None, prompt_logprobs_dict={}, pooler_output=pooler_output, + finished_sending=finished_sending, + finished_recving=finished_recving, ) @torch.inference_mode() @@ -1314,12 +1319,11 @@ def execute_model( ) -> Union[ModelRunnerOutput, IntermediateTensors]: self._update_states(scheduler_output) if not scheduler_output.total_num_scheduled_tokens: - if has_kv_transfer_group(): - with set_forward_context(None, self.vllm_config): - self.maybe_setup_kv_connector(scheduler_output) + if not has_kv_transfer_group(): + # Return empty ModelRunnerOutput if there's no work to do. + return EMPTY_MODEL_RUNNER_OUTPUT - # Return empty ModelRunnerOutput if there's no work to do. - return EMPTY_MODEL_RUNNER_OUTPUT + return self.kv_connector_no_forward(scheduler_output) # Prepare the decoder inputs. (attn_metadata, attention_cuda_graphs, logits_indices, @@ -1412,6 +1416,8 @@ def execute_model( ) self.maybe_wait_for_kv_save() + finished_sending, finished_recving = ( + self.get_finished_kv_transfers(scheduler_output)) if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = model_output @@ -1429,6 +1435,9 @@ def execute_model( if not get_pp_group().is_last_rank: # For mid-pipeline stages, return the hidden states. 
if not broadcast_pp_output: + if finished_sending or finished_recving: + hidden_states.finished_sending = finished_sending + hidden_states.finished_recving = finished_recving return hidden_states assert isinstance(hidden_states, IntermediateTensors) get_pp_group().send_tensor_dict(hidden_states.tensors, @@ -1437,7 +1446,8 @@ def execute_model( else: if self.input_batch.pooling_params: return self._pool(hidden_states, num_scheduled_tokens, - num_scheduled_tokens_np) + num_scheduled_tokens_np, finished_sending, + finished_recving) sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None) @@ -1587,6 +1597,8 @@ def execute_model( logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, pooler_output=[], + finished_sending=finished_sending, + finished_recving=finished_recving, num_nans_in_logits=num_nans_in_logits, ) @@ -1711,6 +1723,31 @@ def maybe_wait_for_kv_save() -> None: if has_kv_transfer_group(): get_kv_transfer_group().wait_for_save() + @staticmethod + def get_finished_kv_transfers( + scheduler_output: "SchedulerOutput", + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + if has_kv_transfer_group(): + return get_kv_transfer_group().get_finished( + scheduler_output.finished_req_ids) + return None, None + + def kv_connector_no_forward( + self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput: + # KV send/recv even if no work to do. + with set_forward_context(None, self.vllm_config): + self.maybe_setup_kv_connector(scheduler_output) + finished_sending, finished_recving = ( + self.get_finished_kv_transfers(scheduler_output)) + + if not finished_sending and not finished_recving: + return EMPTY_MODEL_RUNNER_OUTPUT + + output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) + output.finished_sending = finished_sending + output.finished_recving = finished_recving + return output + def propose_ngram_draft_token_ids( self, sampled_token_ids: list[list[int]], diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2201481fa5bf..6411874883ef 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -15,9 +15,7 @@ from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce) -from vllm.distributed.kv_transfer import (ensure_kv_transfer_initialized, - get_kv_transfer_group, - has_kv_transfer_group) +from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized from vllm.distributed.parallel_state import get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -335,25 +333,17 @@ def execute_model( assert isinstance(output, IntermediateTensors) get_pp_group().send_tensor_dict(output.tensors, all_gather_group=get_tp_group()) - output = EMPTY_MODEL_RUNNER_OUTPUT - assert isinstance(output, ModelRunnerOutput) - if has_kv_transfer_group(): - finished_sending, finished_recving = ( - get_kv_transfer_group().get_finished( - scheduler_output.finished_req_ids)) - if finished_sending or finished_recving: - if output is EMPTY_MODEL_RUNNER_OUTPUT: - output = copy.copy(EMPTY_MODEL_RUNNER_OUTPUT) - output.finished_sending = finished_sending - output.finished_recving = finished_recving - - # Clear KVConnector state for this step. - get_kv_transfer_group().clear_connector_metadata() - - # with a connector, the scheduler expects output from all workers - return output + # In case of PP with kv transfer, we need to pass through the + # finished_sending and finished_recving buffers. 
+ empty_output = EMPTY_MODEL_RUNNER_OUTPUT + if output.finished_sending or output.finished_recving: + empty_output = copy.copy(empty_output) + empty_output.finished_sending = output.finished_sending + empty_output.finished_recving = output.finished_recving + output = empty_output + assert isinstance(output, ModelRunnerOutput) # return output only from the driver worker return output if self.is_driver_worker else None From 881e3cbe3b3cef5d6fc50ca0c19e30a9dd11c452 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Sat, 19 Jul 2025 21:27:21 +0200 Subject: [PATCH 26/57] [V1] [Hybrid] Enable piecewise CUDA Graph for mamba layers (#21194) Signed-off-by: Thomas Parnell --- .../models/language/generation/test_hybrid.py | 1 - vllm/config.py | 1 + .../layers/mamba/mamba_mixer2.py | 75 ++++++++++++++++--- vllm/model_executor/models/bamba.py | 11 +-- vllm/model_executor/models/falcon_h1.py | 8 +- .../model_executor/models/granitemoehybrid.py | 8 +- vllm/model_executor/models/mamba2.py | 8 +- vllm/model_executor/models/nemotron_h.py | 8 +- vllm/model_executor/models/zamba2.py | 8 +- vllm/v1/worker/gpu_model_runner.py | 3 - 10 files changed, 100 insertions(+), 31 deletions(-) diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index eba14e64553e..e4294512338b 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -104,7 +104,6 @@ def test_models( m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER") with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS, - enforce_eager=True, enable_prefix_caching=False) as vllm_model: vllm_v1_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs) diff --git a/vllm/config.py b/vllm/config.py index 384cb584fa9a..a9720fa3142c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4312,6 +4312,7 @@ def set_splitting_ops_for_v1(self): self.splitting_ops = [] if self.full_cuda_graph else [ "vllm.unified_attention", "vllm.unified_attention_with_output", + "vllm.mamba_mixer2", ] diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index f3850d31c829..e32b2be4d40e 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -13,7 +13,7 @@ get_tensor_model_parallel_world_size, tensor_model_parallel_all_gather, tensor_model_parallel_all_reduce) -from vllm.forward_context import get_forward_context +from vllm.forward_context import ForwardContext, get_forward_context from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -33,6 +33,8 @@ LoaderFunction, composed_weight_loader, sharded_weight_loader) from vllm.model_executor.models.mamba_cache import MambaCacheParams from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform +from vllm.utils import direct_register_custom_op from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionMetadata # Added by the IBM Team, 2024 @@ -424,14 +426,36 @@ def __init__( def forward_native( self, hidden_states: torch.Tensor, - conv_state: torch.Tensor, - ssm_state: torch.Tensor, + output: torch.Tensor, + mamba_cache_params: MambaCacheParams, + mamba2_metadata: Mamba2Metadata, + mup_vector: Optional[torch.Tensor] = None, ): pass + def forward( + self, + hidden_states: torch.Tensor, + output: torch.Tensor, + mamba_cache_params: MambaCacheParams, + 
mamba2_metadata: Mamba2Metadata, + mup_vector: Optional[torch.Tensor] = None, + ): + if not envs.VLLM_USE_V1: + CustomOp.forward(self, hidden_states, output, mamba_cache_params, + mamba2_metadata, mup_vector) + else: + torch.ops.vllm.mamba_mixer2( + hidden_states, + output, + self.prefix, + mup_vector, + ) + def forward_cuda( self, hidden_states: torch.Tensor, + output: torch.Tensor, mamba_cache_params: MambaCacheParams, mamba2_metadata: Mamba2Metadata, mup_vector: Optional[torch.Tensor] = None, @@ -517,6 +541,7 @@ def forward_cuda( num_prefill_tokens = attn_metadata.num_prefill_tokens # token count has_prefill = num_prefills > 0 has_decode = num_decodes > 0 + num_actual_tokens = num_prefill_tokens + num_decodes # NOTE: V0 put prefill before decode, v1 puts decode before prefill # Separate prefill and decode by splitting varlen input @@ -524,18 +549,18 @@ def forward_cuda( # NOTE: V0 put prefill before decode, v1 puts decode before prefill if envs.VLLM_USE_V1: hidden_states_B_C_d, hidden_states_B_C_p = torch.split( - hidden_states_B_C, + hidden_states_B_C[:num_actual_tokens], [num_decodes, num_prefill_tokens], dim=0, ) dt_d, dt_p = torch.split( - dt, + dt[:num_actual_tokens], [num_decodes, num_prefill_tokens], dim=0, ) # Split along batch dimension state_indices_tensor_d, state_indices_tensor_p = torch.split( - state_indices_tensor, + state_indices_tensor[:num_actual_tokens], [num_decodes, num_prefills], dim=0, ) @@ -696,11 +721,10 @@ def forward_cuda( # GatedRMSNorm internally applying SiLU to the gate # SiLU is applied internally before normalization, unlike standard # norm usage - hidden_states = self.norm(hidden_states, gate) + hidden_states = self.norm(hidden_states, gate[:num_actual_tokens]) # 5. Final linear projection - out, _ = self.out_proj(hidden_states) - return out + output[:num_actual_tokens], _ = self.out_proj(hidden_states) def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: return get_mamba_state_shape( @@ -712,3 +736,36 @@ def get_state_shape(self) -> tuple[tuple[int, ...], tuple[int, ...]]: state_size=self.ssm_state_size, conv_kernel=self.conv_kernel_size, ) + + +def mamba_mixer2( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, + mup_vector: Optional[torch.Tensor] = None, +) -> None: + forward_context: ForwardContext = get_forward_context() + self = forward_context.no_compile_layers[layer_name] + self.forward_cuda(hidden_states=hidden_states, + output=output, + mamba_cache_params=None, + mamba2_metadata=None, + mup_vector=mup_vector) + + +def mamba_mixer2_fake( + hidden_states: torch.Tensor, + output: torch.Tensor, + layer_name: str, + mup_vector: Optional[torch.Tensor] = None, +) -> None: + return + + +direct_register_custom_op( + op_name="mamba_mixer2", + op_func=mamba_mixer2, + mutates_args=["output"], + fake_impl=mamba_mixer2_fake, + dispatch_key=current_platform.dispatch_key, +) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index e93d4294a62c..0f5494427634 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -11,6 +11,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -122,11 +123,10 @@ def forward( hidden_states, residual = self.input_layernorm( hidden_states, residual) - hidden_states = 
self.mamba(hidden_states, mamba_cache_params, - mamba2_metadata) + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params, mamba2_metadata) # Fully Connected - hidden_states, residual = self.pre_ff_layernorm( - hidden_states, residual) + hidden_states, residual = self.pre_ff_layernorm(output, residual) hidden_states = self.feed_forward(hidden_states) return hidden_states, residual @@ -169,7 +169,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings if hasattr(config, "partial_rotary_factor"): - rotary_dim = self.head_dim * config.partial_rotary_factor + rotary_dim = int(self.head_dim * config.partial_rotary_factor) elif hasattr(config, "attn_rotary_emb"): rotary_dim = config.attn_rotary_emb # for backward compatibility else: @@ -258,6 +258,7 @@ def forward( } +@support_torch_compile class BambaModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py index 7761de224c9d..6a58b1501fe6 100644 --- a/vllm/model_executor/models/falcon_h1.py +++ b/vllm/model_executor/models/falcon_h1.py @@ -10,6 +10,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -179,13 +180,15 @@ def forward( mamba2_metadata: Mamba2Metadata, **kwargs, ): - hidden_states = self.mamba( + output = torch.empty_like(hidden_states) + self.mamba( hidden_states, + output, mamba_cache_params, mamba2_metadata=mamba2_metadata, mup_vector=self.mup_vector, ) - return hidden_states, residual + return output, residual class FalconH1AttentionDecoderLayer(nn.Module): @@ -398,6 +401,7 @@ def forward( return hidden_states +@support_torch_compile class FalconH1Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1c93e90737ad..59c1dce48ee7 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -11,6 +11,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -104,9 +105,9 @@ def forward( ): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) - hidden_states = self.mamba(hidden_states, mamba_cache_params, - mamba2_metadata) - hidden_states = residual + hidden_states * self.residual_multiplier + output = torch.empty_like(hidden_states) + self.mamba(hidden_states, output, mamba_cache_params, mamba2_metadata) + hidden_states = residual + output * self.residual_multiplier residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) @@ -307,6 +308,7 @@ def forward( } +@support_torch_compile class GraniteMoeHybridModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index d812d8cc0a39..adad181617e6 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -10,6 +10,7 @@ 
from vllm import envs from vllm.attention.backends.abstract import AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig from vllm.distributed.parallel_state import get_pp_group from vllm.forward_context import get_forward_context @@ -79,11 +80,12 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params, - mamba2_metadata) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params, mamba2_metadata) + return output, residual +@support_torch_compile class Mamba2Model(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index cf7b39db1fe3..6a999e2254e7 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -25,6 +25,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed.parallel_state import get_pp_group @@ -172,9 +173,9 @@ def forward( else: hidden_states, residual = self.norm(hidden_states, residual) - hidden_states = self.mixer(hidden_states, mamba_cache_params, - mamba2_metadata) - return hidden_states, residual + output = torch.empty_like(hidden_states) + self.mixer(hidden_states, output, mamba_cache_params, mamba2_metadata) + return output, residual class NemotronHAttention(nn.Module): @@ -292,6 +293,7 @@ def forward( } +@support_torch_compile class NemotronHModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index ebf8dd497f67..7764fd9b9e08 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -17,6 +17,7 @@ from vllm import envs from vllm.attention.layer import Attention +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context @@ -548,14 +549,16 @@ def forward( hidden_states = self.input_layernorm(hidden_states) # Process through Mamba mixer - hidden_states = self.mamba( + output = torch.empty_like(hidden_states) + self.mamba( hidden_states, + output, mamba_cache_params=mamba_cache_params, mamba2_metadata=mamba2_metadata, ) # residual connection after mamba - hidden_states = residual + hidden_states + hidden_states = residual + output return hidden_states @@ -646,6 +649,7 @@ def forward( return layer_outputs +@support_torch_compile class Zamba2Model(nn.Module): """Core Zamba2 model combining transformer and Mamba architectures. 
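Note: the model-side changes in this patch all follow the same destination-passing convention: each decoder layer allocates output = torch.empty_like(hidden_states) and the mixer writes into that buffer through the registered vllm.mamba_mixer2 custom op, so torch.compile can treat the mamba call as an opaque splitting op for piecewise CUDA graphs. Below is a minimal sketch of that registration pattern, condensed from the mamba_mixer2.py hunk above; the op name my_mixer, its layer lookup, and the simplified argument list are illustrative only, not part of the patch.

import torch

from vllm.forward_context import get_forward_context
from vllm.platforms import current_platform
from vllm.utils import direct_register_custom_op


def my_mixer(hidden_states: torch.Tensor, output: torch.Tensor,
             layer_name: str) -> None:
    # Resolve the eager layer object recorded in the forward context and
    # let it write its result into the caller-allocated buffer.
    layer = get_forward_context().no_compile_layers[layer_name]
    layer.forward_cuda(hidden_states=hidden_states, output=output)


def my_mixer_fake(hidden_states: torch.Tensor, output: torch.Tensor,
                  layer_name: str) -> None:
    # Fake (meta) implementation: nothing to compute here, since the output
    # buffer's shape and dtype are already fixed by the caller.
    return


direct_register_custom_op(
    op_name="my_mixer",        # illustrative; the patch registers "mamba_mixer2"
    op_func=my_mixer,
    mutates_args=["output"],   # declares the in-place write on `output`
    fake_impl=my_mixer_fake,
    dispatch_key=current_platform.dispatch_key,
)

# Caller side, mirroring the decoder-layer changes above:
#   output = torch.empty_like(hidden_states)
#   torch.ops.vllm.my_mixer(hidden_states, output, self.prefix)
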
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d5449a68bc28..1ee9c070226c 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2753,9 +2753,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: if self.vllm_config.speculative_config is not None: raise NotImplementedError( "Mamba with speculative decoding is not supported yet.") - if not self.vllm_config.model_config.enforce_eager: - raise NotImplementedError( - "Mamba with cuda graph is not supported yet.") if self.vllm_config.cache_config.enable_prefix_caching: raise NotImplementedError( "Prefix caching is not supported for Mamba yet.") From 752c6ade2e0f38a26cdaaed6ffae8f72781e2d61 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 19 Jul 2025 13:53:17 -0700 Subject: [PATCH 27/57] [V0 Deprecation] Deprecate BlockSparse Attention & Phi3-Small (#21217) Signed-off-by: Woosuk Kwon --- .../scripts/hardware_ci/run-amd-test.sh | 1 - docs/models/supported_models.md | 1 - .../attention/test_blocksparse_attention.py | 441 ----------------- .../attention/test_rocm_attention_selector.py | 32 +- tests/models/registry.py | 4 - vllm/attention/backends/abstract.py | 1 - vllm/attention/backends/blocksparse_attn.py | 466 ------------------ .../backends/differential_flash_attn.py | 4 - .../backends/dual_chunk_flash_attn.py | 1 - vllm/attention/backends/flash_attn.py | 6 +- vllm/attention/backends/flashinfer.py | 1 - vllm/attention/backends/flashmla.py | 12 +- vllm/attention/backends/mla/common.py | 1 - vllm/attention/backends/rocm_aiter_mla.py | 12 +- vllm/attention/backends/rocm_flash_attn.py | 6 +- vllm/attention/backends/triton_mla.py | 12 +- vllm/attention/backends/xformers.py | 6 +- vllm/attention/layer.py | 6 +- .../ops/blocksparse_attention/__init__.py | 0 .../blocksparse_attention_kernel.py | 433 ---------------- .../ops/blocksparse_attention/interface.py | 239 --------- .../ops/blocksparse_attention/utils.py | 246 --------- vllm/attention/selector.py | 9 - vllm/model_executor/models/phi3_small.py | 465 ----------------- vllm/model_executor/models/registry.py | 1 - vllm/platforms/interface.py | 1 - vllm/v1/attention/backends/cpu_attn.py | 6 +- vllm/v1/attention/backends/flash_attn.py | 6 +- vllm/v1/attention/backends/flashinfer.py | 3 +- vllm/v1/attention/backends/flex_attention.py | 7 +- vllm/v1/attention/backends/mla/common.py | 3 +- vllm/v1/attention/backends/mla/cutlass_mla.py | 12 +- vllm/v1/attention/backends/mla/flashmla.py | 12 +- .../attention/backends/mla/rocm_aiter_mla.py | 12 +- vllm/v1/attention/backends/mla/triton_mla.py | 12 +- vllm/v1/attention/backends/pallas.py | 8 +- vllm/v1/attention/backends/rocm_aiter_fa.py | 6 +- vllm/v1/attention/backends/triton_attn.py | 6 +- 38 files changed, 65 insertions(+), 2435 deletions(-) delete mode 100644 tests/kernels/attention/test_blocksparse_attention.py delete mode 100644 vllm/attention/backends/blocksparse_attn.py delete mode 100644 vllm/attention/ops/blocksparse_attention/__init__.py delete mode 100644 vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py delete mode 100644 vllm/attention/ops/blocksparse_attention/interface.py delete mode 100644 vllm/attention/ops/blocksparse_attention/utils.py delete mode 100644 vllm/model_executor/models/phi3_small.py diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 156456c92e63..5e5a532cb57d 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ 
b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -108,7 +108,6 @@ fi if [[ $commands == *" kernels/attention"* ]]; then commands="${commands} \ --ignore=kernels/attention/test_attention_selector.py \ - --ignore=kernels/attention/test_blocksparse_attention.py \ --ignore=kernels/attention/test_encoder_decoder_attn.py \ --ignore=kernels/attention/test_flash_attn.py \ --ignore=kernels/attention/test_flashinfer.py \ diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 887f754a3d1c..f5a89ab6cf7d 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -376,7 +376,6 @@ Specified using `--task generate`. | `OrionForCausalLM` | Orion | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. | | ✅︎ | ✅︎ | | `PhiForCausalLM` | Phi | `microsoft/phi-1_5`, `microsoft/phi-2`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi3ForCausalLM` | Phi-4, Phi-3 | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | -| `Phi3SmallForCausalLM` | Phi-3-Small | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. | | ✅︎ | ✅︎ | | `PhiMoEForCausalLM` | Phi-3.5-MoE | `microsoft/Phi-3.5-MoE-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Phi4FlashForCausalLM` | Phi-4-mini-flash-reasoning | `microsoft/microsoft/Phi-4-mini-instruct`, etc. | | | | | `PersimmonForCausalLM` | Persimmon | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. | | ✅︎ | ✅︎ | diff --git a/tests/kernels/attention/test_blocksparse_attention.py b/tests/kernels/attention/test_blocksparse_attention.py deleted file mode 100644 index 9aee818c9956..000000000000 --- a/tests/kernels/attention/test_blocksparse_attention.py +++ /dev/null @@ -1,441 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import random -from typing import Optional - -import pytest -import torch - -from tests.kernels.allclose_default import get_default_atol, get_default_rtol -from vllm import _custom_ops as ops -from vllm.attention.ops.blocksparse_attention.interface import ( - LocalStridedBlockSparseAttn) -from vllm.platforms import current_platform -from vllm.utils import get_max_shared_memory_bytes - -FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 -# This will change depending on the compute capability. -# - 512 as a buffer -MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512 -# MAX_SEQ_LEN = 2771 - -# There may not be enough gpu memory due to large NUM_BLOCKS. -# Reduce NUM_BLOCKS when it happens. 
-NUM_BLOCKS = 4321 # Arbitrary values for testing -PARTITION_SIZE = 512 -DTYPES = [torch.half, torch.bfloat16] -NUM_GEN_SEQS = [3] # Arbitrary values for testing -NUM_PREFILL_SEQS = [3] # Arbitrary values for testing -NUM_HEADS = [(40, 40)] # Arbitrary values for testing - -HEAD_SIZES = [64, 112] -BLOCK_SIZES = [16] -USE_ALIBI = [False, True] -KV_CACHE_DTYPE = ["auto", "fp8"] -SEEDS = [0] -CUDA_DEVICES = ['cuda:0'] -BLOCKSPARSE_LOCAL_BLOCKS = [16] -BLOCKSPARSE_VERT_STRIDES = [8] - -BLOCKSPARSE_BLOCK_SIZES = [64] -BLOCKSPARSE_HEADS_SLIDINGS = [2, -1] -BLOCKSPARSE_HOMO_HEADS = [True, False] - - -def ref_masked_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - attn_mask: Optional[torch.Tensor] = None, -) -> torch.Tensor: - attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float() - if attn_mask is not None: - attn_weights = attn_weights + attn_mask.float() - attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype) - out = torch.einsum("hqk,khd->qhd", attn_weights, value) - return out - - -def ref_single_query_cached_kv_attention( - output: torch.Tensor, - query: torch.Tensor, - num_queries_per_kv: int, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - block_tables: torch.Tensor, - seq_lens: torch.Tensor, - scale: float, - alibi_slopes: Optional[torch.Tensor], - tp_rank: int = 0, - blocksparse_local_blocks: int = 0, - blocksparse_vert_stride: int = 1, - blocksparse_block_size: int = 64, - blocksparse_head_sliding_step: int = 0, -) -> None: - num_query_heads = query.shape[1] - num_kv_heads = value_cache.shape[1] - head_size = value_cache.shape[2] - block_size = value_cache.shape[3] - num_seqs = query.shape[0] - - block_tables_lst = block_tables.cpu().tolist() - seq_lens_lst = seq_lens.cpu().tolist() - for i in range(num_seqs): - q = query[i].unsqueeze(0) - block_table = block_tables_lst[i] - seq_len = int(seq_lens_lst[i]) - - keys_lst: list[torch.Tensor] = [] - values_lst: list[torch.Tensor] = [] - for j in range(seq_len): - block_number = int(block_table[j // block_size]) - block_offset = j % block_size - - k = key_cache[block_number, :, :, block_offset, :] - k = k.reshape(num_kv_heads, head_size) - keys_lst.append(k) - - v = value_cache[block_number, :, :, block_offset] - values_lst.append(v) - keys = torch.stack(keys_lst, dim=0) - values = torch.stack(values_lst, dim=0) - if num_queries_per_kv > 1: - # Handle MQA and GQA - keys = torch.repeat_interleave(keys, num_queries_per_kv, dim=1) - values = torch.repeat_interleave(values, num_queries_per_kv, dim=1) - - alibi_bias = None - if alibi_slopes is not None: - # Create the ALiBi bias used in the paged attention kernel. 
- position_ids = torch.arange(seq_len).int() - alibi_bias = (position_ids - seq_len + 1).float() - alibi_bias = alibi_slopes.view(-1, 1, 1) * alibi_bias.view( - 1, 1, -1) - - if blocksparse_vert_stride >= 1: - bsize = blocksparse_block_size - hsliding = blocksparse_head_sliding_step - vert = blocksparse_vert_stride - locals = blocksparse_local_blocks - qb = (seq_len - 1) // bsize - attn_mask = q.new_zeros( - (num_query_heads, 1, seq_len)).float() - torch.inf - for h in range(num_query_heads): - if hsliding >= 0: # slide with q heads - bs_offset = (tp_rank * num_query_heads + h) * hsliding + 1 - else: # slide with kv heads - bs_offset = (tp_rank * num_kv_heads + - h // num_queries_per_kv) * (-hsliding) + 1 - for kb in range(qb + 1): - kj = kb * bsize - if (qb - kb) < locals or \ - (kb + bs_offset) % vert == 0: - attn_mask[h, 0, kj:min(kj + bsize, seq_len)] = 0 - if alibi_bias is not None: - attn_mask += alibi_bias - else: - attn_mask = alibi_bias - - out = ref_masked_attention(q, keys, values, scale, attn_mask=attn_mask) - out = out.view(num_query_heads, head_size) - output[i].copy_(out, non_blocking=True) - - -@pytest.mark.parametrize("version", ["v1", "v2"]) -@pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("use_alibi", USE_ALIBI) -@pytest.mark.parametrize("block_size", BLOCK_SIZES) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("kv_cache_dtype", KV_CACHE_DTYPE) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS) -@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES) -@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES) -@pytest.mark.parametrize("blocksparse_head_sliding_step", - BLOCKSPARSE_HEADS_SLIDINGS) -def test_paged_attention( - kv_cache_factory, - version: str, - num_seqs: int, - num_heads: tuple[int, int], - head_size: int, - use_alibi: bool, - block_size: int, - dtype: torch.dtype, - kv_cache_dtype: str, - seed: int, - device: str, - blocksparse_local_blocks: int, - blocksparse_vert_stride: int, - blocksparse_block_size: int, - blocksparse_head_sliding_step: int, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - scale = float(1.0 / (head_size**0.5)) - num_query_heads, num_kv_heads = num_heads - query = torch.empty(num_seqs, num_query_heads, head_size, dtype=dtype) - query.uniform_(-scale, scale) - - assert num_query_heads % num_kv_heads == 0 - num_queries_per_kv = num_query_heads // num_kv_heads - alibi_slopes = None - if use_alibi: - alibi_slopes = torch.rand(num_query_heads, dtype=torch.float) - - seq_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)] - seq_lens[-1] = MAX_SEQ_LEN - max_seq_len = max(seq_lens) - seq_lens = torch.tensor(seq_lens, dtype=torch.int) - - # Create the block tables. - max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] - for _ in range(num_seqs): - block_table = [ - random.randint(0, NUM_BLOCKS - 1) - for _ in range(max_num_blocks_per_seq) - ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int) - - # Create the KV caches. 
- key_caches, value_caches = kv_cache_factory(NUM_BLOCKS, block_size, 1, - num_kv_heads, head_size, - kv_cache_dtype, dtype, seed, - device) - key_cache, value_cache = key_caches[0], value_caches[0] - - # Using default kv_scale - k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device) - tp_rank = 0 - - # Call the paged attention kernel. - output = torch.empty_like(query) - if version == "v1": - ops.paged_attention_v1( - output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - tp_rank=tp_rank, - blocksparse_local_blocks=blocksparse_local_blocks, - blocksparse_vert_stride=blocksparse_vert_stride, - blocksparse_block_size=blocksparse_block_size, - blocksparse_head_sliding_step=blocksparse_head_sliding_step, - ) - elif version == "v2": - num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE) - assert PARTITION_SIZE % block_size == 0 - num_seqs, num_heads, head_size = output.shape - tmp_output = torch.empty( - size=(num_seqs, num_heads, num_partitions, head_size), - dtype=output.dtype, - ) - exp_sums = torch.empty( - size=(num_seqs, num_heads, num_partitions), - dtype=torch.float32, - ) - max_logits = torch.empty_like(exp_sums) - ops.paged_attention_v2( - output, - exp_sums, - max_logits, - tmp_output, - query, - key_cache, - value_cache, - num_kv_heads, - scale, - block_tables, - seq_lens, - block_size, - max_seq_len, - alibi_slopes, - kv_cache_dtype, - k_scale, - v_scale, - tp_rank=tp_rank, - blocksparse_local_blocks=blocksparse_local_blocks, - blocksparse_vert_stride=blocksparse_vert_stride, - blocksparse_block_size=blocksparse_block_size, - blocksparse_head_sliding_step=blocksparse_head_sliding_step, - ) - else: - raise AssertionError(f"Unknown version: {version}") - - # Run the reference implementation. - if kv_cache_dtype == "fp8": - # Convert cache data back to dtype. - x = 16 // torch.tensor([], dtype=dtype).element_size() - key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, - block_size, x) - dequantized_key_cache = torch.empty(size=key_cache_shape, - dtype=dtype, - device=device) - ops.convert_fp8(dequantized_key_cache, key_cache) - key_cache = dequantized_key_cache - - value_cache_shape = value_cache.shape - dequantized_value_cache = torch.empty(size=value_cache_shape, - dtype=dtype, - device=device) - ops.convert_fp8(dequantized_value_cache, value_cache) - value_cache = dequantized_value_cache - - ref_output = torch.empty_like(query) - ref_single_query_cached_kv_attention( - ref_output, - query, - num_queries_per_kv, - key_cache, - value_cache, - block_tables, - seq_lens, - scale, - alibi_slopes, - tp_rank, - blocksparse_local_blocks, - blocksparse_vert_stride, - blocksparse_block_size, - blocksparse_head_sliding_step, - ) - - # NOTE(woosuk): Due to the kernel-level differences in the two - # implementations, there is a small numerical difference in the two - # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 - rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 - - # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, - # so we use a relaxed tolerance for the test. 
- atol, rtol = 1e-3, 1e-5 - if kv_cache_dtype == "fp8": - atol, rtol = 1e-2, 1e-5 - torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) - - -def ref_multi_query_kv_attention( - cu_seq_lens: list[int], - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - scale: float, - dtype: torch.dtype, -) -> torch.Tensor: - num_seqs = len(cu_seq_lens) - 1 - ref_outputs = [] - for i in range(num_seqs): - start_idx = cu_seq_lens[i] - end_idx = cu_seq_lens[i + 1] - seq_len = end_idx - start_idx - - # Create attention mask. - attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype), - diagonal=1) - attn_mask = attn_mask * torch.finfo(dtype).min - attn_mask = attn_mask.to(dtype=dtype) - - ref_output = ref_masked_attention( - query[start_idx:end_idx], - key[start_idx:end_idx], - value[start_idx:end_idx], - scale, - attn_mask=attn_mask, - ) - ref_outputs.append(ref_output) - ref_output = torch.cat(ref_outputs, dim=0) - return ref_output - - -@pytest.mark.parametrize("num_seqs", NUM_PREFILL_SEQS) -@pytest.mark.parametrize("num_heads", NUM_HEADS) -@pytest.mark.parametrize("head_size", HEAD_SIZES) -@pytest.mark.parametrize("blocksparse_local_blocks", BLOCKSPARSE_LOCAL_BLOCKS) -@pytest.mark.parametrize("blocksparse_vert_stride", BLOCKSPARSE_VERT_STRIDES) -@pytest.mark.parametrize("blocksparse_block_size", BLOCKSPARSE_BLOCK_SIZES) -@pytest.mark.parametrize("blocksparse_homo_heads", BLOCKSPARSE_HOMO_HEADS) -@pytest.mark.parametrize("dtype", DTYPES) -@pytest.mark.parametrize("seed", SEEDS) -@pytest.mark.parametrize("device", CUDA_DEVICES) -@torch.inference_mode() -def test_varlen_blocksparse_attention_prefill( - num_seqs: int, - num_heads: tuple[int, int], - head_size: int, - blocksparse_local_blocks: int, - blocksparse_vert_stride: int, - blocksparse_block_size: int, - blocksparse_homo_heads: bool, - dtype: torch.dtype, - seed: int, - device: str, -) -> None: - current_platform.seed_everything(seed) - torch.set_default_device(device) - # MAX_SEQ_LEN sometimes causes OOM in the reference implementation. - # As the xformers library is already tested with its own tests, we can use - # a smaller MAX_SEQ_LEN here. 
- max_len = min(MAX_SEQ_LEN, 4096) - seq_lens = random.sample(range(1, max_len), num_seqs) - cu_seq_lens = torch.cumsum(torch.tensor([0] + seq_lens), dim=0) - num_tokens = sum(seq_lens) - - scale = float(1.0 / (head_size**0.5)) - num_query_heads, num_kv_heads = num_heads - assert num_query_heads % num_kv_heads == 0 - num_queries_per_kv = num_query_heads // num_kv_heads - - qkv = torch.empty(num_tokens, - num_query_heads + 2 * num_kv_heads, - head_size, - dtype=dtype) - qkv.uniform_(-scale, scale) - query, key, value = qkv.split( - [num_query_heads, num_kv_heads, num_kv_heads], dim=1) - - bs_attn_op = LocalStridedBlockSparseAttn( - num_query_heads, - max_len, - local_blocks=blocksparse_local_blocks, - vert_stride=blocksparse_vert_stride, - block_size=blocksparse_block_size, - device=device, - dtype=dtype, - homo_head=blocksparse_homo_heads) - - output = bs_attn_op(query, - key, - value, - cu_seq_lens.to(device), - sm_scale=scale) - - if num_queries_per_kv > 1: - # Handle MQA and GQA - key = torch.repeat_interleave(key, num_queries_per_kv, dim=1) - value = torch.repeat_interleave(value, num_queries_per_kv, dim=1) - - ref_output = ref_multi_query_kv_attention( - cu_seq_lens.tolist(), - query, - key, - value, - scale, - dtype, - ) - torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) diff --git a/tests/kernels/attention/test_rocm_attention_selector.py b/tests/kernels/attention/test_rocm_attention_selector.py index 34311b9ccd76..d56d3f4638f1 100644 --- a/tests/kernels/attention/test_rocm_attention_selector.py +++ b/tests/kernels/attention/test_rocm_attention_selector.py @@ -33,8 +33,12 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): # change the attention backend to triton MLA m.setenv(STR_BACKEND_ENV_VAR, "TRITON_MLA") - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) + backend = get_attn_backend(576, + torch.bfloat16, + "auto", + 16, + False, + use_mla=True) assert (backend.get_name() == "TRITON_MLA" or backend.get_name() == "TRITON_MLA_VLLM_V1") @@ -42,15 +46,23 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): # If use_mla is true # The selected backend is triton MLA m.setenv(STR_BACKEND_ENV_VAR, None) - backend = get_attn_backend(576, torch.bfloat16, "auto", 16, False, - False, True) + backend = get_attn_backend(576, + torch.bfloat16, + "auto", + 16, + False, + use_mla=True) assert (backend.get_name() == "TRITON_MLA" or backend.get_name() == "TRITON_MLA_VLLM_V1") # change the attention backend to AITER MLA m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, - False, True) + backend = get_attn_backend(576, + torch.bfloat16, + "auto", + 1, + False, + use_mla=True) assert (backend.get_name() == "ROCM_AITER_MLA" or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1") @@ -60,7 +72,11 @@ def test_selector(monkeypatch: pytest.MonkeyPatch): # The selected backend is ROCM_AITER_MLA m.setenv(STR_BACKEND_ENV_VAR, None) m.setenv("VLLM_ROCM_USE_AITER", "1") - backend = get_attn_backend(576, torch.bfloat16, "auto", 1, False, - False, True) + backend = get_attn_backend(576, + torch.bfloat16, + "auto", + 1, + False, + use_mla=True) assert (backend.get_name() == "ROCM_AITER_MLA" or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1") diff --git a/tests/models/registry.py b/tests/models/registry.py index 5c546a6c86da..8afac32e1cf0 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -247,10 +247,6 @@ def check_available_online( "PersimmonForCausalLM": 
_HfExamplesInfo("adept/persimmon-8b-chat"), "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), - # Blocksparse attention not supported in V1 yet - "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", - trust_remote_code=True, - v0_only=True), "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning", # noqa: E501 trust_remote_code=True, v0_only=True, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 05c098a58a0d..ba20da4fd75f 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -269,7 +269,6 @@ def __init__( alibi_slopes: Optional[List[float]] = None, sliding_window: Optional[int] = None, kv_cache_dtype: str = "auto", - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py deleted file mode 100644 index e4338805f564..000000000000 --- a/vllm/attention/backends/blocksparse_attn.py +++ /dev/null @@ -1,466 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple, Type - -import torch - -from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, - AttentionLayer, - AttentionMetadata, AttentionType) -from vllm.attention.backends.utils import (CommonAttentionState, - CommonMetadataBuilder) -from vllm.attention.ops.blocksparse_attention.interface import ( - LocalStridedBlockSparseAttn, get_head_sliding_step) -from vllm.attention.ops.paged_attn import PagedAttention -from vllm.distributed import (get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) - - -@dataclass -class BlocksparseParams: - max_seqlen: int - - # Num q heads per tensor-parallel rank/partition - num_heads: int # per TP partition - # Num kv heads per tensor-parallel rank/partition - num_kv_heads: int - - # block size used for blocksparse attention. - # This is the block_size used in `local_blocks`, `vert_stride`. - block_size: int - - # Number of blocks for local attention, i.e., number of - # local attended tokens / `sparse_block_size` - local_blocks: int - - # Attend to one block per every `vert_stride` blocks. - # Controlling the sparsity - vert_stride: int - """ - If to use the same vertical stride offset for all heads, - i.e., attend to the same block of tokens on all heads. - By default, it is False, i.e., attention on the non-local - blocks depends on the `head_idx`, that is on - blocks satisfying - `(block_idx + head_idx * head_sliding_step + 1) % vert_stride == 0` - where `head_sliding_step=max(1, int(vert_stride / num_total_heads))`, - `block_idx = position_id // sparse_block_size`. - See `..ops.blocksparse_attention.utils:get_sparse_attn_mask` - for more detail. - """ - homo_head: bool = False - - # If within a group, the kv offsets that each q attends is the same or no. 
- homo_head_group: bool = False - - # Decided by homo_head and homo_head group - head_sliding_step: int = field(init=False) - - # range of q heads to for a TP rank - active_head_range: Tuple = field(init=False) - - def __post_init__(self): - assert self.block_size > 0 - assert self.local_blocks >= 0 - assert self.vert_stride >= 1 - - tp_size = get_tensor_model_parallel_world_size() - tp_rank = get_tensor_model_parallel_rank() - total_heads = tp_size * self.num_heads - total_kv_heads = tp_size * self.num_kv_heads - - if self.homo_head: - self.head_sliding_step = 0 - elif self.homo_head_group: - head_sliding_step = get_head_sliding_step(total_kv_heads, - self.vert_stride) - # negative indicates sliding along kv heads, i.e., homo q group - self.head_sliding_step = -head_sliding_step - else: - self.head_sliding_step = get_head_sliding_step( - total_heads, self.vert_stride) - - self.active_head_range = ( - tp_rank * self.num_heads, - (tp_rank + 1) * self.num_heads, - ) - - -class BlocksparseFlashAttentionBackend(AttentionBackend): - - @staticmethod - def get_name() -> str: - return "BLOCK_SPARSE_FLASH_ATTN" - - @staticmethod - def get_impl_cls() -> Type["BlocksparseFlashAttentionImpl"]: - return BlocksparseFlashAttentionImpl - - @staticmethod - def get_metadata_cls() -> Type["AttentionMetadata"]: - return BlocksparseFlashAttentionMetadata - - @staticmethod - def get_builder_cls() -> Type["BlocksparseFlashAttentionMetadataBuilder"]: - return BlocksparseFlashAttentionMetadataBuilder - - @staticmethod - def get_state_cls() -> Type["CommonAttentionState"]: - return CommonAttentionState - - @staticmethod - def get_kv_cache_shape( - num_blocks: int, - block_size: int, - num_kv_heads: int, - head_size: int, - ) -> Tuple[int, ...]: - return PagedAttention.get_kv_cache_shape(num_blocks, block_size, - num_kv_heads, head_size) - - @staticmethod - def swap_blocks( - src_kv_cache: torch.Tensor, - dst_kv_cache: torch.Tensor, - src_to_dst: Dict[int, int], - ) -> None: - PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) - - @staticmethod - def copy_blocks( - kv_caches: List[torch.Tensor], - src_to_dists: Dict[int, List[int]], - ) -> None: - PagedAttention.copy_blocks(kv_caches, src_to_dists) - - -@dataclass -class BlocksparseFlashAttentionMetadata(AttentionMetadata): - """A copy of Metadata for FlashAttentionBackend, - to avoid having to install flash_attn. - - NOTE: Any python object stored here is not updated when it is - cuda-graph replayed. If you have values that need to be changed - dynamically, it should be stored in tensor. The tensor has to be - updated from `CUDAGraphRunner.forward` API. - """ - # (batch_size,). The sequence length per sequence. Sequence length means - # the computed tokens + new tokens None if it is a decoding. - seq_lens: Optional[List[int]] - # seq_lens stored as a tensor. - seq_lens_tensor: Optional[torch.Tensor] - - # NOTE(sang): Definition of context_len, query_len, and seq_len. - # |---------- N-1 iteration --------| - # |---------------- N iteration ---------------------| - # |- tokenA -|......................|-- newTokens ---| - # |---------- context_len ----------| - # |-------------------- seq_len ----------------------| - # |-- query_len ---| - - # Maximum query length in the batch. None for decoding. - max_query_len: Optional[int] - # Maximum sequence length among prefill batch. 0 if there are decoding - # requests only. - max_prefill_seq_len: int - # Maximum sequence length among decode batch. 0 if there are prefill - # requests only. 
- max_decode_seq_len: int - # (batch_size + 1,). The cumulative subquery lengths of the sequences in - # the batch, used to index into subquery. E.g., if the subquery length - # is [4, 6], it is [0, 4, 10]. - query_start_loc: Optional[torch.Tensor] - # (batch_size + 1,). The cumulative sequence lengths of the sequences in - # the batch, used to index into sequence. E.g., if the sequence length is - # [4, 6], it is [0, 4, 10]. - seq_start_loc: Optional[torch.Tensor] - # (batch_size,) A tensor of context lengths (tokens that are computed - # so far). - context_lens_tensor: Optional[torch.Tensor] - - # (batch_size, max_blocks_per_seq). - # Block addresses per sequence. (Seq id -> list of physical block) - # E.g., [0, 1, 2] means tokens are stored in 0th, 1st, and 2nd blocks - # in the kv cache. Each block can contain up to block_size tokens. - # 2nd dimensions are padded up to max_blocks_per_seq if it is cuda-graph - # captured. - block_tables: Optional[torch.Tensor] - - # Whether or not if cuda graph is enabled. - # Cuda-graph is currently enabled for decoding only. - # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. - use_cuda_graph: bool - - # Max number of query tokens for among request in the batch. - max_decode_query_len: Optional[int] = None - - _cached_prefill_metadata: Optional[ - "BlocksparseFlashAttentionMetadata"] = None - _cached_decode_metadata: Optional[ - "BlocksparseFlashAttentionMetadata"] = None - - @property - def prefill_metadata( - self) -> Optional["BlocksparseFlashAttentionMetadata"]: - if self.num_prefills == 0: - return None - - if self._cached_prefill_metadata is not None: - return self._cached_prefill_metadata - - assert self.seq_lens is not None - assert self.seq_lens_tensor is not None - assert self.query_start_loc is not None - assert self.context_lens_tensor is not None - assert self.block_tables is not None - assert self.seq_start_loc is not None - - self._cached_prefill_metadata = BlocksparseFlashAttentionMetadata( - num_prefills=self.num_prefills, - num_prefill_tokens=self.num_prefill_tokens, - num_decode_tokens=0, - slot_mapping=self.slot_mapping[:self.num_prefill_tokens], - multi_modal_placeholder_index_maps=self. 
- multi_modal_placeholder_index_maps, - enable_kv_scales_calculation=self.enable_kv_scales_calculation, - seq_lens=self.seq_lens[:self.num_prefills], - seq_lens_tensor=self.seq_lens_tensor[:self.num_prefills], - max_query_len=self.max_query_len, - max_prefill_seq_len=self.max_prefill_seq_len, - max_decode_seq_len=0, - query_start_loc=self.query_start_loc[:self.num_prefills + 1], - seq_start_loc=self.seq_start_loc[:self.num_prefills + 1], - context_lens_tensor=self.context_lens_tensor[:self.num_prefills], - block_tables=self.block_tables[:self.num_prefills], - use_cuda_graph=False, - ) - return self._cached_prefill_metadata - - @property - def decode_metadata(self) -> Optional["BlocksparseFlashAttentionMetadata"]: - if self.num_decode_tokens == 0: - return None - - if self._cached_decode_metadata is not None: - return self._cached_decode_metadata - assert self.block_tables is not None - assert self.seq_lens_tensor is not None - - self._cached_decode_metadata = BlocksparseFlashAttentionMetadata( - num_prefills=0, - num_prefill_tokens=0, - num_decode_tokens=self.num_decode_tokens, - slot_mapping=self.slot_mapping[self.num_prefill_tokens:], - multi_modal_placeholder_index_maps=None, - enable_kv_scales_calculation=False, - seq_lens=None, - seq_lens_tensor=self.seq_lens_tensor[self.num_prefills:], - max_query_len=None, - max_prefill_seq_len=0, - max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=None, - seq_start_loc=None, - context_lens_tensor=None, - block_tables=self.block_tables[self.num_prefills:], - use_cuda_graph=self.use_cuda_graph, - ) - return self._cached_decode_metadata - - -class BlocksparseFlashAttentionMetadataBuilder( - CommonMetadataBuilder[BlocksparseFlashAttentionMetadata]): - - _metadata_cls = BlocksparseFlashAttentionMetadata - - -class BlocksparseFlashAttentionImpl(AttentionImpl): - """ - If the input tensors contain prompt tokens, the layout is as follows: - |<--------------- num_prompt_tokens -------------->| - |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->| - - Otherwise, the layout is as follows: - |<------------------ num_generation_tokens (M) ----------------->| - |<--generation_0-->|..........|<--generation_M-1-->|<--padding-->| - - Generation tokens can contain padding when cuda-graph is used. - Currently, prompt tokens don't contain any padding. - - The prompts might have different lengths, while the generation tokens - always have length 1. 
- - """ - - def __init__( - self, - num_heads: int, - head_size: int, - scale: float, - num_kv_heads: int, - alibi_slopes: Optional[List[float]], - sliding_window: Optional[int], - kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, - logits_soft_cap: Optional[float] = None, - attn_type: str = AttentionType.DECODER, - kv_sharing_target_layer_name: Optional[str] = None, - ) -> None: - if kv_sharing_target_layer_name is not None: - raise NotImplementedError("KV sharing is not supported in V0 " - "BLOCK_SPARSE_FLASH_ATTN Backend.") - assert blocksparse_params is not None - assert alibi_slopes is None, ValueError( - "Alibi not support for blocksparse flash attention.") - assert sliding_window is None, ValueError( - "sliding_window is invalid for blocksparse attention.") - assert logits_soft_cap is None, ValueError( - "logits_soft_cap is invalid for blocksparse attention.") - - if "num_heads" not in blocksparse_params: - blocksparse_params["num_heads"] = num_heads - if "num_kv_heads" not in blocksparse_params: - blocksparse_params["num_kv_heads"] = num_kv_heads or num_heads - self.blocksparse_params = BlocksparseParams(**blocksparse_params) - self.kv_cache_dtype = kv_cache_dtype - - self.num_heads = num_heads - self.head_size = head_size - self.scale = float(scale) - self.alibi_slopes = alibi_slopes - self.num_kv_heads = num_kv_heads - - self.num_queries_per_kv = self.num_heads // self.num_kv_heads - - self.local_blocks = self.blocksparse_params.local_blocks - self.vert_stride = self.blocksparse_params.vert_stride - self.sparse_block_size = self.blocksparse_params.block_size - self.head_sliding_step = self.blocksparse_params.head_sliding_step - - supported_head_sizes = PagedAttention.get_supported_head_sizes() - if head_size not in supported_head_sizes: - raise ValueError( - f"Head size {head_size} is not supported by PagedAttention. " - f"Supported head sizes are: {supported_head_sizes}.") - - self.tp_size = get_tensor_model_parallel_world_size() - self.tp_rank = get_tensor_model_parallel_rank() - - total_num_heads = num_heads * self.tp_size - self.bs_attn = LocalStridedBlockSparseAttn( - total_num_heads, - self.blocksparse_params.max_seqlen, - self.blocksparse_params.local_blocks, - self.blocksparse_params.vert_stride, - self.blocksparse_params.block_size, - homo_head=self.blocksparse_params.homo_head, - active_head_range=self.blocksparse_params.active_head_range, - ) - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "BlocksparseFlashAttentionImpl") - - def forward( - self, - layer: AttentionLayer, - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - kv_cache: torch.Tensor, - attn_metadata: BlocksparseFlashAttentionMetadata, - output: Optional[torch.Tensor] = None, - output_scale: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """Forward pass with FlashAttention and PagedAttention. - - Args: - query: shape = [num_tokens, num_heads * head_size] - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] - kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] - NOTE: kv_cache will be an empty tensor with shape [0] - for profiling run. - attn_metadata: Metadata for attention. 
- Returns: - shape = [num_tokens, num_heads * head_size] - """ - if output_scale is not None: - raise NotImplementedError( - "fused output quantization is not yet supported" - " for BlocksparseFlashAttentionImpl") - - num_tokens, hidden_size = query.shape - # Reshape the query, key, and value tensors. - query = query.view(-1, self.num_heads, self.head_size) - key = key.view(-1, self.num_kv_heads, self.head_size) - value = value.view(-1, self.num_kv_heads, self.head_size) - - if kv_cache.numel() > 0: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_kv_heads, self.head_size) - - # Reshape the input keys and values and store them in the cache. - # If kv_cache is not provided, the new key and value tensors are - # not cached. This happens during the initial memory profiling run. - - PagedAttention.write_to_paged_cache( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - - if prefill_meta := attn_metadata.prefill_metadata: - - # Prompt run. - # normal attention - # When block_tables are not filled, it means q and k are the - # prompt, and they have the same length. - - assert kv_cache.numel() == 0 \ - or prefill_meta.block_tables is None \ - or prefill_meta.block_tables.numel() == 0, \ - "Does not support prefix-enabled attention." - - output = self.bs_attn( - q=query, - k=key, - v=value, - cu_seqlens_q=prefill_meta.seq_start_loc, - cu_seqlens_k=prefill_meta.seq_start_loc, - sm_scale=self.scale, - ) - - if decode_meta := attn_metadata.decode_metadata: - # Decoding run. - output = PagedAttention.forward_decode( - query, - key_cache, - value_cache, - decode_meta.block_tables, - decode_meta.seq_lens_tensor, - self.blocksparse_params.max_seqlen, - self.kv_cache_dtype, - self.num_kv_heads, - self.scale, - self.alibi_slopes, - layer._k_scale, - layer._v_scale, - tp_rank=self.tp_rank, - blocksparse_local_blocks=self.local_blocks, - blocksparse_vert_stride=self.vert_stride, - blocksparse_block_size=self.sparse_block_size, - blocksparse_head_sliding_step=self.head_sliding_step, - ) - - assert output is not None - # Reshape the output tensor. 
- return output.view(num_tokens, hidden_size) diff --git a/vllm/attention/backends/differential_flash_attn.py b/vllm/attention/backends/differential_flash_attn.py index 1c139952371a..bd9bc427728d 100644 --- a/vllm/attention/backends/differential_flash_attn.py +++ b/vllm/attention/backends/differential_flash_attn.py @@ -667,7 +667,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, @@ -680,9 +679,6 @@ def __init__( differential_flash_attention_config self.used_shared_kv_cache = kv_sharing_target_layer_name is not None self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - if blocksparse_params is not None: - raise ValueError( - "FlashAttention does not support block-sparse attention.") if use_irope: logger.warning( "Using irope in V0 is not supported yet, it will fall back " diff --git a/vllm/attention/backends/dual_chunk_flash_attn.py b/vllm/attention/backends/dual_chunk_flash_attn.py index 40557a4e8f8f..e108646e7ffb 100644 --- a/vllm/attention/backends/dual_chunk_flash_attn.py +++ b/vllm/attention/backends/dual_chunk_flash_attn.py @@ -287,7 +287,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 20e67eb9b401..ee36fd19e012 100755 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -4,7 +4,7 @@ from collections import defaultdict from dataclasses import dataclass from itertools import accumulate -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Type import torch @@ -615,7 +615,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, @@ -624,9 +623,6 @@ def __init__( if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported in V0 " "FLASH_ATTN backend.") - if blocksparse_params is not None: - raise ValueError( - "FlashAttention does not support block-sparse attention.") if use_irope: logger.warning( "Using irope in V0 is not supported yet, it will fall back " diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 1f913ad89523..56d3da699f40 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -999,7 +999,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, diff --git a/vllm/attention/backends/flashmla.py b/vllm/attention/backends/flashmla.py index e185d0260d0a..a242ac9bbe0b 100644 --- a/vllm/attention/backends/flashmla.py +++ b/vllm/attention/backends/flashmla.py @@ -3,7 +3,7 @@ 
from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, List, Optional, Tuple, Type import torch @@ -181,7 +181,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str] = None, @@ -189,20 +188,17 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "FlashMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/attention/backends/mla/common.py b/vllm/attention/backends/mla/common.py index 0c3ff26d04c8..52c4a9e7da3d 100644 --- a/vllm/attention/backends/mla/common.py +++ b/vllm/attention/backends/mla/common.py @@ -997,7 +997,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], diff --git a/vllm/attention/backends/rocm_aiter_mla.py b/vllm/attention/backends/rocm_aiter_mla.py index 1edf34351db3..a165a786d63d 100644 --- a/vllm/attention/backends/rocm_aiter_mla.py +++ b/vllm/attention/backends/rocm_aiter_mla.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional, Type, Union +from typing import TYPE_CHECKING, Optional, Type, Union import torch @@ -367,7 +367,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -375,17 +374,14 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "Aiter MLA does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") from aiter import flash_attn_varlen_func self.flash_attn_varlen_func = flash_attn_varlen_func diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 4653d5267e19..1ee1dea729d9 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ 
b/vllm/attention/backends/rocm_flash_attn.py @@ -4,7 +4,7 @@ import itertools from dataclasses import dataclass from functools import cache -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, List, Optional, Tuple, Type import torch @@ -494,7 +494,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, @@ -507,9 +506,6 @@ def __init__( logger.warning_once( "Using irope in ROCm Flash Attention is not supported yet, it " "will fail back to global attention for long context.") - if blocksparse_params is not None: - raise ValueError( - "ROCmFlashAttention does not support blocksparse attention.") if use_irope: logger.warning( "Using irope in V0 is not supported yet, it will fall back " diff --git a/vllm/attention/backends/triton_mla.py b/vllm/attention/backends/triton_mla.py index e06f7d54e342..fba5b5f6bca8 100644 --- a/vllm/attention/backends/triton_mla.py +++ b/vllm/attention/backends/triton_mla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Dict, List, Optional, Type +from typing import List, Optional, Type import torch @@ -35,7 +35,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -43,17 +42,14 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "TritonMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 3ef79bb62120..0bc38b414290 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with xFormers and PagedAttention.""" from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Type +from typing import Dict, List, Optional, Tuple, Type import torch from xformers import ops as xops @@ -387,7 +387,6 @@ def __init__( alibi_slopes: Optional[List[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, @@ -396,9 +395,6 @@ def __init__( if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported in V0 " "XFORMERS backend.") - if blocksparse_params is not None: - raise 
ValueError( - "XFormers does not support block-sparse attention.") if logits_soft_cap is not None: logger.warning_once("XFormers does not support logits soft cap. " "Outputs may be slightly off.") diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index d0677525d310..5d8ffb8e82d3 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer.""" -from typing import Any, Dict, List, Optional +from typing import List, Optional import torch import torch.nn as nn @@ -74,7 +74,6 @@ def __init__( alibi_slopes: Optional[List[float]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, per_layer_sliding_window: Optional[int] = None, use_mla: bool = False, @@ -163,12 +162,11 @@ def __init__( kv_cache_dtype, block_size, is_attention_free, - blocksparse_params is not None, use_mla=use_mla) impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **extra_impl_args) self.backend = backend_name_to_enum(attn_backend.get_name()) self.dtype = dtype diff --git a/vllm/attention/ops/blocksparse_attention/__init__.py b/vllm/attention/ops/blocksparse_attention/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py b/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py deleted file mode 100644 index 05fa9d11f228..000000000000 --- a/vllm/attention/ops/blocksparse_attention/blocksparse_attention_kernel.py +++ /dev/null @@ -1,433 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -from vllm.triton_utils import tl, triton - - -def blocksparse_flash_attn_varlen_fwd( - q, - k, - v, # (#tokens, n_heads, head_size) - cu_seqlens_k, - cu_seqlens_q, - sm_scale, - sparse_layout, - *, - block_size=64, - q_block_size=None, - max_seqlen=None): - # split q to blocks - - assert isinstance(sparse_layout, (list, tuple)) - - _, n_heads, head_size = q.shape - batch_size = cu_seqlens_k.size(0) - 1 - q_block_size = q_block_size or block_size - - assert q.dim() == k.dim() == v.dim() == 3 - assert q.size(1) % k.size(1) == 0 - assert q.size(2) == k.size(2) - # TODO(linxihui): allow k, v to have different head_size - assert k.shape == v.shape - assert cu_seqlens_k.dim() == 1 - - q_k_ratio = q.size(1) // k.size(1) - - if cu_seqlens_q is None: - if q.size(0) == batch_size: # decoding only - cu_seqlens_q = torch.arange( - 0, - batch_size + 1, - dtype=cu_seqlens_k.dtype, - device=cu_seqlens_k.device, - ) - elif q.size(0) == k.size(0): - cu_seqlens_q = cu_seqlens_k - else: - raise ValueError("cu_seqlens_q must be specified\ - if it mix of prefilling and decoding.") - else: - assert cu_seqlens_k.size(0) == cu_seqlens_q.size(0) - - # switch to use cpu to avoid too many kernel launches when iterated over - q_lens = (cu_seqlens_q[1:] - cu_seqlens_q[:-1]).cpu() - k_lens = (cu_seqlens_k[1:] - cu_seqlens_k[:-1]).cpu() - - assert torch.logical_or(q_lens == 1, k_lens == q_lens).all(), ( - "length of q should either be 1 (decoding) or same as k 
(prefilling).") - - if max_seqlen: - assert k_lens.max() <= max_seqlen - - n_blocks = (q_lens + q_block_size - 1) // q_block_size - - q_batch_ids = torch.tensor( - [i for i, n in enumerate(n_blocks) for _ in range(n)], - dtype=cu_seqlens_q.dtype, - device=cu_seqlens_q.device, - ) - q_start_sids = torch.tensor( - [i * q_block_size for n in n_blocks for i in range(n)], - dtype=cu_seqlens_q.dtype, - device=cu_seqlens_q.device, - ) - - out = q.new_empty(q.shape) - cu_seqlens_q = cu_seqlens_q.contiguous() - cu_seqlens_k = cu_seqlens_k.contiguous() - - layout_crow_indices, layout_col_indices = sparse_layout - block_d = triton.next_power_of_2(head_size) - - decoding_only = (q_lens == 1).all().item() - grid = (len(q_start_sids), n_heads, 1) - - _fwd_kernel_batch_inference[grid]( - q, - k, - v, - out, - sm_scale, - cu_seqlens_q[:-1], - cu_seqlens_q[1:], - cu_seqlens_k[:-1], - cu_seqlens_k[1:], - q_batch_ids, - q_start_sids, - 0, - *q.stride(), - 0, - *k.stride(), - 0, - *v.stride(), - 0, - *out.stride(), - layout_crow_indices, - layout_col_indices, - *layout_crow_indices.stride(), - *layout_col_indices.stride(), - q_k_ratio, - HAS_BATCH_DIM=False, - D_HEAD=head_size, - BLOCK_M=q_block_size, - BLOCK_N=block_size, - BLOCK_D=block_d, - BLOCK_M_LOADING=(16 if decoding_only else - q_block_size), # smaller for decoding - EVEN_D=block_d == head_size, - num_warps=1 if decoding_only else 4, - num_stages=3) - - return out - - -@triton.jit -def _fwd_kernel_inner( - acc, - l_i, - m_i, - q, - Q, - k_block_col_idx, - layout_col_ptr, - layout_col_stride_h, - layout_col_stride_m, - k_ptrs, - v_ptrs, - off_h, - offs_m, - offs_n, - offs_d, - stride_kt, - stride_vt, - sm_scale, - k_seqlen, - past_len, - LAST_K_BLOCK: tl.constexpr, - BLOCK_M_LOADING: tl.constexpr, - BLOCK_N: tl.constexpr, - D_HEAD: tl.constexpr, - EVEN_D: tl.constexpr, - M_LT_N: tl.constexpr, -): - k_block_id = tl.load(layout_col_ptr + off_h * layout_col_stride_h + - k_block_col_idx * layout_col_stride_m).to(tl.int32) - start_n = k_block_id * BLOCK_N - if LAST_K_BLOCK: - if EVEN_D: - k = tl.load( - k_ptrs + start_n * stride_kt, - mask=offs_n[None, :] + start_n < k_seqlen, - other=0.0, - ) - else: - k = tl.load( - k_ptrs + start_n * stride_kt, - mask=(offs_n[None, :] + start_n < k_seqlen) & - (offs_d[:, None] < D_HEAD), - other=0.0, - ) - else: - if EVEN_D: - k = tl.load(k_ptrs + start_n * stride_kt) - else: - k = tl.load(k_ptrs + start_n * stride_kt, - mask=offs_d[:, None] < D_HEAD, - other=0.0) - - qk = tl.zeros([BLOCK_M_LOADING, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k) - qk *= sm_scale - - # the following is needed only when LAST_K_BLOCK or BLOCK_M < BLOCK_N - if LAST_K_BLOCK | M_LT_N: - qk += tl.where( - offs_m[:, None] + past_len >= (start_n + offs_n[None, :]), - 0, - float("-inf"), - ) - - # flash-attn2 - m_ij = tl.maximum(m_i, tl.max(qk, 1)) - p = tl.math.exp2(qk - m_ij[:, None]) - l_ij = tl.sum(p, 1) - alpha = tl.math.exp2(m_i - m_ij) - acc = acc * alpha[:, None] - # update m_i - m_i = m_ij - l_i = l_i * alpha + l_ij - - p = p.to(Q.dtype.element_ty) - # update acc - if LAST_K_BLOCK: - if EVEN_D: - v = tl.load( - v_ptrs + start_n * stride_vt, - mask=offs_n[:, None] + start_n < k_seqlen, - other=0.0, - ) - else: - v = tl.load( - v_ptrs + start_n * stride_vt, - mask=(offs_n[:, None] + start_n < k_seqlen) & - (offs_d[None, :] < D_HEAD), - other=0.0, - ) - else: - if EVEN_D: - v = tl.load(v_ptrs + start_n * stride_vt) - else: - v = tl.load(v_ptrs + start_n * stride_vt, - mask=offs_d[None, :] < D_HEAD, - other=0.0) - - acc += tl.dot(p, v) - - 
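# A reference PyTorch sketch of the flash-attention-2 online-softmax update
# performed above for one key/value block. Base-2 exponentials are used, so
# the scale is assumed to already carry the 1/log(2) factor applied by the
# caller; qk is a hypothetical [M, N] scaled score block and v a [N, D]
# value block. The caller later divides acc by l_i to finish the softmax.
import torch

def online_softmax_step(acc, l_i, m_i, qk, v):
    m_ij = torch.maximum(m_i, qk.amax(dim=1))   # new running row maxima
    p = torch.exp2(qk - m_ij[:, None])          # probabilities re-based on m_ij
    alpha = torch.exp2(m_i - m_ij)              # rescale factor for the old state
    acc = acc * alpha[:, None] + p @ v          # rescale accumulator, add this block
    l_i = l_i * alpha + p.sum(dim=1)            # rescale running denominator
    return acc, l_i, m_ij                       # m_ij becomes the new m_i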
return acc, l_i, m_i - - -@triton.heuristics({ - "M_LT_N": - lambda kwargs: kwargs["BLOCK_M"] < kwargs["BLOCK_N"], -}) -@triton.jit -def _fwd_kernel_batch_inference( - Q, - K, - V, - Out, - sm_scale, - q_batch_starts, - q_batch_ends, - k_batch_starts, - k_batch_ends, - q_batch_ids, - q_start_sids, - stride_qb, - stride_qt, - stride_qh, - stride_qd, - stride_kb, - stride_kt, - stride_kh, - stride_kd, - stride_vb, - stride_vt, - stride_vh, - stride_vd, - stride_ob, - stride_ot, - stride_oh, - stride_od, - layout_crow_ptr, - layout_col_ptr, - layout_crow_stride_h, - layout_crow_stride_m, - layout_col_stride_h, - layout_col_stride_m, - q_k_ratio, - HAS_BATCH_DIM: tl.constexpr, - D_HEAD: tl.constexpr, - BLOCK_M: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_D: tl.constexpr, - BLOCK_M_LOADING: tl.constexpr, - EVEN_D: tl.constexpr, - M_LT_N: tl.constexpr, -): - """ - NOTATION: - pid: position id - sid: storage id - sbid: storage block id - pbid: position block id - offs_m, offs_n: storage offsets of m-dim(q, row) and n-dim(k, col) - - TODO(linxihui): - Optimize grouped-attn - """ - off_zm = tl.program_id(0) - off_h = tl.program_id(1) - - off_h_for_kv = off_h // q_k_ratio - - if HAS_BATCH_DIM: - off_z = tl.program_id(2) - Q += off_z * stride_qb - K += off_z * stride_kb - V += off_z * stride_vb - Out += off_z * stride_ob - start_m = off_zm - q_start_sid = start_m * BLOCK_M # always 0 for decoding - else: - off_z = tl.load(q_batch_ids + off_zm).to(tl.int32) # [0, 0, 0, 1] - q_start_sid = tl.load(q_start_sids + off_zm) - start_m = q_start_sid // BLOCK_M # q_sbid - - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M_LOADING) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_D) - - q_cu_start = tl.load(q_batch_starts + off_z).to(tl.int32) - q_seqlen = tl.load(q_batch_ends + off_z).to(tl.int32) - q_cu_start - k_cu_start = tl.load(k_batch_starts + off_z).to(tl.int32) - k_seqlen = tl.load(k_batch_ends + off_z).to(tl.int32) - k_cu_start - past_len = k_seqlen - q_seqlen - - Q += q_cu_start * stride_qt + off_h * stride_qh - K += k_cu_start * stride_kt + off_h_for_kv * stride_kh - V += k_cu_start * stride_vt + off_h_for_kv * stride_vh - Out += q_cu_start * stride_ot + off_h * stride_oh - - q_pbid = (past_len + q_start_sid) // BLOCK_M - - if EVEN_D: - q = tl.load( - Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, - mask=offs_m[:, None] < q_seqlen, - other=0.0, - ) - else: - q = tl.load( - Q + offs_m[:, None] * stride_qt + offs_d[None, :] * stride_qd, - mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD), - other=0.0, - ) - - sparse_crow_ptr = (layout_crow_ptr + off_h * layout_crow_stride_h + - q_pbid * layout_crow_stride_m) - - # TODO(linxihui): load at once, with any Triton version - # that supports `tl.split`, e.g., Triton 3.0 - k_block_start = tl.load(sparse_crow_ptr).to(tl.int32) - k_block_end = tl.load(sparse_crow_ptr + 1).to(tl.int32) - - m_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - float("inf") - l_i = tl.zeros([BLOCK_M_LOADING], dtype=tl.float32) - acc = tl.zeros([BLOCK_M_LOADING, BLOCK_D], dtype=tl.float32) - - k_ptrs = K + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd - v_ptrs = V + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd - - sm_scale *= ( - 1.44269504 # 1/log2 as we use base2 for exponential and logarithm - ) - - for k_block_col_idx in range(k_block_start, k_block_end - 1): - acc, l_i, m_i = _fwd_kernel_inner( - acc, - l_i, - m_i, - q, - Q, - k_block_col_idx, - layout_col_ptr, - layout_col_stride_h, - layout_col_stride_m, 
- k_ptrs, - v_ptrs, - off_h, - offs_m, - offs_n, - offs_d, - stride_kt, - stride_vt, - sm_scale, - k_seqlen, - past_len, - False, - BLOCK_M_LOADING, - BLOCK_N, - D_HEAD, - EVEN_D, - M_LT_N, - ) - - acc, l_i, m_i = _fwd_kernel_inner( - acc, - l_i, - m_i, - q, - Q, - k_block_end - 1, - layout_col_ptr, - layout_col_stride_h, - layout_col_stride_m, - k_ptrs, - v_ptrs, - off_h, - offs_m, - offs_n, - offs_d, - stride_kt, - stride_vt, - sm_scale, - k_seqlen, - past_len, - True, - BLOCK_M_LOADING, - BLOCK_N, - D_HEAD, - EVEN_D, - M_LT_N, - ) - - # flash-attn 2 - m_i += tl.math.log2(l_i) - acc = acc / l_i[:, None] - - # write output - if EVEN_D: - tl.store( - Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, - acc, - mask=offs_m[:, None] < q_seqlen, - ) - else: - tl.store( - Out + offs_m[:, None] * stride_ot + offs_d[None, :] * stride_od, - acc, - mask=(offs_m[:, None] < q_seqlen) & (offs_d[None, :] < D_HEAD), - ) diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py deleted file mode 100644 index c6f6cc29793f..000000000000 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ /dev/null @@ -1,239 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math - -import torch - -from vllm.platforms import current_platform - -from .utils import (dense_to_crow_col, get_head_sliding_step, - get_sparse_attn_mask) - -IS_COMPUTE_8_OR_ABOVE = current_platform.has_device_capability(80) - -if IS_COMPUTE_8_OR_ABOVE: - from .blocksparse_attention_kernel import blocksparse_flash_attn_varlen_fwd - - -class LocalStridedBlockSparseAttn(torch.nn.Module): - - def __init__( - self, - n_heads, - max_seqlen, - local_blocks, - vert_stride, - block_size, - device=None, - dtype=None, - homo_head=False, - active_head_range=None, - q_block_size=None, - use_spda=None, - ): - super().__init__() - if use_spda is None: - use_spda = current_platform.is_rocm() or \ - current_platform.is_cpu() or not \ - IS_COMPUTE_8_OR_ABOVE - device = device or (torch.cuda.current_device() - if current_platform.is_cuda_alike() else "cpu") - device = torch.device(device) - # NOTE: vllm CPU backend support BF16 instead of FP16. - dtype = dtype or (torch.bfloat16 if IS_COMPUTE_8_OR_ABOVE - or device.type == "cpu" else torch.half) - - self.n_heads = n_heads - self.max_seqlen = max_seqlen - self.local_blocks = local_blocks - self.vert_stride = vert_stride - self.use_spda = use_spda - self.dtype = dtype - self.device = device - self.block_size = block_size - self.q_block_size = q_block_size - self.homo_head = homo_head - self.active_head_range = active_head_range - self.head_sliding_step = get_head_sliding_step(n_heads, vert_stride, - homo_head) - - sparse_layout, sparse_pattern, self.dense_attn_mask = ( - self.get_attn_pattern(dtype, device)) - - if q_block_size is not None and q_block_size != block_size: - if q_block_size > block_size: - assert q_block_size % block_size == 0 - blocks_to_merge = q_block_size // block_size - shape = sparse_pattern.shape - sparse_pattern = sparse_pattern.view(shape[0], -1, - blocks_to_merge, - shape[-1]) - sparse_pattern = sparse_pattern.sum(2) - sparse_layout = dense_to_crow_col(sparse_pattern) - else: - raise ValueError( - "Does not support smaller q_block_size. It will be slower." 
- ) - - self.sparse_layout = sparse_layout - - def get_attn_pattern(self, dtype, device): - sparse_layout, sparse_pattern, dense_attn_mask = get_sparse_attn_mask( - self.n_heads, - self.max_seqlen, - self.max_seqlen, - dtype, - device, - block_size=self.block_size, - local_blocks=self.local_blocks, - vert_stride=self.vert_stride, - homo_head=self.homo_head, - return_dense=self.use_spda, - dense_mask_type="bias", - ) - if (not self.homo_head) and (self.active_head_range is not None): - assert isinstance(self.active_head_range, tuple) - assert (len(self.active_head_range) == 2) - h_start, h_end = self.active_head_range - sparse_layout = tuple(x[h_start:h_end] for x in sparse_layout) - if self.use_spda: - dense_attn_mask = dense_attn_mask[h_start:h_end] - return sparse_layout, sparse_pattern, dense_attn_mask - - def varlen_attn(self, - q, - k, - v, - cu_seqlens_k, - cu_seqlens_q=None, - sm_scale=None): - """ - q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). - Support grouped attention, with `q[:, i*r:(i*r + r)]` - is correspondent to `k[:, i]`, where `r` is the q/k ratio. - cu_seqlens_k: shape=(batch_size + 1,), - indicating segment of samples, - e.g., `k[cu_seqlen[i]:cu_seqlne[i+1]]` is q of sample i - cu_seqlens_q: shape=(batch_size + 1, ). - Default None: same as cu_seqlens_k for prefilling or - [0, 1, .., batch_size] for decoding. - The only case you need to specify is when q is a mix of - prefilling and decoding. - sm_scale: softmax scale, default to 1/sqrt(head_size). - - return: tensor of shape as q. - """ - assert ( - IS_COMPUTE_8_OR_ABOVE - ), "Requires compute capability of 8 or above (Ampere or newer) to use \ - Triton kernel." - - sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1)) - - return blocksparse_flash_attn_varlen_fwd( - q, - k, - v, - cu_seqlens_k, - cu_seqlens_q, - sm_scale, - self.sparse_layout, - block_size=self.block_size, - q_block_size=self.q_block_size, - max_seqlen=self.max_seqlen, - ) - - @staticmethod - def transpose_and_pad(x, cu_seqlens, maxlen, head_repeats=1): - """ - :param x: (total_tokens, n_heads, head_size) - :return: (batch, n_heads, length, head_size) - """ - x_padded = x.new_empty( - len(cu_seqlens) - 1, x.size(1), head_repeats, maxlen, x.size(2)) - cu_seqlens = cu_seqlens.cpu() - for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])): - x_padded[i, :, :, :e - s].copy_(x[s:e].transpose(0, - 1).unsqueeze(1)) - return x_padded.flatten(1, 2) - - @staticmethod - def transpose_and_unpad(x_padded, cu_seqlens): - """ - :param x_padded: (batch, n_heads, length, head_size) - :return: (total_tokens, n_heads, head_size) - """ - cu_seqlens = cu_seqlens.cpu() - total_n_tokens = cu_seqlens[-1] - x = x_padded.new_empty(total_n_tokens, x_padded.size(1), - x_padded.size(3)) - for i, (s, e) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])): - x[s:e].copy_(x_padded[i, :, :e - s].transpose(0, 1)) - return x - - def spda(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): - """For CPU, V100 or other older GPUs. - NOTE: torch SPDA supports nested tensor, - but seems extremely slow. Choose to pad instead. - """ - assert (cu_seqlens_q is None or - (cu_seqlens_q - == cu_seqlens_k).all()), "Can only handle prompt with SPDA." - assert q.size(0) == k.size(0), "can only handle prompt with SPDA." 
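# A small sketch of the cu_seqlens_* segment pointers described for
# varlen_attn above, assuming a hypothetical packed prefill batch with
# prompt lengths [3, 5]: tokens of all samples are concatenated along dim 0
# and cu_seqlens_k marks where each sample's segment starts and ends.
import torch
seq_lens = torch.tensor([3, 5])
cu_seqlens_k = torch.zeros(seq_lens.numel() + 1, dtype=torch.int32)
cu_seqlens_k[1:] = seq_lens.cumsum(0)   # tensor([0, 3, 8], dtype=torch.int32)
# Prefill: cu_seqlens_q may be left as None (it defaults to cu_seqlens_k).
# Decode-only: q holds one token per sample, so it becomes [0, 1, ..., batch_size].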
- - assert q.size(1) % k.size(1) == 0 - q_k_ratio = q.size(1) // k.size(1) - sm_scale = sm_scale or 1.0 / math.sqrt(q.size(-1)) - cu_seqlens = cu_seqlens_k.cpu() - maxlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() - - if (self.dense_attn_mask.dtype != q.dtype - or self.dense_attn_mask.device != q.device): - _, _, self.dense_attn_mask = self.get_attn_pattern( - q.dtype, q.device) - attn_mask = self.dense_attn_mask[None, :, :maxlen, :maxlen] - - q2 = self.transpose_and_pad(q, cu_seqlens, maxlen, 1) - k2, v2 = (self.transpose_and_pad(x, cu_seqlens, maxlen, q_k_ratio) - for x in [k, v]) - spda_output = torch.nn.functional.scaled_dot_product_attention( - q2, k2, v2, attn_mask=attn_mask, scale=sm_scale) - return self.transpose_and_unpad(spda_output, cu_seqlens) - - def forward(self, q, k, v, cu_seqlens_k, cu_seqlens_q=None, sm_scale=None): - """Dispatch to `varlen_attn` (Ampere or newer) or - `self.spda`(cpu, Volta, Turing or older)based on - the type of device used and cuda compute capability. - - q, k, v: shape = (num_tokens, num_heads_q/kv, head_size). - Support grouped attention, with `q[:, i*r:(i*r + r)]` - is correspondent to `k[:, i]`, where `r` is the q/k ratio. - cu_seqlens_k: shape=(batch_size + 1,), indicating segment of samples, - e.g., `k[cu_seqlen[i]:cu_seqlne[i+1]]` is q of sample i - cu_seqlens_q: shape=(batch_size + 1, ). - Default None: same as cu_seqlens_k for prefilling or - [0, 1, .., batch_size] for decoding. - The only case you need to specify - is when q is a mix of prefilling - and decoding. - sm_scale: softmax scale, default to 1/sqrt(head_size). - - return: tensor of shape as q. - """ - assert k.dim() == 3 - if self.use_spda: - return self.spda( - q, - k, - v, - cu_seqlens_k, - cu_seqlens_q=cu_seqlens_q, - sm_scale=sm_scale, - ) - return self.varlen_attn(q, - k, - v, - cu_seqlens_k, - cu_seqlens_q=cu_seqlens_q, - sm_scale=sm_scale) diff --git a/vllm/attention/ops/blocksparse_attention/utils.py b/vllm/attention/ops/blocksparse_attention/utils.py deleted file mode 100644 index 445720c709c4..000000000000 --- a/vllm/attention/ops/blocksparse_attention/utils.py +++ /dev/null @@ -1,246 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -# Helper functions for 3D sparse pattern -# These function are not optimized and very inefficient. -# Avoid calling them too frequent or use a cache mechanism. - -from functools import lru_cache - -import numpy as np -import torch - -from vllm.triton_utils import triton - - -class csr_matrix: - """Simple implementation of CSR matrix conversion without scipy. - This replaced scipy.sparse.csr_matrix() previously used.""" - - def __init__(self, input_array): - if not isinstance(input_array, np.ndarray): - raise ValueError("Input must be a NumPy array") - - self.shape = input_array.shape - rows, cols = self.shape - data = [] - indices = [] - indptr = [0] - - for i in range(rows): - for j in range(cols): - if input_array[i, j]: - data.append(input_array[i, j]) - indices.append(j) - indptr.append(len(indices)) - - self.data = np.array(data) - self.indices = np.array(indices) - self.indptr = np.array(indptr) - - -def dense_to_crow_col(x: torch.Tensor): - """Turning a 2D/3D torch tensor (x) to CSR rows/cols indexing. 
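# A compact sketch, with hypothetical sizes, of the block-sparse layout these
# helpers operate on: a causal block mask restricted to the last `local_blocks`
# block diagonals plus every `vert_stride`-th key block, converted to the
# CSR-style (crow, col) indexing consumed by the Triton kernel.
import torch
num_blocks, local_blocks, vert_stride = 6, 2, 3
q_pos = torch.arange(num_blocks)[:, None]
k_pos = torch.arange(num_blocks)[None, :]
vert = (torch.arange(num_blocks) + 1) % vert_stride == 0
block_mask = (q_pos >= k_pos) & ((q_pos - k_pos < local_blocks) | vert)
# CSR conversion: row pointers count nonzeros per row, cols list their indices.
crow = torch.zeros(num_blocks + 1, dtype=torch.long)
crow[1:] = block_mask.sum(dim=1).cumsum(0)
col = block_mask.nonzero(as_tuple=False)[:, 1]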
- NOTE: col_indices padded -1 - """ - device = x.device - pad = -1 - dim = x.dim() - assert x.dim() in (2, 3) - if x.dim() == 2: - x = x[None] - x = [csr_matrix(xi.bool().cpu().numpy()) for xi in x] - crows = torch.vstack([torch.from_numpy(xi.indptr) for xi in x]) - cols = [torch.from_numpy(xi.indices) for xi in x] - max_cols = max(len(xi) for xi in cols) - cols = [ - torch.cat([xi, pad + xi.new_zeros(max_cols - xi.shape[0])]) - for xi in cols - ] - cols = torch.vstack(cols) - if dim == 2: - crows = crows[0] - cols = cols[0] - return crows.to(device), cols.to(device) - - -def crow_col_to_dense(crows: torch.Tensor, - cols: torch.Tensor, - dtype: torch.dtype = torch.float16): - dim = crows.dim() - if dim == 1: - crows = crows[None] - cols = cols[None] - device = crows.device - crows, cols = crows.cpu(), cols.cpu() # faster in cpu - shape = (crows.shape[0], crows.shape[1] - 1, cols.max() + 1) - x = torch.zeros(shape, dtype=dtype) - for i in range(shape[0]): - for j in range(shape[1]): - x[i, j, cols[i, crows[i, j]:crows[i, j + 1]]] = 1 - if dim == 1: - x = x[0] - return x.to(device) - - -def dense_to_ccol_row(x: torch.Tensor): - """Similar, but to CSC format""" - x = x.transpose(-2, -1) - return dense_to_crow_col(x) - - -def ccol_row_to_dense(ccol: torch.Tensor, - rows: torch.Tensor, - dtype: torch.dtype = torch.float16): - return crow_col_to_dense(ccol, rows, dtype).permute(0, 2, 1).contiguous() - - -def _get_sparse_attn_mask_homo_head( - q_len: int, - max_seqlen: int, - dtype: torch.dtype, - device: torch.device, - block_size: int = 128, - local_blocks: int = 4, - vert_stride: int = 4, - return_dense: bool = False, -): - """ - :return: a tuple of 3: - - tuple of crow_indices, col_indices representation - of CSR format. - - block dense mask - - all token dense mask (be aware that it can be - OOM if it is too big) if `return_dense==True`, - otherwise, None - """ - with torch.no_grad(): - num_blocks = triton.cdiv(max_seqlen, block_size) - q_pos = torch.arange(num_blocks)[:, None] - k_pos = torch.arange(num_blocks)[None] - mask_vert_strided = (torch.arange(num_blocks) + 1) % vert_stride == 0 - block_mask_dense = (((q_pos >= k_pos) - & ((q_pos - k_pos < local_blocks) - | mask_vert_strided)).to(device).to(dtype)) - num_blocks_q = triton.cdiv(q_len, block_size) - block_mask_dense_output = (dense_to_crow_col( - block_mask_dense[-num_blocks_q:].contiguous())) - if return_dense: - mask_dense = torch.kron( - block_mask_dense, - block_mask_dense.new_ones((block_size, block_size)), - ) - causal_mask = torch.tril(torch.ones( - max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:] - mask_dense = mask_dense[-q_len:, :max_seqlen] * causal_mask - return ( - block_mask_dense_output, - block_mask_dense, - mask_dense, - ) - else: - return ( - block_mask_dense_output, - block_mask_dense, - None, - ) - - -def binary_mask_to_bias(mask_dense: torch.Tensor): - mask_dense = 1 - mask_dense - mask_dense.masked_fill_(mask_dense.bool(), -torch.inf) - return mask_dense - - -def get_head_sliding_step(n_heads: int, - vert_stride: int, - homo_head: bool = False): - if homo_head: - return 0 - return max(1, int(vert_stride / n_heads)) - - -@lru_cache -def get_sparse_attn_mask( - n_heads: int, - q_len: int, - max_seqlen: int, - dtype: torch.dtype, - device: torch.device, - block_size: int = 64, - local_blocks: int = 4, - vert_stride: int = 4, - homo_head: bool = True, - return_dense: bool = False, - dense_mask_type: str = "binary", -): - """ - :param dense_mask_type: "binary" (0 for skip token, 1 for others) - or "bias" (-inf 
for skip token, 0 or others) - :return: a tuple of 3: - - tuple of crow_indices, col_indices representation - of CSR format. - - block dense mask - - all token dense mask (be aware that it can be OOM if it - is too big) if `return_dense==True`, otherwise, None - """ - assert dense_mask_type in ("binary", "bias") - if homo_head: - with torch.no_grad(): - (crow, col), block_mask_dense, mask_dense = ( - _get_sparse_attn_mask_homo_head( - q_len, - max_seqlen, - dtype, - device, - block_size, - local_blocks, - vert_stride, - return_dense, - )) - crow = crow[None].expand(n_heads, crow.shape[0]) - col = col[None].expand(n_heads, col.shape[0]) - if return_dense: - mask_dense = mask_dense[None].expand(n_heads, - *mask_dense.shape) - if dense_mask_type == "bias": - mask_dense = binary_mask_to_bias(mask_dense) - return (crow, col), block_mask_dense, mask_dense - - with torch.no_grad(): - num_blocks = triton.cdiv(max_seqlen, block_size) - q_pos = torch.arange(num_blocks)[None, :, None] - k_pos = torch.arange(num_blocks)[None, None] - head_sliding_step = get_head_sliding_step(n_heads, vert_stride) - mask_vert_strided = [ - (torch.arange(num_blocks) + h * head_sliding_step + 1) % - vert_stride == 0 for h in range(n_heads) - ] - mask_vert_strided = torch.vstack(mask_vert_strided).unsqueeze(1) - block_mask_dense = (((q_pos >= k_pos) - & ((q_pos - k_pos < local_blocks) - | mask_vert_strided)).to(device).to(dtype)) - num_blocks_q = triton.cdiv(q_len, block_size) - block_mask_dense_output = block_mask_dense[:, -num_blocks_q:] - if return_dense: - mask_dense = torch.kron( - block_mask_dense, - block_mask_dense.new_ones((block_size, block_size)), - ) - causal_mask = torch.tril(torch.ones( - max_seqlen, max_seqlen)).type_as(mask_dense)[-q_len:] - mask_dense = mask_dense[..., -q_len:, :max_seqlen] * causal_mask[None] - if dense_mask_type == "bias": - mask_dense = binary_mask_to_bias(mask_dense) - - return ( - dense_to_crow_col(block_mask_dense_output), - block_mask_dense, - mask_dense, - ) - else: - return ( - dense_to_crow_col(block_mask_dense_output), - block_mask_dense, - None, - ) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 4d4886d02b78..2e3c8638125f 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -143,7 +143,6 @@ def get_attn_backend( kv_cache_dtype: Optional[str], block_size: int, is_attention_free: bool, - is_blocksparse: bool = False, use_mla: bool = False, ) -> type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" @@ -157,7 +156,6 @@ def get_attn_backend( kv_cache_dtype=kv_cache_dtype, block_size=block_size, is_attention_free=is_attention_free, - is_blocksparse=is_blocksparse, use_v1=envs.VLLM_USE_V1, use_mla=use_mla, ) @@ -170,16 +168,9 @@ def _cached_get_attn_backend( kv_cache_dtype: Optional[str], block_size: int, is_attention_free: bool, - is_blocksparse: bool = False, use_v1: bool = False, use_mla: bool = False, ) -> type[AttentionBackend]: - if is_blocksparse: - logger.info("Using BlocksparseFlashAttention backend.") - from vllm.attention.backends.blocksparse_attn import ( - BlocksparseFlashAttentionBackend) - return BlocksparseFlashAttentionBackend - # If there are no attention layers (e.g. 
we are running Mamba), # use the placeholder NO_ATTENTION if is_attention_free: diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py deleted file mode 100644 index 754ddda233f4..000000000000 --- a/vllm/model_executor/models/phi3_small.py +++ /dev/null @@ -1,465 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import math -from collections.abc import Iterable -from typing import Optional, Union - -import torch -from torch import nn -from transformers.configuration_utils import PretrainedConfig - -from vllm.attention import Attention -from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, - get_tensor_model_parallel_world_size) -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, - QKVParallelLinear, - RowParallelLinear) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.vocab_parallel_embedding import ( - DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) -from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.platforms import current_platform -from vllm.sequence import IntermediateTensors - -from .interfaces import SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) - - -def load_column_parallel_weight(param: torch.nn.Parameter, - loaded_weight: torch.Tensor): - tp = get_tensor_model_parallel_world_size() - rk = get_tensor_model_parallel_rank() - assert param.size(0) * tp == loaded_weight.size(0) - s = rk * param.size(0) - e = (rk + 1) * param.size(0) - loaded_weight = loaded_weight[s:e] - assert param.shape == loaded_weight.shape - param.data.copy_(loaded_weight) - - -class HeadMajorQKVParallelLinear(QKVParallelLinear): - - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor): - return load_column_parallel_weight(param, loaded_weight) - - -class HeadMajorColumnParallelLinear(MergedColumnParallelLinear): - - def weight_loader(self, param: torch.nn.Parameter, - loaded_weight: torch.Tensor): - return load_column_parallel_weight(param, loaded_weight) - - -@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) -def quick_gelu(x): - return x * torch.sigmoid(1.702 * x) - - -@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) -def gegelu(input, limit: Optional[float] = None): - a_gelu, a_linear = input[..., ::2], input[..., 1::2] - if limit is not None: - a_gelu = torch.where(torch.isinf(a_gelu), a_gelu, - a_gelu.clamp(min=None, max=limit)) - a_linear = torch.where( - torch.isinf(a_linear), - a_linear, - a_linear.clamp(min=-limit, max=limit), - ) - out_gelu = quick_gelu(a_gelu) - return out_gelu * (a_linear + 1) - - -class Phi3SmallMLP(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None, - ) -> None: - super().__init__() - self.config = config - assert (self.config.hidden_act == "gegelu" - ), "Only `gegelu` is supported for the 4.7 series of models .." 
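# A numerical sketch of the gegelu activation defined above: the last dim
# interleaves a GELU-gated half (even indices) and a linear half (odd
# indices), so the output has half the channels of the input. The optional
# `limit` clamp is omitted here for brevity.
import torch

def _quick_gelu(x):
    return x * torch.sigmoid(1.702 * x)

x = torch.randn(2, 8)                       # hypothetical input, even last dim
a_gelu, a_linear = x[..., ::2], x[..., 1::2]
y = _quick_gelu(a_gelu) * (a_linear + 1)    # -> shape (2, 4)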
- self.hidden_size = config.hidden_size - self.gegelu_limit = config.gegelu_limit - self.intermediate_size = config.intermediate_size - - self.up_proj = HeadMajorColumnParallelLinear( - self.hidden_size, - 2 * [self.intermediate_size], - bias=True, - quant_config=quant_config, - ) - self.down_proj = RowParallelLinear( - self.intermediate_size, - self.hidden_size, - bias=True, - quant_config=quant_config, - ) - - def forward(self, x): - gate_up, _ = self.up_proj(x) - x = gegelu(gate_up) - x, _ = self.down_proj(x) - return x - - -class Phi3SmallSelfAttention(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - layer_idx: int, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__() - self.layer_idx = layer_idx - self.config = config - self.sparse_block_size = config.blocksparse_block_size - self.homo_heads = config.blocksparse_homo_head_pattern - self.local_blocks = config.blocksparse_num_local_blocks - self.vert_stride = config.blocksparse_vert_stride - - assert (config.blocksparse_block_size == - config.blocksparse_triton_kernel_block_size) - - self.hidden_size = config.hidden_size - # Number of Query Heads - self.num_heads = config.num_attention_heads - - self.head_dim = self.hidden_size // self.num_heads - self.tp_size = get_tensor_model_parallel_world_size() - # Number of total Key Value Heads before tensor parallel - self.num_key_value_heads = config.num_key_value_heads - self.num_q_per_kv = self.num_heads // self.num_key_value_heads - if self.tp_size > 1: - assert self.num_key_value_heads % self.tp_size == 0 - self.num_kv_heads_per_partition = max( - 1, self.num_key_value_heads // self.tp_size) - self.num_heads_per_partition = self.num_heads // self.tp_size - - self.max_position_embeddings = config.max_position_embeddings - self.rope_embedding_base = config.rope_embedding_base - self.rope_position_scale = config.rope_position_scale - self.is_causal = True - - norm_factor = None - if config.mup_use_scaling: - norm_factor = self.head_dim / config.mup_attn_multiplier - else: - norm_factor = math.sqrt(self.head_dim) - self.scale = 1 / norm_factor - - self.query_key_value = HeadMajorQKVParallelLinear( - self.hidden_size, - self.head_dim, - self.num_heads, - self.num_key_value_heads, - bias=True, - quant_config=quant_config, - ) - - self.dense = RowParallelLinear(self.hidden_size, - self.hidden_size, - bias=True, - quant_config=quant_config) - - if getattr(self.config, "rope_scaling", None) is not None: - rope_scaling = self.config.rope_scaling - for key in rope_scaling: - if isinstance(rope_scaling[key], list): - rope_scaling[key] = tuple(rope_scaling[key]) - - if "factor" not in rope_scaling: - rope_scaling["factor"] = self.rope_position_scale - else: - rope_scaling = { - "rope_type": "linear", - "factor": self.rope_position_scale, - } - - self.rotary_emb = get_rope( - self.head_dim, - rotary_dim=self.head_dim, - max_position=self.max_position_embeddings, - base=self.rope_embedding_base, - rope_scaling=rope_scaling, - ) - - # blocksparse params - self.blocksparse_block_size = config.blocksparse_block_size - self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks - self.blocksparse_vert_stride = config.blocksparse_vert_stride - - use_dense_attn = (getattr(self.config, - "dense_attention_every_n_layers", None) - and (self.layer_idx + 1) % - self.config.dense_attention_every_n_layers == 0) - - bs_params = None - if not use_dense_attn: - bs_params = { - 'max_seqlen': 
self.max_position_embeddings, - 'num_heads': self.num_heads_per_partition, - "num_kv_heads": self.num_kv_heads_per_partition, - "block_size": self.sparse_block_size, - "local_blocks": self.local_blocks, - "vert_stride": self.vert_stride, - "homo_head": self.homo_heads - } - - self.attn = Attention(self.num_heads_per_partition, - self.head_dim, - self.scale, - num_kv_heads=self.num_kv_heads_per_partition, - cache_config=cache_config, - quant_config=quant_config, - blocksparse_params=bs_params, - prefix=f"{prefix}.attn") - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> tuple[torch.Tensor, Optional[torch.Tensor], - Optional[tuple[torch.Tensor]]]: - qkv, _ = self.query_key_value(hidden_states) - - qkv = qkv.view(qkv.shape[:-1] + - (-1, (self.num_q_per_kv + 2), self.head_dim)) - q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2) - - # NOTE: this is required by RotaryEmbed, which indeed does not have to - # TODO: allow 3D QK for rotary forward - q = q.reshape(-1, self.head_dim * self.num_heads_per_partition) - k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) - v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partition) - - q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, k, v) - output, _ = self.dense(attn_output) - - return output - - -class Phi3SmallDecoderLayer(nn.Module): - - def __init__( - self, - config: PretrainedConfig, - layer_idx: int, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): - super().__init__() - self.hidden_size = config.hidden_size - self.self_attn = Phi3SmallSelfAttention(config, - layer_idx, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn") - self.mlp = Phi3SmallMLP(config, quant_config) - - self.input_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_epsilon) - self.post_attention_layernorm = nn.LayerNorm( - config.hidden_size, eps=config.layer_norm_epsilon) - - def forward( - self, - positions: torch.Tensor, - hidden_states: torch.Tensor, - ) -> torch.Tensor: - residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) - - hidden_states = self.self_attn( - positions=positions, - hidden_states=hidden_states, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - return hidden_states - - -class Phi3SmallModel(nn.Module): - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.config = config - self.embed_tokens = VocabParallelEmbedding(config.vocab_size, - config.hidden_size) - self.mup_embedding_multiplier = config.mup_embedding_multiplier - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, - lambda prefix: Phi3SmallDecoderLayer(config, - int(prefix.split('.')[-1]), - cache_config, - quant_config, - prefix=prefix), - prefix=f"{prefix}.layers") - - self.final_layernorm = nn.LayerNorm(config.hidden_size, - eps=config.layer_norm_epsilon) - self.make_empty_intermediate_tensors = ( - make_empty_intermediate_tensors_factory(["hidden_states"], - config.hidden_size)) - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return 
self.embed_tokens(input_ids) - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - intermediate_tensors: Optional[IntermediateTensors], - inputs_embeds: Optional[torch.Tensor], - ) -> Union[torch.Tensor, IntermediateTensors]: - if get_pp_group().is_first_rank: - if inputs_embeds is not None: - hidden_states = inputs_embeds - else: - hidden_states = self.get_input_embeddings(input_ids) - if (self.mup_embedding_multiplier is not None - and self.mup_embedding_multiplier > 0.0): - hidden_states = hidden_states * self.mup_embedding_multiplier - else: - assert intermediate_tensors - hidden_states = intermediate_tensors["hidden_states"] - for layer in self.layers[self.start_layer:self.end_layer]: - hidden_states = layer(positions, hidden_states) - if not get_pp_group().is_last_rank: - return IntermediateTensors({"hidden_states": hidden_states}) - hidden_states = self.final_layernorm(hidden_states) - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - for name, loaded_weight in weights: - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class Phi3SmallForCausalLM(nn.Module, SupportsPP): - _tied_weights_keys = ["lm_head.weight"] - - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_suffix={"rotary_emb.inv_freq": None}) - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - self.config = config - self.quant_config = quant_config - self.model = Phi3SmallModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self.vocab_size = config.vocab_size - self.mup_width_multiplier = config.mup_width_multiplier - self.lm_head = ParallelLMHead( - self.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, - padding_size=DEFAULT_VOCAB_PADDING_SIZE, - quant_config=quant_config, - ) - if self.config.tie_word_embeddings: - self.lm_head.weight = self.model.embed_tokens.weight - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors) - - # tokens in tiktoken but not used - if hasattr(config, 'dummy_token_indices'): - device = self.lm_head.weight.device - self.register_buffer('dummy_token_indices', - torch.LongTensor( - config.dummy_token_indices).to(device), - persistent=False) - else: - self.dummy_token_indices = None - - def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.get_input_embeddings(input_ids) - - def set_input_embeddings(self, value): - self.model.embed_tokens = value - - def get_output_embeddings(self): - return self.lm_head - - def set_output_embeddings(self, value): - self.lm_head = value - - def set_decoder(self, decoder): - self.model = decoder - - def get_decoder(self): - return self.model - - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - logits = self.logits_processor(self.lm_head, hidden_states, - sampling_metadata) - if self.dummy_token_indices is not None and logits is not None: - 
logits.index_fill_(-1, self.dummy_token_indices, -torch.inf) - logits = logits / self.mup_width_multiplier - return logits - - def forward( - self, - input_ids: torch.LongTensor, - positions: Optional[torch.LongTensor], - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, IntermediateTensors]: - output_hidden_states = self.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) - output_hidden_states = output_hidden_states - return output_hidden_states - - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=(["lm_head.weight"] - if self.config.tie_word_embeddings else None)) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2ca37867b88c..3440dd656c50 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -110,7 +110,6 @@ "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), - "Phi3SmallForCausalLM": ("phi3_small", "Phi3SmallForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "Phi4FlashForCausalLM": ("phi4flash", "Phi4FlashForCausalLM"), "Plamo2ForCausalLM": ("plamo2", "Plamo2ForCausalLM"), diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index b8e788de11c6..1cd5cb5e83db 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -57,7 +57,6 @@ class _Backend(enum.Enum): PALLAS = enum.auto() PALLAS_VLLM_V1 = enum.auto() IPEX = enum.auto() - BLOCK_SPARSE_FLASH_ATTN = enum.auto() DUAL_CHUNK_FLASH_ATTN = enum.auto() DIFFERENTIAL_FLASH_ATTN = enum.auto() NO_ATTENTION = enum.auto() diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index d63b82012a52..2efbe0de2725 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import numpy as np import torch @@ -443,7 +443,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, @@ -451,9 +450,6 @@ def __init__( ) -> None: if kv_sharing_target_layer_name is not None: raise NotImplementedError("KV sharing is not supported in V0.") - if blocksparse_params is not None: - raise ValueError( - "Torch SPDA does not support block-sparse attention.") if logits_soft_cap is not None: logger.warning_once("Torch SPDA does not support logits soft cap. 
" "Outputs may be slightly off.") diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index a37bf2a7115b..ad414ee0a1fc 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with FlashAttention.""" from dataclasses import dataclass -from typing import Any, ClassVar, Optional +from typing import ClassVar, Optional import numpy as np import torch @@ -349,15 +349,11 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, use_irope: bool = False, ) -> None: - if blocksparse_params is not None: - raise ValueError( - "FlashAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 7f3c4ed129cf..e1ffa61a6005 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -4,7 +4,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Optional +from typing import TYPE_CHECKING, Optional import torch from flashinfer import (BatchDecodeWithPagedKVCacheWrapper, @@ -490,7 +490,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index c229ec12fd1b..ad63f92cd88a 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -3,7 +3,7 @@ """Attention layer with FlashAttention.""" from collections import defaultdict from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import torch from torch.nn.attention.flex_attention import (BlockMask, _mask_mod_signature, @@ -342,15 +342,10 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, ) -> None: - if blocksparse_params is not None: - # TODO we should support this :think - raise ValueError( - "FlashAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 93c8156b16a7..cf17d9330239 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -190,7 +190,7 @@ import functools from abc import abstractmethod from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Generic, Optional, TypeVar, Union +from typing import TYPE_CHECKING, Generic, Optional, TypeVar, Union import torch @@ -754,7 +754,6 @@ def __init__( alibi_slopes: 
Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], diff --git a/vllm/v1/attention/backends/mla/cutlass_mla.py b/vllm/v1/attention/backends/mla/cutlass_mla.py index a0f7c39c0041..c787f25cd3ad 100644 --- a/vllm/v1/attention/backends/mla/cutlass_mla.py +++ b/vllm/v1/attention/backends/mla/cutlass_mla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from typing import Any, Optional +from typing import Optional import torch @@ -74,7 +74,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -82,17 +81,14 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "CutlassMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py index 935311aacc35..d3e5300dbbd6 100644 --- a/vllm/v1/attention/backends/mla/flashmla.py +++ b/vllm/v1/attention/backends/mla/flashmla.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, ClassVar, Optional +from typing import ClassVar, Optional import torch @@ -119,7 +119,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -127,20 +126,17 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) assert is_flashmla_supported(), \ "FlashMLA is not supported on this device" - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "FlashMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 42a042583615..834c23455835 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -2,7 
+2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, ClassVar, Optional +from typing import ClassVar, Optional import torch @@ -167,7 +167,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -175,20 +174,17 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) assert (num_heads == 16 or num_heads == 128), ( f"Aiter MLA only supports 16 or 128 number of heads.\n" f"Provided {num_heads} number of heads.\n" "Try adjusting tensor_parallel_size value.") - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "Aiter MLA does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") from aiter import flash_attn_varlen_func self.flash_attn_varlen_func = flash_attn_varlen_func diff --git a/vllm/v1/attention/backends/mla/triton_mla.py b/vllm/v1/attention/backends/mla/triton_mla.py index 99938f22f108..700fce68953e 100644 --- a/vllm/v1/attention/backends/mla/triton_mla.py +++ b/vllm/v1/attention/backends/mla/triton_mla.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from typing import Any, Optional +from typing import Optional import torch @@ -42,7 +42,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]], logits_soft_cap: Optional[float], attn_type: str, kv_sharing_target_layer_name: Optional[str], @@ -50,17 +49,14 @@ def __init__( **mla_args) -> None: super().__init__(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap, attn_type, + logits_soft_cap, attn_type, kv_sharing_target_layer_name, **mla_args) - unsupported_features = [ - alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap - ] + unsupported_features = [alibi_slopes, sliding_window, logits_soft_cap] if any(unsupported_features): raise NotImplementedError( "TritonMLAImpl does not support one of the following: " - "alibi_slopes, sliding_window, blocksparse_params, " - "logits_soft_cap") + "alibi_slopes, sliding_window, logits_soft_cap") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 52e12a1a506f..ac7980c79e4d 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import torch import torch_xla.core.xla_builder as xb @@ -132,7 +132,6 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: 
Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, @@ -142,9 +141,6 @@ def __init__( logger.warning_once( "Using irope in Pallas is not supported yet, it will fall back " "to global attention for long context.") - if blocksparse_params is not None: - raise ValueError("Paged attention Pallas kernel does " - "not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -158,8 +154,6 @@ def __init__( raise NotImplementedError("Alibi slopes is not supported.") if kv_cache_dtype != "auto": raise NotImplementedError("FP8 KV cache dtype is not supported.") - if blocksparse_params is not None: - raise NotImplementedError("Blocksparse is not supported.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 43fe30a9a89f..8f7567639449 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with AiterFlashAttention.""" from dataclasses import dataclass -from typing import Any, Optional +from typing import Optional import torch @@ -334,15 +334,11 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: - if blocksparse_params is not None: - raise ValueError( - "AiterFlashAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 79796ac14928..d65ff5ff74ec 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Attention layer with PagedAttention and Triton prefix prefill.""" from dataclasses import dataclass -from typing import Any, ClassVar, Optional +from typing import ClassVar, Optional import torch @@ -205,15 +205,11 @@ def __init__( alibi_slopes: Optional[list[float]], sliding_window: Optional[int], kv_cache_dtype: str, - blocksparse_params: Optional[dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, use_irope: bool = False, ) -> None: - if blocksparse_params is not None: - raise ValueError( - "TritonAttention does not support block-sparse attention.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) From 2e8cbb58f395ea4546399d3d019e38cf4d09c3cd Mon Sep 17 00:00:00 2001 From: fhl2000 <63384265+fhl2000@users.noreply.github.com> Date: Sun, 20 Jul 2025 05:13:18 +0800 Subject: [PATCH 28/57] [BugFix] Fix full cuda graph slot_mapping (#21228) Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com> --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 
1ee9c070226c..670e653929ce 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2079,7 +2079,7 @@ def _dummy_run( block_table_tensor=self.input_batch.block_table[ kv_cache_group_id].get_device_tensor()[:num_reqs], slot_mapping=self.input_batch. - block_table[kv_cache_group_id].slot_mapping[:num_reqs]) + block_table[kv_cache_group_id].slot_mapping[:num_tokens]) attn_metadata_i = self.attn_metadata_builders[ kv_cache_group_id].build_for_cudagraph_capture( From 10eb24cc91315481414fba0e0134209e6a9d7c94 Mon Sep 17 00:00:00 2001 From: Yuxuan Zhang <2448370773@qq.com> Date: Sun, 20 Jul 2025 06:40:31 +0800 Subject: [PATCH 29/57] GLM-4 Update (#20736) Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com> Signed-off-by: Isotr0py Signed-off-by: Lu Fang Co-authored-by: Isotr0py Co-authored-by: Lu Fang --- benchmarks/kernels/benchmark_moe.py | 6 +- .../benchmark_moe_permute_unpermute.py | 1 + docs/models/supported_models.md | 1 + tests/models/registry.py | 7 + tests/tool_use/test_glm4_moe_tool_parser.py | 410 +++++++++++ vllm/config.py | 15 +- .../openai/tool_parsers/__init__.py | 25 +- .../tool_parsers/glm4_moe_tool_parser.py | 402 ++++++++++ vllm/model_executor/models/glm4_moe.py | 685 ++++++++++++++++++ vllm/model_executor/models/glm4_moe_mtp.py | 307 ++++++++ vllm/model_executor/models/registry.py | 2 + vllm/reasoning/__init__.py | 2 + vllm/reasoning/glm4_moe_reasoning_parser.py | 151 ++++ vllm/worker/worker.py | 3 +- 14 files changed, 2006 insertions(+), 11 deletions(-) create mode 100644 tests/tool_use/test_glm4_moe_tool_parser.py create mode 100644 vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py create mode 100644 vllm/model_executor/models/glm4_moe.py create mode 100644 vllm/model_executor/models/glm4_moe_mtp.py create mode 100644 vllm/reasoning/glm4_moe_reasoning_parser.py diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 132c325ce591..c350aaf5d3ad 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -576,7 +576,11 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - elif config.architectures[0] in ("DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"): + elif config.architectures[0] in ( + "DeepseekV3ForCausalLM", + "DeepseekV2ForCausalLM", + "Glm4MoeForCausalLM", + ): E = config.n_routed_experts topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index dba1f3943b96..4ed690090144 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -318,6 +318,7 @@ def main(args: argparse.Namespace): elif ( config.architectures[0] == "DeepseekV3ForCausalLM" or config.architectures[0] == "DeepseekV2ForCausalLM" + or config.architectures[0] == "Glm4MoeForCausalLM" ): E = config.n_routed_experts topk = config.num_experts_per_tok diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index f5a89ab6cf7d..306a7851a432 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -576,6 +576,7 @@ Specified using `--task generate`. | `Gemma3ForConditionalGeneration` | Gemma 3 | T + I+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. 
| ✅︎ | ✅︎ | ⚠️ | | `GLM4VForCausalLM`^ | GLM-4V | T + I | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220`, etc. | ✅︎ | ✅︎ | ✅︎ | | `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + IE+ + VE+ | `THUDM/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ | ✅︎ | +| `Glm4MoeForCausalLM` | GLM-4.5 | T + IE+ + VE+ | `THUDM/GLM-4.5`, etc. | ✅︎ | ✅︎ | ✅︎ | | `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ | ✅︎ | | `H2OVLChatModel` | H2OVL | T + IE+ | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎ | | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | | ✅︎ | diff --git a/tests/models/registry.py b/tests/models/registry.py index 8afac32e1cf0..c2f1089af2ac 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -360,6 +360,9 @@ def check_available_online( trust_remote_code=True, hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501 "Glm4vForConditionalGeneration": _HfExamplesInfo("THUDM/GLM-4.1V-9B-Thinking", min_transformers_version="4.53"), # noqa: E501 + "Glm4MoeForCausalLM": _HfExamplesInfo("THUDM/GLM-4.5", + min_transformers_version="4.54", + is_available_online=False), # noqa: E501 "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m", extras={"2b": "h2oai/h2ovl-mississippi-2b"}, # noqa: E501 max_transformers_version="4.48", # noqa: E501 @@ -485,6 +488,10 @@ def check_available_online( is_available_online=False, speculative_model="openbmb/MiniCPM-2B-sft-bf16", tokenizer="openbmb/MiniCPM-2B-sft-bf16"), + "Glm4MoeMTPModel": _HfExamplesInfo("THUDM/GLM-4.5", + speculative_model="THUDM/GLM-4.5", + min_transformers_version="4.54", + is_available_online=False), "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL", trust_remote_code=True, speculative_model="XiaomiMiMo/MiMo-7B-RL") diff --git a/tests/tool_use/test_glm4_moe_tool_parser.py b/tests/tool_use/test_glm4_moe_tool_parser.py new file mode 100644 index 000000000000..478f4b916672 --- /dev/null +++ b/tests/tool_use/test_glm4_moe_tool_parser.py @@ -0,0 +1,410 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +import json + +import pytest + +from vllm.entrypoints.openai.protocol import FunctionCall, ToolCall +from vllm.entrypoints.openai.tool_parsers import Glm4MoeModelToolParser +from vllm.transformers_utils.tokenizer import get_tokenizer + +pytest.skip("skip glm4_moe parser test", allow_module_level=True) +# Use a common model that is likely to be available +MODEL = "THUDM/GLM-4.5" + + +@pytest.fixture(scope="module") +def glm4_moe_tokenizer(): + return get_tokenizer(tokenizer_name=MODEL) + + +@pytest.fixture +def glm4_moe_tool_parser(glm4_moe_tokenizer): + return Glm4MoeModelToolParser(glm4_moe_tokenizer) + + +def assert_tool_calls(actual_tool_calls: list[ToolCall], + expected_tool_calls: list[ToolCall]): + assert len(actual_tool_calls) == len(expected_tool_calls) + + for actual_tool_call, expected_tool_call in zip(actual_tool_calls, + expected_tool_calls): + assert isinstance(actual_tool_call.id, str) + assert len(actual_tool_call.id) > 0 + + assert actual_tool_call.type == "function" + assert actual_tool_call.function.name == expected_tool_call.function.name + # Compare arguments as JSON objects to handle formatting differences + actual_args = json.loads(actual_tool_call.function.arguments) + expected_args = json.loads(expected_tool_call.function.arguments) + assert actual_args 
== expected_args + + +def test_extract_tool_calls_no_tools(glm4_moe_tool_parser): + model_output = "This is a test" + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output + + +@pytest.mark.parametrize( + ids=[ + "single_tool_call", + "multiple_tool_calls", + "tool_call_with_content_before", + "tool_call_with_mixed_args", + "tool_call_with_chinese_content", + ], + argnames=["model_output", "expected_tool_calls", "expected_content"], + argvalues=[ + ( + """get_current_weather + city + Dallas + state + TX + unit + fahrenheit + """, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit", + }), + )) + ], + None, + ), + ( + """get_current_weather + city + Dallas + state + TX + unit + fahrenheit + + get_current_weather + city + Orlando + state + FL + unit + fahrenheit + """, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Dallas", + "state": "TX", + "unit": "fahrenheit", + }), + )), + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Orlando", + "state": "FL", + "unit": "fahrenheit", + }), + )), + ], + None, + ), + ( + """I'll help you check the weather. get_current_weather + city + Seattle + state + WA + unit + celsius + """, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "Seattle", + "state": "WA", + "unit": "celsius", + }), + )) + ], + "I'll help you check the weather.", + ), + ( + """get_current_weather + city + New York + state + NY + unit + celsius + """, + [ + ToolCall(function=FunctionCall( + name="get_current_weather", + arguments=json.dumps({ + "city": "New York", + "state": "NY", + "unit": "celsius", + }), + )) + ], + None, + ), + ("""I will help you get the weather.get_weather + city + Beijing + date + 2025-08-01 + """, [ + ToolCall(function=FunctionCall( + name="get_weather", + arguments=json.dumps({ + "city": "Beijing", + "date": "2025-08-01", + }), + )) + ], "I will help you get the weather."), + ], +) +def test_extract_tool_calls(glm4_moe_tool_parser, model_output, + expected_tool_calls, expected_content): + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + assert extracted_tool_calls.tools_called + assert_tool_calls(extracted_tool_calls.tool_calls, expected_tool_calls) + + assert extracted_tool_calls.content == expected_content + + +def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser): + """Test tool extraction when thinking tags are present.""" + model_output = """I want to get the weather. + +I will help you get the weather. +get_weather +city +Beijing +date +2025-08-01 +""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert extracted_tool_calls.tool_calls[0].function.name == "get_weather" + + expected_content = """I want to get the weather. 
+ +I will help you get the weather.""" + assert extracted_tool_calls.content == expected_content + + +def test_extract_tool_calls_malformed_xml(glm4_moe_tool_parser): + """Test that malformed XML is handled gracefully.""" + model_output = """get_weather +city +Seattle +incomplete_arg +value +""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + # Should handle malformed XML gracefully + # The parser should either extract what it can or return no tool calls + # depending on how robust we want the parsing to be + assert isinstance(extracted_tool_calls.tools_called, bool) + assert isinstance(extracted_tool_calls.tool_calls, list) + + +def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser): + """Test tool calls with no arguments.""" + model_output = """get_current_time +""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert extracted_tool_calls.tool_calls[ + 0].function.name == "get_current_time" + # Empty arguments should result in empty JSON object + assert extracted_tool_calls.tool_calls[0].function.arguments == "{}" + + +def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser): + """Test extraction with mixed content and multiple tool calls.""" + model_output = """I will help you get the weather info. + +get_weather +city +Beijing +date +2025-08-01 + + +meaningwhile, I will also check the weather in Shanghai. + +get_weather +city +Shanghai +date +2025-08-01 +""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 2 + + # Check first tool call + assert extracted_tool_calls.tool_calls[0].function.name == "get_weather" + args1 = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args1["city"] == "Beijing" + assert args1["date"] == "2025-08-01" + + # Check second tool call + assert extracted_tool_calls.tool_calls[1].function.name == "get_weather" + args2 = json.loads(extracted_tool_calls.tool_calls[1].function.arguments) + assert args2["city"] == "Shanghai" + assert args2["date"] == "2025-08-01" + + # Content should be everything before the first tool call + assert extracted_tool_calls.content == "I will help you get the weather info." 
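# ---------------------------------------------------------------------------
# Editor's illustrative sketch (not part of the patch). The extraction tests
# above feed the parser GLM-4.5-style tool calls whose arguments are written as
# XML-like key/value pairs; the new parser's _parse_arguments() flattens those
# pairs into a JSON object string. The <arg_key>/<arg_value> tag names used
# here are an assumption for illustration, since the markup is not visible in
# this rendering of the diff.
import json
import re

_ARG_PAIR_RE = re.compile(
    r"<arg_key>(?P<key>[^<]+)</arg_key>\s*<arg_value>(?P<value>[^<]*)</arg_value>",
    re.DOTALL,
)


def parse_arguments_sketch(args_text: str) -> str:
    # Collect every key/value pair and serialize them as a JSON object,
    # mirroring what the parser is expected to return for tool arguments.
    pairs = {key.strip(): value.strip()
             for key, value in _ARG_PAIR_RE.findall(args_text)}
    return json.dumps(pairs, ensure_ascii=False)


# Usage: parse_arguments_sketch("<arg_key>city</arg_key><arg_value>Dallas</arg_value>")
# returns '{"city": "Dallas"}'.
# ---------------------------------------------------------------------------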
+ + +def test_streaming_basic_functionality(glm4_moe_tool_parser): + """Test basic streaming functionality.""" + # Reset streaming state + glm4_moe_tool_parser.current_tool_name_sent = False + glm4_moe_tool_parser.prev_tool_call_arr = [] + glm4_moe_tool_parser.current_tool_id = -1 + glm4_moe_tool_parser.streamed_args_for_tool = [] + + # Test with a simple tool call + current_text = """get_weather +city +Beijing +""" + + # Mock token IDs for testing + tool_call_start_id = glm4_moe_tool_parser.tool_call_start_token_id or 12345 + tool_call_end_id = glm4_moe_tool_parser.tool_call_end_token_id or 12346 + + result = glm4_moe_tool_parser.extract_tool_calls_streaming( + previous_text="", + current_text=current_text, + delta_text="", + previous_token_ids=[], + current_token_ids=[tool_call_start_id, tool_call_end_id], + delta_token_ids=[tool_call_end_id], + request=None, + ) + + # The result behavior depends on the streaming state + # This test mainly ensures no exceptions are thrown + assert result is None or hasattr(result, 'tool_calls') or hasattr( + result, 'content') + + +def test_streaming_no_tool_calls(glm4_moe_tool_parser): + """Test streaming when there are no tool calls.""" + current_text = "This is just regular text without any tool calls." + + result = glm4_moe_tool_parser.extract_tool_calls_streaming( + previous_text="This is just regular text", + current_text=current_text, + delta_text=" without any tool calls.", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + # Should return the delta text as content + assert result is not None + assert hasattr(result, 'content') + assert result.content == " without any tool calls." + + +def test_streaming_with_content_before_tool_calls(glm4_moe_tool_parser): + """Test streaming when there's content before tool calls.""" + # Reset streaming state + glm4_moe_tool_parser.current_tool_name_sent = False + glm4_moe_tool_parser.prev_tool_call_arr = [] + glm4_moe_tool_parser.current_tool_id = -1 + glm4_moe_tool_parser.streamed_args_for_tool = [] + + current_text = "I will help you get the weather" + + result = glm4_moe_tool_parser.extract_tool_calls_streaming( + previous_text="I will help you", + current_text=current_text, + delta_text="get the weather.", + previous_token_ids=[], + current_token_ids=[], + delta_token_ids=[], + request=None, + ) + + # Should return content when no tool call tokens are detected + assert result is not None + assert hasattr(result, 'content') + assert result.content == "get the weather." 
+ + +def test_extract_tool_calls_special_characters(glm4_moe_tool_parser): + """Test tool calls with special characters and unicode.""" + model_output = """send_message +recipient +Amy +message +It is a nice day +priority +high +""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + assert extracted_tool_calls.tools_called + assert len(extracted_tool_calls.tool_calls) == 1 + assert extracted_tool_calls.tool_calls[0].function.name == "send_message" + + args = json.loads(extracted_tool_calls.tool_calls[0].function.arguments) + assert args["recipient"] == "Amy" + assert args["message"] == "It is a nice day" + assert args["priority"] == "high" + + +def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser): + """Test incomplete tool calls (missing closing tag).""" + model_output = """get_weather +city +Beijing +date +2025-08-01""" + + extracted_tool_calls = glm4_moe_tool_parser.extract_tool_calls( + model_output, request=None) # type: ignore[arg-type] + + # Incomplete tool calls should not be extracted + assert not extracted_tool_calls.tools_called + assert extracted_tool_calls.tool_calls == [] + assert extracted_tool_calls.content == model_output diff --git a/vllm/config.py b/vllm/config.py index a9720fa3142c..c261f968e7fc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1333,7 +1333,8 @@ def get_layers_start_end_indices( self, parallel_config: "ParallelConfig") -> tuple[int, int]: from vllm.distributed.utils import get_pp_indices if (self.hf_text_config.model_type == "deepseek_mtp" - or self.hf_config.model_type == "mimo_mtp"): + or self.hf_config.model_type == "mimo_mtp" + or self.hf_config.model_type == "glm4_moe_mtp"): total_num_hidden_layers = getattr(self.hf_text_config, "num_nextn_predict_layers", 0) else: @@ -2663,7 +2664,15 @@ def hf_config_override(hf_config: PretrainedConfig) -> PretrainedConfig: "n_predict": n_predict, "architectures": ["MiMoMTPModel"] }) - return hf_config + + if hf_config.architectures[0] == "Glm4MoeForCausalLM": + hf_config.model_type = "glm4_moe_mtp" + n_predict = getattr(hf_config, "num_nextn_predict_layers", None) + hf_config.update({ + "num_hidden_layers": 0, + "n_predict": n_predict, + "architectures": ["Glm4MoeMTPModel"] + }) return hf_config @@ -2774,7 +2783,7 @@ def __post_init__(self): "mlp_speculator"): self.method = "mlp_speculator" elif (self.draft_model_config.hf_config.model_type - in ("deepseek_mtp", "mimo_mtp")): + in ("deepseek_mtp", "mimo_mtp", "glm4_moe_mtp")): self.method = "deepseek_mtp" if self.num_speculative_tokens > 1: logger.warning( diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 137375b9707c..9eda7155f01f 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -3,6 +3,7 @@ from .abstract_tool_parser import ToolParser, ToolParserManager from .deepseekv3_tool_parser import DeepSeekV3ToolParser +from .glm4_moe_tool_parser import Glm4MoeModelToolParser from .granite_20b_fc_tool_parser import Granite20bFCToolParser from .granite_tool_parser import GraniteToolParser from .hermes_tool_parser import Hermes2ProToolParser @@ -19,10 +20,22 @@ from .xlam_tool_parser import xLAMToolParser __all__ = [ - "ToolParser", "ToolParserManager", "Granite20bFCToolParser", - "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", - "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", - 
"Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser", - "DeepSeekV3ToolParser", "xLAMToolParser", "MinimaxToolParser", - "KimiK2ToolParser", "HunyuanA13BToolParser" + "ToolParser", + "ToolParserManager", + "Granite20bFCToolParser", + "GraniteToolParser", + "Hermes2ProToolParser", + "MistralToolParser", + "Internlm2ToolParser", + "Llama3JsonToolParser", + "JambaToolParser", + "Llama4PythonicToolParser", + "PythonicToolParser", + "Phi4MiniJsonToolParser", + "DeepSeekV3ToolParser", + "xLAMToolParser", + "MinimaxToolParser", + "KimiK2ToolParser", + "HunyuanA13BToolParser", + "Glm4MoeModelToolParser", ] diff --git a/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py new file mode 100644 index 000000000000..c3f9d7923575 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/glm4_moe_tool_parser.py @@ -0,0 +1,402 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# code modified from deepseekv3_tool_parser.py + +from collections.abc import Sequence +from typing import Union + +import regex as re + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("glm4_moe") +class Glm4MoeModelToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + self.current_tool_name_sent = False + self.prev_tool_call_arr: list[dict] = [] + self.current_tool_id = -1 + self.streamed_args_for_tool: list[str] = [] + self.tool_call_start_token = "" + self.tool_call_end_token = "" + + self.tool_calls_start_token = self.tool_call_start_token + + # Updated regex for the XML-based format + self.tool_call_regex = re.compile( + r"\s*" + r"(?P[^\n<]+)\s*" # 函数名(到换行或 <) + r"(?P(?:\s*[^<]+\s*" + r"[^<]*\s*)*)\s*" + r"", + re.DOTALL, + ) + + # Regex for parsing individual arguments + self.arg_regex = re.compile( + r"(?P[^<]+)\s*(?P[^<]*)", + re.DOTALL, + ) + + # Streaming regex + self.stream_tool_call_portion_regex = re.compile( + r"(?P[^\n<]+)\s*" + r"(?P(?:\s*[^<]+\s*" + r"[^<]*\s*)*)", + re.DOTALL, + ) + + # For streaming, we also need a regex to match just the function name + self.stream_tool_call_name_regex = re.compile( + r"(?P[^\n<]+)", + re.DOTALL, + ) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + + self.tool_call_start_token_id = self.vocab.get( + self.tool_call_start_token) + self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) + + def _parse_arguments(self, args_text: str) -> str: + """Parse XML-based arguments into JSON format.""" + if not args_text or not args_text.strip(): + return "{}" + + args_dict = {} + matches = self.arg_regex.findall(args_text) + + for key, value in matches: + args_dict[key.strip()] = value.strip() + + import json + return json.dumps(args_dict, ensure_ascii=False) + + def extract_tool_calls( + self, + model_output: str, + request: ChatCompletionRequest, + ) -> ExtractedToolCallInformation: + + # sanity check; avoid unnecessary processing + if self.tool_calls_start_token not in 
model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + # Find all tool calls in the output + function_call_matches = self.tool_call_regex.findall(model_output) + + logger.debug("function_call_matches: %s", function_call_matches) + + if not function_call_matches: + return ExtractedToolCallInformation( + tools_called=False, + tool_calls=[], + content=model_output, + ) + + tool_calls = [] + for i, match in enumerate(function_call_matches): + function_name, function_args_xml = match + function_name = function_name.strip() + + # Parse XML arguments to JSON + function_args_json = self._parse_arguments(function_args_xml) + + tool_calls.append( + ToolCall( + id=f"call_{i}", + type='function', + function=FunctionCall(name=function_name, + arguments=function_args_json), + )) + + # Extract content before the first tool call + content = model_output[:model_output.find(self. + tool_calls_start_token)] + return ExtractedToolCallInformation( + tools_called=bool(tool_calls), + tool_calls=tool_calls, + content=content.strip() if content.strip() else None, + ) + + except Exception: + logger.exception("Error in extracting tool call from response.") + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + # check to see if we should be streaming a tool call - is there a + if self.tool_call_start_token_id not in current_token_ids: + logger.debug("No tool call tokens found!") + return DeltaMessage(content=delta_text) + delta_text = delta_text.replace(self.tool_calls_start_token, + "").replace(self.tool_call_end_token, + "") + try: + + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id) + prev_tool_end_count = previous_token_ids.count( + self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id) + cur_tool_end_count = current_token_ids.count( + self.tool_call_end_token_id) + tool_call_portion = None + text_portion = None + + # case: if we're generating text, OR rounding out a tool call + if (cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count + and self.tool_call_end_token not in delta_text): + logger.debug("Generating text content! 
skipping tool parsing.") + return DeltaMessage(content=delta_text) + + if self.tool_call_end_token in delta_text: + logger.debug("tool_call_end_token in delta_text") + full_text = current_text + delta_text + tool_call_portion = full_text.split( + self.tool_call_start_token)[-1].split( + self.tool_call_end_token)[0].rstrip() + delta_text = delta_text.split( + self.tool_call_end_token)[0].rstrip() + text_portion = delta_text.split( + self.tool_call_end_token)[-1].lstrip() + + # case -- we're starting a new tool call + if (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count): + + # get the portion of the text that's the tool call + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. + elif (cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count >= prev_tool_end_count): + if self.prev_tool_call_arr is None or len( + self.prev_tool_call_arr) == 0: + logger.debug( + "attempting to close tool call, but no tool call") + return None + diff = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + if diff: + diff = (diff.encode("utf-8").decode("unicode_escape") + if diff is str else diff) + if '"}' not in delta_text: + return None + end_loc = delta_text.rindex('"}') + diff = delta_text[:end_loc] + '"}' + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", + diff, + ) + self.streamed_args_for_tool[self.current_tool_id] += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump(exclude_none=True), + ) + ]) + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + current_tool_call = dict() + if tool_call_portion: + current_tool_call_matches = ( + self.stream_tool_call_portion_regex.match( + tool_call_portion)) + if current_tool_call_matches: + tool_id, tool_args = (current_tool_call_matches.groups()) + tool_name = tool_id.split('.')[1].split(':')[0] + current_tool_call['id'] = tool_id + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = tool_args + else: + current_tool_call_name_matches = ( + self.stream_tool_call_name_regex.match( + tool_call_portion)) + if current_tool_call_name_matches: + tool_id_str, = current_tool_call_name_matches.groups() + tool_name = tool_id_str.split('.')[1].split(':')[0] + current_tool_call['id'] = tool_id_str + current_tool_call["name"] = tool_name + current_tool_call["arguments"] = "" + else: + logger.debug("Not enough token") + return None + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. 
+ if not self.current_tool_name_sent: + if current_tool_call is None: + return None + function_name: Union[str, None] = current_tool_call.get("name") + tool_id = current_tool_call.get("id") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + type="function", + id=tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True), + ) + ]) + else: + return None + + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = (DeltaMessage( + content=delta_text) if text_portion is not None else None) + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. + + logger.debug("Trying to parse current tool call with ID %s", + self.current_tool_id) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error("should be impossible to have arguments reset " + "mid-call. skipping streaming anything.") + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=cur_arguments).model_dump( + exclude_none=True), + ) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] = cur_arguments + + # last case -- we have an update to existing arguments. 
+ elif cur_arguments and prev_arguments: + if (isinstance(delta_text, str) + and cur_arguments != prev_arguments + and len(cur_arguments) > len(prev_arguments) + and cur_arguments.startswith(prev_arguments)): + delta_arguments = cur_arguments[len(prev_arguments):] + logger.debug("got diff %s", delta_text) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall( + index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=delta_arguments).model_dump( + exclude_none=True), + ) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] = cur_arguments + else: + delta = None + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[ + self.current_tool_id] = current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception: + logger.exception("Error trying to handle streaming tool call.") + return None # do not stream a delta. skip this token ID. diff --git a/vllm/model_executor/models/glm4_moe.py b/vllm/model_executor/models/glm4_moe.py new file mode 100644 index 000000000000..bdca293d21db --- /dev/null +++ b/vllm/model_executor/models/glm4_moe.py @@ -0,0 +1,685 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The ZhipuAI Team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
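# ---------------------------------------------------------------------------
# Editor's aside (illustrative only, not part of the patch). The Glm4MoE block
# defined in this new file combines routed and shared experts roughly as
#     y = routed_scaling_factor * RoutedExperts(x, Gate(x)) + SharedMLP(x)
# A minimal sketch of that composition, assuming hypothetical `gate`, `experts`
# and `shared` callables; the real layer below uses vLLM's FusedMoE with
# grouped top-k routing and sigmoid scoring.
from typing import Callable

import torch


def moe_block_sketch(
    x: torch.Tensor,
    gate: Callable[[torch.Tensor], torch.Tensor],
    experts: Callable[[torch.Tensor, torch.Tensor], torch.Tensor],
    shared: Callable[[torch.Tensor], torch.Tensor],
    routed_scaling_factor: float,
) -> torch.Tensor:
    # Router logits have shape [num_tokens, n_routed_experts]; the routed
    # output is scaled before the always-on shared-expert path is added.
    router_logits = gate(x)
    routed_out = experts(x, router_logits)
    return routed_out * routed_scaling_factor + shared(x)
# ---------------------------------------------------------------------------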
+"""Inference-only GLM-4.5 model compatible with HuggingFace weights.""" +import typing +from collections.abc import Callable, Iterable +from typing import Any, Optional, Union + +import torch +from torch import nn +from transformers import PretrainedConfig + +from vllm.attention import Attention +from vllm.compilation.decorators import support_torch_compile +from vllm.config import CacheConfig, VllmConfig, get_current_vllm_config +from vllm.distributed import (get_ep_group, get_pp_group, + get_tensor_model_parallel_world_size) +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, + QKVParallelLinear, + ReplicatedLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .interfaces import SupportsPP +from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) + +logger = init_logger(__name__) + + +class Glm4MoeMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + hidden_act: str, + quant_config: Optional[QuantizationConfig] = None, + reduce_results: bool = True, + prefix: str = "", + ) -> None: + super().__init__() + self.gate_up_proj = MergedColumnParallelLinear( + hidden_size, [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj") + self.down_proj = RowParallelLinear(intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + reduce_results=reduce_results, + prefix=f"{prefix}.down_proj") + if hidden_act != "silu": + raise ValueError(f"Unsupported activation: {hidden_act}. " + "Only silu is supported for now.") + self.act_fn = SiluAndMul() + + def forward(self, x): + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class Glm4MoE(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + enable_eplb: bool = False, + ): + super().__init__() + self.tp_size = get_tensor_model_parallel_world_size() + self.routed_scaling_factor = config.routed_scaling_factor + + self.ep_group = get_ep_group().device_group + self.ep_rank = self.ep_group.rank() + self.ep_size = self.ep_group.size() + self.n_routed_experts: int = config.n_routed_experts + self.n_shared_experts: int = config.n_shared_experts + + if config.hidden_act != "silu": + raise ValueError(f"Unsupported activation: {config.hidden_act}. 
" + "Only silu is supported for now.") + + self.gate = ReplicatedLinear(config.hidden_size, + config.n_routed_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate") + + # noaux_tc is not set in transformers new config now + self.gate.e_score_correction_bias = (nn.Parameter( + torch.empty(config.n_routed_experts))) + + # Load balancing settings. + vllm_config = get_current_vllm_config() + parallel_config = vllm_config.parallel_config + self.enable_eplb = enable_eplb + + self.n_redundant_experts = parallel_config.num_redundant_experts + self.n_logical_experts = self.n_routed_experts + self.n_physical_experts = (self.n_logical_experts + + self.n_redundant_experts) + self.n_local_physical_experts = self.n_physical_experts // self.ep_size + + self.physical_expert_start = (self.ep_rank * + self.n_local_physical_experts) + self.physical_expert_end = (self.physical_expert_start + + self.n_local_physical_experts) + + self.experts = FusedMoE( + num_experts=config.n_routed_experts, + top_k=config.num_experts_per_tok, + hidden_size=config.hidden_size, + intermediate_size=config.moe_intermediate_size, + reduce_results=False, + renormalize=config.norm_topk_prob, + quant_config=quant_config, + use_grouped_topk=True, + num_expert_group=config.n_group, + topk_group=config.topk_group, + prefix=f"{prefix}.experts", + scoring_func="sigmoid", + e_score_correction_bias=self.gate.e_score_correction_bias, + enable_eplb=self.enable_eplb, + num_redundant_experts=self.n_redundant_experts) + + if config.n_shared_experts is not None: + intermediate_size = (config.moe_intermediate_size * + config.n_shared_experts) + self.shared_experts = Glm4MoeMLP( + hidden_size=config.hidden_size, + intermediate_size=intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), + prefix=f"{prefix}.shared_experts", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + num_tokens, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + + if self.n_shared_experts is not None: + shared_output = self.shared_experts(hidden_states) + router_logits, _ = self.gate(hidden_states) + final_hidden_states = self.experts( + hidden_states=hidden_states, + router_logits=router_logits) * self.routed_scaling_factor + if shared_output is not None: + final_hidden_states = final_hidden_states + shared_output + if self.tp_size > 1: + final_hidden_states = ( + self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) + return final_hidden_states.view(num_tokens, hidden_dim) + + +class Glm4MoeAttention(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + hidden_size: int, + num_heads: int, + num_kv_heads: int, + rope_theta: float = 10000, + rope_scaling: Optional[dict[str, Any]] = None, + max_position_embeddings: int = 131072, + head_dim: Optional[int] = None, + rms_norm_eps: float = 1e-05, + qkv_bias: bool = False, + use_qk_norm: bool = False, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = num_kv_heads + if self.total_num_kv_heads >= tp_size: + # Number of KV heads is greater than TP size, so we partition + # the KV heads across 
multiple tensor parallel GPUs. + assert self.total_num_kv_heads % tp_size == 0 + else: + # Number of KV heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = head_dim or (hidden_size // self.total_num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + self.use_qk_norm = use_qk_norm + + self.qkv_proj = QKVParallelLinear(hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=qkv_bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj") + + self.o_proj = RowParallelLinear(self.total_num_heads * self.head_dim, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj") + + partial_rotary_factor = getattr(config, "partial_rotary_factor", 0.5) + self.rotary_emb = get_rope( + self.head_dim, + rotary_dim=self.head_dim, + max_position=max_position_embeddings, + base=rope_theta, + rope_scaling=rope_scaling, + partial_rotary_factor=partial_rotary_factor, + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + if self.use_qk_norm: + self.q_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + self.k_norm = RMSNorm(self.head_dim, eps=rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + if self.use_qk_norm: + q = self.q_norm(q.reshape(-1, self.num_heads, + self.head_dim)).reshape(q.shape) + k = self.k_norm(k.reshape(-1, self.num_kv_heads, + self.head_dim)).reshape(k.shape) + + q, k = self.rotary_emb(positions, q, k) + attn_output = self.attn(q, k, v) + output, _ = self.o_proj(attn_output) + return output + + +class Glm4MoeDecoderLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + enable_eplb: bool = False, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + rope_theta = getattr(config, "rope_theta", 10000) + rope_scaling = getattr(config, "rope_scaling", None) + max_position_embeddings = getattr(config, "max_position_embeddings", + 131072) + # DecoderLayers are created with `make_layers` which passes the prefix + # with the layer's index. 
+ layer_idx = int(prefix.split(sep='.')[-1]) + self.layer_idx = layer_idx + + self.self_attn = Glm4MoeAttention( + config=config, + hidden_size=self.hidden_size, + num_heads=config.num_attention_heads, + num_kv_heads=config.num_key_value_heads, + rope_theta=rope_theta, + rope_scaling=rope_scaling, + max_position_embeddings=max_position_embeddings, + head_dim=config.head_dim, + rms_norm_eps=config.rms_norm_eps, + qkv_bias=config.attention_bias, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + use_qk_norm=config.use_qk_norm, + ) + + if (config.n_routed_experts is not None + and layer_idx >= config.first_k_dense_replace): + self.mlp = Glm4MoE( + config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + enable_eplb=enable_eplb, + ) + else: + self.mlp = Glm4MoeMLP(hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp") + + self.input_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, + eps=config.rms_norm_eps) + self.routed_scaling_factor = config.routed_scaling_factor + + def forward( + self, + positions: torch.Tensor, + hidden_states: torch.Tensor, + residual: Optional[torch.Tensor], + ) -> tuple[torch.Tensor, torch.Tensor]: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm( + hidden_states, residual) + hidden_states = self.self_attn(positions=positions, + hidden_states=hidden_states) + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual) + hidden_states = self.mlp(hidden_states) + return hidden_states, residual + + +@support_torch_compile +class Glm4MoeModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + enable_eplb = vllm_config.parallel_config.enable_eplb + self.config = config + + self.vocab_size = config.vocab_size + + if get_pp_group().is_first_rank: + self.embed_tokens = VocabParallelEmbedding( + config.vocab_size, + config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens") + else: + self.embed_tokens = PPMissingLayer() + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: Glm4MoeDecoderLayer( + config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + enable_eplb=enable_eplb, + ), + prefix=f"{prefix}.layers") + + if get_pp_group().is_last_rank: + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) + residual = 
None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + for i in range(self.start_layer, self.end_layer): + layer = self.layers[i] + hidden_states, residual = layer(positions, hidden_states, residual) + + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) + + hidden_states, _ = self.norm(hidden_states, residual) + return hidden_states + + def make_empty_intermediate_tensors( + self, batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + "hidden_states": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + "residual": + torch.zeros((batch_size, self.config.hidden_size), + dtype=dtype, + device=device), + }) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is not None: + continue + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + + # Anyway, this is an expert weight and should not be + # attempted to load as other weights later + is_expert_weight = True + + # Do not modify `name` since the loop may continue here + # Instead, create a new variable + name_mapped = name.replace(weight_name, param_name) + + if is_pp_missing_parameter(name_mapped, self): + continue + + param = params_dict[name_mapped] + # We should ask the weight loader to return success or not + # here since otherwise we may skip experts with other + # available replicas. 
+ weight_loader = typing.cast(Callable[..., bool], + param.weight_loader) + success = weight_loader(param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # Remapping the name of FP8 kv-scale. + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + if is_pp_missing_parameter(name, self): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params + + +class Glm4MoeForCausalLM(nn.Module, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + fall_back_to_pt_during_load = False + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.quant_config = quant_config + self.model = Glm4MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + if get_pp_group().is_last_rank: + self.lm_head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + else: + self.lm_head = PPMissingLayer() + if self.config.tie_word_embeddings: + self.lm_head.weight = self.model.embed_tokens.weight + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + self.expert_weights = [] + + # Set MoE hyperparameters + self.num_moe_layers = (config.num_hidden_layers - + config.first_k_dense_replace) + self.num_expert_groups = config.n_group + + self.moe_layers: list[FusedMoE] = [] + for layer in self.model.layers: + assert isinstance(layer, Glm4MoeDecoderLayer) + if isinstance(layer.mlp, Glm4MoE): + self.moe_layers.append(layer.mlp.experts) + + # Pick last one layer since the first ones may be dense layers. + example_moe = typing.cast( + Glm4MoE, self.model.layers[config.num_hidden_layers - 1].mlp) + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_shared_experts = example_moe.n_shared_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + def set_eplb_state( + self, + expert_load_view: torch.Tensor, + logical_to_physical_map: torch.Tensor, + logical_replica_count: torch.Tensor, + ) -> None: + for layer_idx, layer in enumerate(self.moe_layers): + # Register the expert weights. 
+ self.expert_weights.append(layer.get_expert_weights()) + layer.set_eplb_state( + moe_layer_idx=layer_idx, + expert_load_view=expert_load_view, + logical_to_physical_map=logical_to_physical_map, + logical_replica_count=logical_replica_count, + ) + + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader(self) + return loader.load_weights(weights) + + +def get_spec_layer_idx_from_weight_name(config: PretrainedConfig, + weight_name: str) -> Optional[int]: + if hasattr(config, + "num_nextn_predict_layers") and (config.num_nextn_predict_layers + > 0): + layer_idx = config.num_hidden_layers + for i in range(config.num_nextn_predict_layers): + if f"layers.{layer_idx+i}." in weight_name: + return layer_idx + i + return None diff --git a/vllm/model_executor/models/glm4_moe_mtp.py b/vllm/model_executor/models/glm4_moe_mtp.py new file mode 100644 index 000000000000..0624640054d1 --- /dev/null +++ b/vllm/model_executor/models/glm4_moe_mtp.py @@ -0,0 +1,307 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The ZhipuAI Team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only GLM-4.5 MTP model compatible with HuggingFace weights.""" + +from collections.abc import Iterable +from typing import Optional + +import torch +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.fused_moe import FusedMoE +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, VocabParallelEmbedding) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors + +from .glm4_moe import Glm4MoeDecoderLayer, get_spec_layer_idx_from_weight_name +from .interfaces import SupportsPP +from .utils import maybe_prefix + + +class SharedHead(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.head = ParallelLMHead(config.vocab_size, + config.hidden_size, + quant_config=quant_config) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return self.norm(hidden_states) + + +class Glm4MoeMultiTokenPredictorLayer(nn.Module): + + def __init__( + self, + config: PretrainedConfig, + prefix: str, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.eh_proj = nn.Linear(config.hidden_size * 2, + config.hidden_size, + bias=False) + self.shared_head = SharedHead(config=config, quant_config=quant_config) + self.mtp_block = Glm4MoeDecoderLayer(config=config, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_index: int = 0, + ) -> torch.Tensor: + assert inputs_embeds is not None + # masking inputs at position 0, as not needed by MTP + inputs_embeds[positions == 0] = 0 + inputs_embeds = self.enorm(inputs_embeds) + previous_hidden_states = self.hnorm(previous_hidden_states) + + hidden_states = self.eh_proj( + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) + + hidden_states, residual = self.mtp_block(positions=positions, + hidden_states=hidden_states, + residual=None) + hidden_states = residual + hidden_states + return hidden_states + + +class Glm4MoeMultiTokenPredictor(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.mtp_start_layer_idx = config.num_hidden_layers + self.num_mtp_layers = config.num_nextn_predict_layers + # to map the exact layer index from weights + self.layers = torch.nn.ModuleDict({ + str(idx): + Glm4MoeMultiTokenPredictorLayer( + config, + f"{prefix}.layers.{idx}", + cache_config=vllm_config.cache_config, + quant_config=vllm_config.quant_config, + ) + for idx in range(self.mtp_start_layer_idx, + self.mtp_start_layer_idx + self.num_mtp_layers) + }) + self.embed_tokens = VocabParallelEmbedding( + 
config.vocab_size, + config.hidden_size, + ) + self.logits_processor = LogitsProcessor(config.vocab_size) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + current_step_idx = (spec_step_idx % self.num_mtp_layers) + return self.layers[str(self.mtp_start_layer_idx + current_step_idx)]( + input_ids, + positions, + previous_hidden_states, + inputs_embeds, + current_step_idx, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> torch.Tensor: + current_step_idx = (spec_step_idx % self.num_mtp_layers) + mtp_layer = self.layers[str(self.mtp_start_layer_idx + + current_step_idx)] + logits = self.logits_processor(mtp_layer.shared_head.head, + mtp_layer.shared_head(hidden_states), + sampling_metadata) + return logits + + +class Glm4MoeMTP(nn.Module, SupportsPP): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.config = vllm_config.model_config.hf_config + self.model = Glm4MoeMultiTokenPredictor(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "model")) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + previous_hidden_states: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + spec_step_idx: int = 0, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, + previous_hidden_states, inputs_embeds, + spec_step_idx) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + spec_step_idx: int = 0, + ) -> Optional[torch.Tensor]: + return self.model.compute_logits(hidden_states, sampling_metadata, + spec_step_idx) + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + + # Params for weights, fp8 weight scales, fp8 activation scales + # (param_name, weight_name, expert_id, shard_id) + expert_params_mapping = FusedMoE.make_expert_params_mapping( + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=self.config.n_routed_experts) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) + if spec_layer is None: + continue + name = self._rewrite_spec_layer_name(spec_layer, name) + for (param_name, weight_name, shard_id) in stacked_params_mapping: + # Skip non-stacked layers and experts (experts handled below). + if weight_name not in name: + continue + # We have mlp.experts[0].gate_proj in the checkpoint. + # Since we handle the experts below in expert_params_mapping, + # we need to skip here BEFORE we update the name, otherwise + # name will be updated to mlp.experts[0].gate_up_proj, which + # will then be updated below in expert_params_mapping + # for mlp.experts[0].gate_gate_up_proj, which breaks load. + if (("mlp.experts." 
in name) and name not in params_dict): + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, + loaded_weight, + name, + shard_id=shard_id, + expert_id=expert_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + # According to DeepSeek-V3 Technical Report, MTP modules + # shares embedding layer. We only load the first weights. + if (spec_layer != self.model.mtp_start_layer_idx + and ".layers" not in name): + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str: + """ + Rewrite the weight name to match the format of the original model. + Add .mtp_block for modules in transformer layer block for spec layer + and rename shared layer weights to be top level. + """ + spec_layer_weight_names = [ + "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head" + ] + shared_weight_names = ["embed_tokens"] + spec_layer_weight = False + shared_weight = False + for weight_name in spec_layer_weight_names: + if weight_name in name: + spec_layer_weight = True + if weight_name in shared_weight_names: + shared_weight = True + break + if not spec_layer_weight: + # treat rest weights as weights for transformer layer block + name = name.replace(f"model.layers.{spec_layer}.", + f"model.layers.{spec_layer}.mtp_block.") + elif shared_weight: + # treat shared weights as top level weights + name = name.replace(f"model.layers.{spec_layer}.", "model.") + return name diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 3440dd656c50..b57130ec84c9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -67,6 +67,7 @@ "Gemma3nForConditionalGeneration": ("gemma3n", "Gemma3nForConditionalGeneration"), # noqa: E501 "GlmForCausalLM": ("glm", "GlmForCausalLM"), "Glm4ForCausalLM": ("glm4", "Glm4ForCausalLM"), + "Glm4MoeForCausalLM": ("glm4_moe", "Glm4MoeForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), "GPTBigCodeForCausalLM": ("gpt_bigcode", "GPTBigCodeForCausalLM"), "GPTJForCausalLM": ("gpt_j", "GPTJForCausalLM"), @@ -244,6 +245,7 @@ "EagleMiniCPMForCausalLM": ("minicpm_eagle", "EagleMiniCPMForCausalLM"), "Eagle3LlamaForCausalLM": ("llama_eagle3", "Eagle3LlamaForCausalLM"), "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"), + "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"), "MedusaModel": ("medusa", "Medusa"), # Temporarily disabled. # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. 
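
For readers tracing the MTP weight remapping above, the following standalone sketch (not part of the diff) mirrors the renaming rules of `_rewrite_spec_layer_name`; the layer index 46 and the example weight names are assumptions chosen only for illustration.

```python
# Illustrative re-implementation of the renaming rules shown in
# Glm4MoeMTP._rewrite_spec_layer_name above. SPEC_LAYER is an assumed
# config.num_hidden_layers, i.e. the index of the first MTP layer.
SPEC_LAYER = 46

def rewrite_spec_layer_name(name: str) -> str:
    spec_layer_weight_names = [
        "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
    ]
    shared_weight_names = ["embed_tokens"]
    prefix = f"model.layers.{SPEC_LAYER}."
    if not any(w in name for w in spec_layer_weight_names):
        # Transformer-block weights move under the .mtp_block submodule.
        return name.replace(prefix, f"{prefix}mtp_block.")
    if any(w in name for w in shared_weight_names):
        # Weights shared with the target model are hoisted to the top level.
        return name.replace(prefix, "model.")
    # Spec-layer-only weights (enorm, hnorm, eh_proj, shared_head) keep
    # their original names.
    return name

print(rewrite_spec_layer_name(f"model.layers.{SPEC_LAYER}.self_attn.qkv_proj.weight"))
# -> model.layers.46.mtp_block.self_attn.qkv_proj.weight
print(rewrite_spec_layer_name(f"model.layers.{SPEC_LAYER}.embed_tokens.weight"))
# -> model.embed_tokens.weight
print(rewrite_spec_layer_name(f"model.layers.{SPEC_LAYER}.eh_proj.weight"))
# -> model.layers.46.eh_proj.weight
```
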
diff --git a/vllm/reasoning/__init__.py b/vllm/reasoning/__init__.py
index 3e5485b883f1..bae593c1dff0 100644
--- a/vllm/reasoning/__init__.py
+++ b/vllm/reasoning/__init__.py
@@ -3,6 +3,7 @@
 
 from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager
 from .deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser
+from .glm4_moe_reasoning_parser import Glm4MoeModelReasoningParser
 from .granite_reasoning_parser import GraniteReasoningParser
 from .hunyuan_a13b_reasoning_parser import HunyuanA13BReasoningParser
 from .qwen3_reasoning_parser import Qwen3ReasoningParser
@@ -14,4 +15,5 @@
     "GraniteReasoningParser",
     "HunyuanA13BReasoningParser",
     "Qwen3ReasoningParser",
+    "Glm4MoeModelReasoningParser",
 ]
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
new file mode 100644
index 000000000000..6511fb49d10e
--- /dev/null
+++ b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from collections.abc import Sequence
+from typing import Optional, Union
+
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaMessage)
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+logger = init_logger(__name__)
+
+
+@ReasoningParserManager.register_module("glm4_moe")
+class Glm4MoeModelReasoningParser(ReasoningParser):
+    """
+    Reasoning parser for the Glm4MoeModel model.
+
+    The Glm4MoeModel model uses <think>...</think> tokens to denote reasoning
+    text within its output. The model provides a strict switch to disable
+    reasoning output via the 'enable_thinking=False' parameter. This parser
+    extracts the reasoning content enclosed by <think> and </think> tokens
+    from the model's output.
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+        self.think_start_token = "<think>"
+        self.think_end_token = "</think>"
+
+        if not self.model_tokenizer:
+            raise ValueError(
+                "The model tokenizer must be passed to the ReasoningParser "
+                "constructor during construction.")
+
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.think_end_token_id = self.vocab.get(self.think_end_token)
+        if (self.think_start_token_id is None
+                or self.think_end_token_id is None):
+            raise RuntimeError(
+                "Glm4MoeModel reasoning parser could not locate "
+                "think start/end tokens in the tokenizer!")
+
+    def is_reasoning_end(self, input_ids: list[int]) -> bool:
+        return self.think_end_token_id in input_ids
+
+    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
+        """
+        Extract the content after the end tokens
+        """
+        if self.think_end_token_id not in input_ids[:-1]:
+            return []
+        else:
+            return input_ids[input_ids.index(self.think_end_token_id) + 1:]
+
+    def extract_reasoning_content_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+    ) -> Union[DeltaMessage, None]:
+        """
+        Extract reasoning content from a delta message.
+        Handles streaming output where previous + delta = current.
+        Uses token IDs for faster processing.
+
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+        """
+        # Skip single special tokens
+        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
+                self.think_start_token_id, self.think_end_token_id
+        ]):
+            return None
+
+        if self.think_start_token_id in previous_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in previous, </think> in delta,
+                # extract reasoning content
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            elif self.think_end_token_id in previous_token_ids:
+                # <think> in previous, </think> in previous,
+                # reasoning content continues
+                return DeltaMessage(content=delta_text)
+            else:
+                # <think> in previous, no </think> in previous or delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        elif self.think_start_token_id in delta_token_ids:
+            if self.think_end_token_id in delta_token_ids:
+                # <think> in delta, </think> in delta, extract reasoning content
+                start_index = delta_text.find(self.think_start_token)
+                end_index = delta_text.find(self.think_end_token)
+                reasoning_content = delta_text[start_index +
+                                               len(self.think_start_token
+                                                   ):end_index]
+                content = delta_text[end_index + len(self.think_end_token):]
+                return DeltaMessage(reasoning_content=reasoning_content,
+                                    content=content if content else None)
+            else:
+                # <think> in delta, no </think> in delta,
+                # reasoning content continues
+                return DeltaMessage(reasoning_content=delta_text)
+        else:
+            # thinking is disabled, just content
+            return DeltaMessage(content=delta_text)
+
+    def extract_reasoning_content(
+            self, model_output: str, request: ChatCompletionRequest
+    ) -> tuple[Optional[str], Optional[str]]:
+        """
+        Extract reasoning content from the model output.
+
+        For text <think>abc</think>xyz:
+        - 'abc' goes to reasoning_content
+        - 'xyz' goes to content
+
+        Returns:
+            tuple[Optional[str], Optional[str]]: reasoning content and content
+        """
+
+        # Check if the model output contains the <think> and </think> tokens.
+        if (self.think_start_token not in model_output
+                or self.think_end_token not in model_output):
+            return None, model_output
+        # Check if the <think> is present in the model output, remove it
+        # if it is present.
+        model_output_parts = model_output.partition(self.think_start_token)
+        model_output = model_output_parts[2] if model_output_parts[
+            1] else model_output_parts[0]
+        # Check if the model output contains the </think> tokens.
+        # If the end token is not found, return the model output as is.
+        if self.think_end_token not in model_output:
+            return None, model_output
+
+        # Extract reasoning content from the model output.
+        reasoning_content, _, content = model_output.partition(
+            self.think_end_token)
+
+        final_content = content or None
+        return reasoning_content, final_content
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index b2926dbd185a..6b6943d76436 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -77,7 +77,8 @@ def __init__(
             "mlp_speculator",
             "eagle",
             "deepseek_mtp",
-            "mimo_mtp")) \
+            "glm4_moe_mtp",
+            "mimo_mtp")) \
             else {"return_hidden_states": True}
 
         ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner

From 2b504eb77031cfc947a9990ead42c8bc8baa98c5 Mon Sep 17 00:00:00 2001
From: Thomas Parnell
Date: Sun, 20 Jul 2025 01:09:58 +0200
Subject: [PATCH 30/57] [Docs] [V1] Update docs to remove enforce_eager limitation for hybrid models.
(#21233) Signed-off-by: Thomas Parnell --- docs/usage/v1_guide.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/usage/v1_guide.md b/docs/usage/v1_guide.md index 12150cf2a82e..498ff3da0ca3 100644 --- a/docs/usage/v1_guide.md +++ b/docs/usage/v1_guide.md @@ -107,12 +107,11 @@ to enable simultaneous generation and embedding using the same engine instance i Models using selective state-space mechanisms instead of standard transformer attention are partially supported. Models that use Mamba-2 layers (e.g., `Mamba2ForCausalLM`) are supported, but models that use older Mamba-1 layers (e.g., `MambaForCausalLM`, `JambaForCausalLM`) are not yet supported. Please note that these models currently require -enforcing eager mode and disabling prefix caching in V1. +disabling prefix caching in V1. Models that combine Mamba-2 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`, `Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`). Please note that -these models currently require enforcing eager mode, disabling prefix caching, and using the FlashInfer attention -backend in V1. +these models currently require disabling prefix caching and using the FlashInfer attention backend in V1. #### Encoder-Decoder Models From 3a1d8940aea57999411b7ea47287d3ad5cb71676 Mon Sep 17 00:00:00 2001 From: Chengji Yao Date: Sat, 19 Jul 2025 20:01:00 -0700 Subject: [PATCH 31/57] [TPU] support fp8 kv cache quantization (#19292) Signed-off-by: Chengji Yao --- tests/entrypoints/llm/test_accuracy.py | 40 +++++++++++++----- tests/v1/tpu/test_pallas.py | 2 + vllm/engine/arg_utils.py | 8 ++-- vllm/platforms/tpu.py | 4 +- vllm/v1/attention/backends/pallas.py | 58 ++++++++++++++++++++++---- vllm/v1/worker/tpu_model_runner.py | 11 ++--- 6 files changed, 95 insertions(+), 28 deletions(-) diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py index 30a666d4c39c..6c5706d16340 100644 --- a/tests/entrypoints/llm/test_accuracy.py +++ b/tests/entrypoints/llm/test_accuracy.py @@ -15,15 +15,18 @@ from vllm.platforms import current_platform MODEL_NAMES = [ - "Qwen/Qwen2-1.5B-Instruct", + "Qwen/Qwen3-1.7B", "google/gemma-3-1b-it", ] +FP8_KV_MODEL_NAMES = [ + "Qwen/Qwen3-1.7B", +] NUM_CONCURRENT = 500 TASK = "gsm8k" FILTER = "exact_match,strict-match" RTOL = 0.03 EXPECTED_VALUES = { - "Qwen/Qwen2-1.5B-Instruct": 0.58, + "Qwen/Qwen3-1.7B": 0.68, "google/gemma-3-1b-it": 0.25, } @@ -70,10 +73,9 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): if current_platform.is_tpu(): # Limit compilation time for TPU V1 - if model == "google/gemma-3-1b-it": - # TPU + google/gemma-3-1b-it + xet doesn't work well. 
- m.setenv("HF_HUB_DISABLE_XET", "1") - + # xet doesn't work well for both Qwen/Qwen3-1.7B and + # google/gemma-3-1b-it + m.setenv("HF_HUB_DISABLE_XET", "1") more_args = "max_model_len=2048,max_num_seqs=64" # Add TP test (if provided) @@ -83,9 +85,27 @@ def test_lm_eval_accuracy_v1_engine(model, monkeypatch: pytest.MonkeyPatch): run_test(model, more_args) -def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch): - """Run with the V0 Engine.""" +@pytest.mark.skipif(not current_platform.is_cuda() + and not current_platform.is_tpu(), + reason="V1 is currently only supported on CUDA and TPU") +@pytest.mark.parametrize("model", FP8_KV_MODEL_NAMES) +def test_lm_eval_accuracy_v1_engine_fp8_kv_cache( + model, monkeypatch: pytest.MonkeyPatch): + """Run with the V1 Engine.""" with monkeypatch.context() as m: - m.setenv("VLLM_USE_V1", "0") - run_test("Qwen/Qwen2-1.5B-Instruct") + m.setenv("VLLM_USE_V1", "1") + + more_args = None + if current_platform.is_tpu(): + # Limit compilation time for TPU V1 + + # xet doesn't work well for Qwen/Qwen3-1.7B + m.setenv("HF_HUB_DISABLE_XET", "1") + more_args = "max_model_len=2048,max_num_seqs=128,kv_cache_dtype=fp8" + + # Add TP test (if provided) + if TPU_TP_TEST_STR: + more_args += ",{}".format(TPU_TP_TEST_STR) + + run_test(model, more_args) diff --git a/tests/v1/tpu/test_pallas.py b/tests/v1/tpu/test_pallas.py index df89133170b8..bfba3af57f71 100644 --- a/tests/v1/tpu/test_pallas.py +++ b/tests/v1/tpu/test_pallas.py @@ -95,4 +95,6 @@ class FakeAttentionLayer: sm_scale=scale, sliding_window=sliding_window, soft_cap=logits_soft_cap, + k_scale=1.0, + v_scale=1.0, ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1ca4917de26b..019ff033eda2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1358,10 +1358,10 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: and not envs.is_set("VLLM_ATTENTION_BACKEND") ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" supported = False - if current_platform.is_rocm() or ( - current_platform.is_cuda() - and current_platform.is_device_capability(100) - ): # handle hpu also for OOT platform + if (current_platform.is_rocm() + or (current_platform.is_cuda() + and current_platform.is_device_capability(100)) + or current_platform.is_tpu()): supported = True elif fp8_attention and will_use_fa: from vllm.attention.utils.fa_utils import ( diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 5ec3be908e7d..febc6ae4662b 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -35,7 +35,9 @@ class TpuPlatform(Platform): device_control_env_var: str = "TPU_VISIBLE_CHIPS" simple_compile_backend: str = "openxla" - supported_quantization: list[str] = ["tpu_int8", "compressed-tensors"] + supported_quantization: list[str] = [ + "fp8", "tpu_int8", "compressed-tensors" + ] additional_env_vars: list[str] = [ "TPU_CHIPS_PER_HOST_BOUNDS", "TPU_HOST_BOUNDS" diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index ac7980c79e4d..9307cd937d5d 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -24,6 +24,19 @@ # TPU requires the head size to be a multiple of 128. TPU_HEAD_SIZE_ALIGNMENT = 128 +# Note: TPU can fp8 as storage dtype but doesn't support converting from uint8 +# from to fp32 directly. 
That's why it has a dtype mapping different from GPU +TPU_STR_DTYPE_TO_TORCH_DTYPE = { + "half": torch.half, + "bfloat16": torch.bfloat16, + "float": torch.float, + "fp8": torch.float8_e4m3fn, + "fp8_e4m3": torch.float8_e4m3fn, + "fp8_e5m2": torch.float8_e5m2, + "int8": torch.int8, + "uint8": torch.uint8, +} + class PallasAttentionBackend(AttentionBackend): @@ -152,8 +165,6 @@ def __init__( self.num_queries_per_kv = self.num_heads // self.num_kv_heads if alibi_slopes is not None: raise NotImplementedError("Alibi slopes is not supported.") - if kv_cache_dtype != "auto": - raise NotImplementedError("FP8 KV cache dtype is not supported.") if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " @@ -161,6 +172,11 @@ def __init__( "are not implemented for " "PallasAttentionBackendImpl") + self.kv_cache_quantized_dtype = None + if kv_cache_dtype != "auto": + self.kv_cache_quantized_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE.get( + kv_cache_dtype.lower().strip()) + def forward( self, layer: AttentionLayer, @@ -194,7 +210,6 @@ def forward( output = torch.ones_like(query) return output - assert layer._k_scale_float == 1.0 and layer._v_scale_float == 1.0 num_tokens, hidden_size = query.shape query = query.view(num_tokens, self.num_heads, self.head_size) key = key.view(-1, self.num_kv_heads, self.head_size) @@ -215,10 +230,21 @@ def forward( # Skip this if sharing KV cache with an earlier attention layer. slot_mapping = attn_metadata.slot_mapping write_to_kv_cache( - key, value, kv_cache, slot_mapping, + key, + value, + kv_cache, + slot_mapping, attn_metadata.num_slices_per_kv_cache_update_block, - attn_metadata.num_kv_update_slices) - + attn_metadata.num_kv_update_slices, + self.kv_cache_quantized_dtype, + layer._k_scale_float, + layer._v_scale_float, + ) + + if self.kv_cache_quantized_dtype is not None and ( + layer._k_scale_float == 0.0 or layer._v_scale_float == 0.0): + raise ValueError( + "k_scale_float and v_scale_float must be non-zero") output = torch.ops.xla.ragged_paged_attention( query, kv_cache, @@ -236,6 +262,8 @@ def forward( sm_scale=self.scale, sliding_window=self.sliding_window, soft_cap=self.logits_soft_cap, + k_scale=layer._k_scale_float, + v_scale=layer._v_scale_float, ) if self.head_size % TPU_HEAD_SIZE_ALIGNMENT != 0: @@ -251,18 +279,32 @@ def write_to_kv_cache( slot_mapping: torch.Tensor, num_slices_per_kv_cache_update_block: int, num_kv_update_slices: torch.Tensor, + kv_cache_quantized_dtype: Optional[torch.dtype] = None, + k_scale: float = 1.0, + v_scale: float = 1.0, ) -> None: """ Write the key and values to the KV cache. 
Args: - key: shape = [num_tokens, num_kv_heads * head_size] - value: shape = [num_tokens, num_kv_heads * head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] kv_cache = [num_blocks, block_size, num_kv_heads * 2, head_size] num_slices_per_kv_cache_update_block: int """ _, page_size, num_combined_kv_heads, head_size = kv_cache.shape head_size = cdiv(head_size, TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT + + if kv_cache_quantized_dtype is not None: + dtype_info = torch.finfo(kv_cache_quantized_dtype) + key = key.to(torch.float32) / k_scale + # NOTE: clamp is added here to avoid out of range of quantized dtype + key = torch.clamp(key, dtype_info.min, dtype_info.max) + key = key.to(kv_cache_quantized_dtype) + value = value.to(torch.float32) / v_scale + value = torch.clamp(value, dtype_info.min, dtype_info.max) + value = value.to(kv_cache_quantized_dtype) + kv = torch.cat([key, value], axis=-1).reshape(-1, num_combined_kv_heads, head_size) diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 1b55e5d61aa9..7ed1cf41011b 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -32,9 +32,10 @@ from vllm.multimodal.utils import group_mm_inputs_by_modality from vllm.pooling_params import PoolingTask from vllm.sequence import IntermediateTensors -from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType, cdiv, - is_pin_memory_available, prev_power_of_2) -from vllm.v1.attention.backends.pallas import (PallasAttentionBackend, +from vllm.utils import (LayerBlockType, cdiv, is_pin_memory_available, + prev_power_of_2) +from vllm.v1.attention.backends.pallas import (TPU_STR_DTYPE_TO_TORCH_DTYPE, + PallasAttentionBackend, PallasMetadata, get_page_size_bytes) from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -142,11 +143,11 @@ def __init__( if cache_config.cache_dtype == "auto": model_dtype = self.dtype if isinstance(model_dtype, str): - self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype] + self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[model_dtype] else: self.kv_cache_dtype = model_dtype else: - self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ + self.kv_cache_dtype = TPU_STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] self._hidden_states_dtype = self.dtype From d1fb65bde367aa6e3d72520c84b60be3d1539917 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Sat, 19 Jul 2025 20:22:02 -0700 Subject: [PATCH 32/57] Enable v1 metrics tests (#20953) Signed-off-by: Seiji Eicher --- .buildkite/test-pipeline.yaml | 1 + tests/v1/metrics/test_ray_metrics.py | 18 ++++++++++++------ vllm/v1/metrics/ray_wrappers.py | 8 +++++++- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 7f1848b4bfbc..114c48dba531 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -264,6 +264,7 @@ steps: - pytest -v -s v1/structured_output - pytest -v -s v1/spec_decode - pytest -v -s v1/kv_connector/unit + - pytest -v -s v1/metrics - pytest -v -s v1/test_serial_utils.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py index 0898ae65e7cd..92f6c6f0e89c 100644 --- a/tests/v1/metrics/test_ray_metrics.py +++ b/tests/v1/metrics/test_ray_metrics.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import os + import pytest import ray +from vllm.config import ModelDType from vllm.sampling_params import SamplingParams from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger @@ -27,7 +30,7 @@ def use_v1_only(monkeypatch): def test_engine_log_metrics_ray( example_prompts, model: str, - dtype: str, + dtype: ModelDType, max_tokens: int, ) -> None: """ Simple smoke test, verifying this can be used without exceptions. @@ -37,11 +40,14 @@ def test_engine_log_metrics_ray( class EngineTestActor: async def run(self): - engine_args = AsyncEngineArgs( - model=model, - dtype=dtype, - disable_log_stats=False, - ) + # Set environment variable inside the Ray actor since environment + # variables from pytest fixtures don't propagate to Ray actors + os.environ['VLLM_USE_V1'] = '1' + + engine_args = AsyncEngineArgs(model=model, + dtype=dtype, + disable_log_stats=False, + enforce_eager=True) engine = AsyncLLM.from_engine_args( engine_args, stat_loggers=[RayPrometheusStatLogger]) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index cce692d6c09e..8384310062dd 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -51,7 +51,13 @@ class RayGaugeWrapper(RayPrometheusMetric): def __init__(self, name: str, documentation: Optional[str] = "", - labelnames: Optional[list[str]] = None): + labelnames: Optional[list[str]] = None, + multiprocess_mode: Optional[str] = ""): + + # All Ray metrics are keyed by WorkerId, so multiprocess modes like + # "mostrecent", "all", "sum" do not apply. This logic can be manually + # implemented at the observability layer (Prometheus/Grafana). + del multiprocess_mode labelnames_tuple = tuple(labelnames) if labelnames else None self.metric = ray_metrics.Gauge(name=name, description=documentation, From 51ba839555a5d122eadd91e9c16463ac288f5fa1 Mon Sep 17 00:00:00 2001 From: Calvin Chen Date: Sun, 20 Jul 2025 16:15:50 +0800 Subject: [PATCH 33/57] [Model] use AutoWeightsLoader for bart (#18299) Signed-off-by: calvin chen <120380290@qq.com> --- vllm/model_executor/models/bart.py | 172 ++++++++++++----------------- 1 file changed, 71 insertions(+), 101 deletions(-) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index a0ec12674f19..3d328c88ff6e 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -46,7 +46,7 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsQuant, SupportsV0Only -from .utils import maybe_prefix +from .utils import AutoWeightsLoader, WeightsMapper, maybe_prefix logger = logging.get_logger(__name__) @@ -700,7 +700,8 @@ def forward( class BartModel(nn.Module, SupportsQuant): _tied_weights_keys = [ - "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" + "encoder.embed_tokens.weight", + "decoder.embed_tokens.weight", ] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -763,10 +764,54 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, return decoder_outputs + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + other_weights = [] + loaded_stacked_params = [] + model_params_dict = dict(self.named_parameters()) + + for name, 
loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if name not in model_params_dict: + continue + param = model_params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + loaded_stacked_params.append(name) + break + else: + if name in model_params_dict: + other_weights.append((name, loaded_weight)) + + loader = AutoWeightsLoader(self) + loaded_params = loader.load_weights(other_weights) + loaded_params.update(loaded_stacked_params) + return loaded_params + class BartForConditionalGeneration(nn.Module, SupportsV0Only, SupportsQuant): - packed_modules_mapping = {"qkv_proj": ["q_proj", "k_proj", "v_proj"]} - base_model_prefix = "model" + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "decoder.": "model.decoder.", + "encoder.": "model.encoder.", + "shared.": "model.shared." + }, + orig_to_new_substr={ + "beta": "bias", + "gamma": "weight", + "LayerNorm": "layernorm", + }, + ) def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -789,7 +834,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.lm_head = BartParallelLMHead(config.vocab_size, config.d_model, embed_scale=embed_scale) - self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) @@ -828,61 +872,12 @@ def compute_logits( sampling_metadata) return logits - stacked_params_mapping = { - "q_proj": { - "param_name": "qkv_proj", - "shard_id": "q", - }, - "k_proj": { - "param_name": "qkv_proj", - "shard_id": "k", - }, - "v_proj": { - "param_name": "qkv_proj", - "shard_id": "v", - }, - } - - params_mapping = { - "beta": "bias", - "gamma": "weight", - "LayerNorm": "layernorm", - } - - def _rename_key(self, key: str): - prefix = f"{self.base_model_prefix}." - key = key[len(prefix):] if key.startswith(prefix) else key - - for src, dst in self.params_mapping.items(): - key = key.replace(src, dst) - - return key - - def _rename_stacked_param( - self, - name: str, - ) -> tuple[str, Optional[str]]: - for key, mapping in self.stacked_params_mapping.items(): - if key in name: - name = name.replace(key, mapping["param_name"]) - return name, mapping["shard_id"] - return name, None - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - - model_params_dict = dict(self.model.named_parameters()) - top_params_dict = dict(self.named_parameters()) - + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights_tuple_list = list(weights) shared_embedding_weight = None - shared_embedding_shard_id = None - for name, loaded_weight in weights_tuple_list: - - name = self._rename_key(name) - name, shard_id = self._rename_stacked_param(name) - if ('shared.weight' in name or 'encoder.embed_tokens.weight' in name or 'decoder.embed_tokens.weight' in name @@ -890,49 +885,24 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): assert shared_embedding_weight is None, ( "Conflicting embedding weights.") shared_embedding_weight = loaded_weight - shared_embedding_shard_id = shard_id - else: - # Skip the specific downstream task weight. - if name.startswith('cls.'): - continue - # use Pooler instead. - if name.startswith('pooler.'): - continue - # Skip loading extra bias for GPTQ models. 
- if name.endswith(".bias") and name not in model_params_dict: - continue - param = model_params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - if shard_id: - weight_loader(param, loaded_weight, shard_id) - else: - weight_loader(param, loaded_weight) - - # Assign shared weight values - encoder_in_param = model_params_dict['encoder.embed_tokens.weight'] - encoder_in_weight_loader = getattr(encoder_in_param, "weight_loader", - default_weight_loader) - - decoder_in_param = model_params_dict['decoder.embed_tokens.weight'] - decoder_in_weight_loader = getattr(decoder_in_param, "weight_loader", - default_weight_loader) - - lm_head_in_param = top_params_dict['lm_head.weight'] - lm_head_in_weight_loader = getattr(lm_head_in_param, "weight_loader", - default_weight_loader) - - assert shared_embedding_weight is not None - - if shared_embedding_shard_id: - encoder_in_weight_loader(encoder_in_param, shared_embedding_weight, - shared_embedding_shard_id) - decoder_in_weight_loader(decoder_in_param, shared_embedding_weight, - shared_embedding_shard_id) - lm_head_in_weight_loader(lm_head_in_param, shared_embedding_weight, - shared_embedding_shard_id) - else: - encoder_in_weight_loader(encoder_in_param, shared_embedding_weight) - decoder_in_weight_loader(decoder_in_param, shared_embedding_weight) - lm_head_in_weight_loader(lm_head_in_param, shared_embedding_weight) + loader = AutoWeightsLoader( + self, + skip_prefixes=(["cls.", "pooler."]), + ) + loaded_params = loader.load_weights(weights_tuple_list, + mapper=self.hf_to_vllm_mapper) + + if shared_embedding_weight is not None: + weight_loader = getattr(self.lm_head.weight, "weight_loader", + default_weight_loader) + weight_loader(self.lm_head.weight, shared_embedding_weight) + + self.model.encoder.embed_tokens.weight = self.lm_head.weight + self.model.decoder.embed_tokens.weight = self.lm_head.weight + loaded_params.update({ + 'model.encoder.embed_tokens.weight', 'lm_head.weight', + 'model.decoder.embed_tokens.weight' + }) + + return loaded_params From 9499e26e2ae18826bcda99ae7e0883268cde03db Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Sun, 20 Jul 2025 15:25:50 +0200 Subject: [PATCH 34/57] [Model] Support VLMs with transformers backend (#20543) Signed-off-by: raushan Signed-off-by: Isotr0py <2037008807@qq.com> Signed-off-by: Isotr0py Co-authored-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py Co-authored-by: Cyrus Leung --- docs/models/supported_models.md | 9 +- .../multimodal/generation/test_common.py | 75 +++ tests/models/registry.py | 1 + vllm/config.py | 39 +- vllm/model_executor/model_loader/utils.py | 49 +- vllm/model_executor/models/registry.py | 12 +- vllm/model_executor/models/transformers.py | 527 ++++++++++++++++-- 7 files changed, 625 insertions(+), 87 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 306a7851a432..0a2f69bd7711 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -18,7 +18,7 @@ These models are what we list in [supported-text-models][supported-text-models] ### Transformers -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! 
Vision-language models currently accept only image inputs, and require setting `--disable_mm_preprocessor_cache` when running. Support for video inputs and caching of multi-modal preprocessors will be added in future releases. To check if the modeling backend is Transformers, you can simply do this: @@ -28,7 +28,7 @@ llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersForCausalLM` then it means it's based on Transformers! +If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers! !!! tip You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference](../serving/offline_inference.md) or `--model-impl transformers` for the [openai-compatible-server](../serving/openai_compatible_server.md). @@ -36,6 +36,9 @@ If it is `TransformersForCausalLM` then it means it's based on Transformers! !!! note vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. +!!! note + In case of vision language models if you are loading with `dtype="auto"`, vLLM loads the whole model with config's `dtype` if it exists. In contrast the native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. + #### Custom models If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM! @@ -99,7 +102,7 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 98461676aa47..9859ac5a89dd 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -35,6 +35,8 @@ REQUIRES_V0_MODELS = [ # V1 Test: not enough KV cache space in C1. 
"fuyu", + # V1 Test: Deadlock issue when processing mm_inputs + "llava-onevision-transformers", ] # yapf: disable @@ -170,6 +172,79 @@ hf_output_post_proc=model_utils.ultravox_trunc_hf_output, marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + #### Transformers fallback to test + ## To reduce test burden, we only test batching arbitrary image size + # Dynamic image length and number of patches + "llava-onevision-transformers": VLMTestInfo( + models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], + test_type=VLMTestType.IMAGE, + prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + max_model_len=16384, + hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + auto_cls=AutoModelForImageTextToText, + vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, + image_size_factors=[(0.25, 0.5, 1.0)], + vllm_runner_kwargs={ + "model_impl": "transformers", + "disable_mm_preprocessor_cache": True, + "enable_prefix_caching": False, + }, + marks=[pytest.mark.core_model], + ), + # FIXME(Isotr0py): Enable this test after + # https://github.com/huggingface/transformers/pull/39470 released + # "idefics3-transformers": VLMTestInfo( + # models=["HuggingFaceTB/SmolVLM-256M-Instruct"], + # test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + # prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + # img_idx_to_prompt=lambda idx: "", + # max_model_len=8192, + # max_num_seqs=2, + # auto_cls=AutoModelForImageTextToText, + # hf_output_post_proc=model_utils.idefics3_trunc_hf_output, + # image_size_factors=[(0.25, 0.5, 1.0)], + # vllm_runner_kwargs={ + # "model_impl": "transformers", + # "disable_mm_preprocessor_cache": True, + # "enable_prefix_caching": False, + # }, + # marks=[pytest.mark.core_model], + # ), + # Pixel values from processor are not 4D or 5D arrays + "qwen2_5_vl-transformers": VLMTestInfo( + models=["Qwen/Qwen2.5-VL-3B-Instruct"], + test_type=VLMTestType.IMAGE, + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForImageTextToText, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + image_size_factors=[(0.25, 0.2, 0.15)], + vllm_runner_kwargs={ + "model_impl": "transformers", + "disable_mm_preprocessor_cache": True, + "enable_prefix_caching": False, + }, + marks=[large_gpu_mark(min_gb=32)], + ), + # Check "auto" with fallback to transformers + "internvl-transformers": VLMTestInfo( + models=["OpenGVLab/InternVL3-1B-hf"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=4096, + use_tokenizer_eos=True, + image_size_factors=[(0.25, 0.5, 1.0)], + vllm_runner_kwargs={ + "model_impl": "auto", + "disable_mm_preprocessor_cache": True, + "enable_prefix_caching": False, + }, + auto_cls=AutoModelForImageTextToText, + marks=[pytest.mark.core_model], + ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], diff --git a/tests/models/registry.py b/tests/models/registry.py index c2f1089af2ac..19725acd6c45 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -499,6 
+499,7 @@ def check_available_online( _TRANSFORMERS_MODELS = { "TransformersForCausalLM": _HfExamplesInfo("ArthurZ/Ilama-3.2-1B", trust_remote_code=True), # noqa: E501 + "TransformersForMultimodalLM": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"), } _EXAMPLE_MODELS = { diff --git a/vllm/config.py b/vllm/config.py index c261f968e7fc..44106dd279b6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -562,6 +562,10 @@ def __post_init__(self) -> None: self.task = "embed" + model_info, arch = self.registry.inspect_model_cls(self.architectures) + self._model_info = model_info + self._architecture = arch + all_supported_tasks = self._get_supported_tasks(self.task) logger.debug("Tasks supported by runner type: %s", all_supported_tasks) supported_runner_types = self._get_supported_runner_types( @@ -587,10 +591,6 @@ def __post_init__(self) -> None: else: self.truncation_side = "right" - model_info, arch = self.registry.inspect_model_cls(self.architectures) - self._model_info = model_info - self._architecture = arch - self.pooler_config = self._init_pooler_config() self.dtype = _get_and_verify_dtype( @@ -674,6 +674,16 @@ def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": "max_model_len must be an integer after __post_init__.") return self + def _get_transformers_backend_cls(self) -> str: + """Determine which Transformers backend class will be used if + `model_impl` is set to `transformers` or `auto`.""" + if self.hf_config != self.hf_text_config: + # If 'hf_text_config' is the same as 'hf_config'. If not, it is + # probably a composite config, i.e. multimodal + return "TransformersForMultimodalLM" + else: + return "TransformersForCausalLM" + @property def registry(self): return me_models.ModelRegistry @@ -681,7 +691,19 @@ def registry(self): @property def architectures(self) -> list[str]: # architectures in the model config. - return getattr(self.hf_config, "architectures", []) + architectures = getattr(self.hf_config, "architectures", []) + # The registry assumes that it can always inspect the vLLM model class + # for a given architecture. This assumption breaks down for the + # Transformers backend, which may use a different class depending on + # the model type. To work around this, we add the correct Transformers + # backend class to the architectures list. We must do this here because + # we need access to the `hf_config` to determine the backend class. 
+ transformers_backend_cls = self._get_transformers_backend_cls() + if (self.model_impl != ModelImpl.VLLM.value + and all(arch != transformers_backend_cls + for arch in architectures)): + architectures.append(transformers_backend_cls) + return architectures @property def architecture(self) -> str: @@ -827,10 +849,9 @@ def _get_preferred_pooling_task( ("EmbeddingModel", "embed"), ("RewardModel", "reward"), ] - _, arch = self.registry.inspect_model_cls(architectures) for suffix, pref_task in suffix_to_preferred_task: - if arch.endswith(suffix): + if self.architecture.endswith(suffix): return pref_task return "embed" @@ -944,10 +965,10 @@ def _resolve_runner( ("EmbeddingModel", "pooling"), ("RewardModel", "pooling"), ] - _, arch = self.registry.inspect_model_cls(self.architectures) for suffix, pref_runner in suffix_to_preferred_runner: - if arch.endswith(suffix) and pref_runner in supported_runner_types: + if self.architecture.endswith( + suffix) and pref_runner in supported_runner_types: return pref_runner if "generate" in supported_runner_types: diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 190d1f006bc4..42c5512905f2 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -25,6 +25,7 @@ as_reward_model, as_seq_cls_model) from vllm.model_executor.models.interfaces import SupportsQuant +from vllm.model_executor.models.registry import _TRANSFORMERS_MODELS from vllm.utils import is_pin_memory_available logger = init_logger(__name__) @@ -169,9 +170,22 @@ def device_loading_context(module: torch.nn.Module, def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]): + if model_config.model_impl == ModelImpl.VLLM: + raise ValueError( + "Attempting to resolve architecture from the Transformers library " + "but the model implementation is set to vLLM. This should never " + "happen.") + for i, arch in enumerate(architectures): - if arch == "TransformersForCausalLM": + if arch in _TRANSFORMERS_MODELS: continue + + if model_config.model_impl == ModelImpl.AUTO: + logger.warning( + "%s has no vLLM implementation, falling back to Transformers " + "implementation. Some features may not be supported and " + "performance may not be optimal.", arch) + auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map", None) or dict() # Make sure that config class is always initialized before model class, @@ -199,25 +213,13 @@ def resolve_transformers_arch(model_config: ModelConfig, "not present in the model config's 'auto_map' (relevant " "if the model is custom).") model_module = auto_modules["AutoModel"] - # TODO(Isotr0py): Further clean up these raises. - # perhaps handled them in _ModelRegistry._raise_for_unsupported? - if model_config.model_impl == ModelImpl.TRANSFORMERS: - if not model_module.is_backend_compatible(): - raise ValueError( - f"The Transformers implementation of {arch} is not " - "compatible with vLLM.") - architectures[i] = "TransformersForCausalLM" - if model_config.model_impl == ModelImpl.AUTO: - if not model_module.is_backend_compatible(): - raise ValueError( - f"{arch} has no vLLM implementation and the Transformers " - "implementation is not compatible with vLLM. Try setting " - "VLLM_USE_V1=0.") - logger.warning( - "%s has no vLLM implementation, falling back to Transformers " - "implementation. 
Some features may not be supported and " - "performance may not be optimal.", arch) - architectures[i] = "TransformersForCausalLM" + + if not model_module.is_backend_compatible(): + raise ValueError( + f"The Transformers implementation of '{arch}' is not " + "compatible with vLLM.") + + architectures[i] = model_config._get_transformers_backend_cls() return architectures @@ -237,8 +239,9 @@ def get_model_architecture( ] vllm_supported_archs = ModelRegistry.get_supported_archs() - vllm_not_supported = not any(arch in vllm_supported_archs - for arch in architectures) + is_supported = lambda arch: (arch in vllm_supported_archs and arch not in + _TRANSFORMERS_MODELS) + vllm_not_supported = not any(is_supported(arch) for arch in architectures) if vllm_not_supported: # try automatic conversion in adapters.py @@ -259,7 +262,7 @@ def get_model_architecture( break if (model_config.model_impl == ModelImpl.TRANSFORMERS or - model_config.model_impl != ModelImpl.VLLM and vllm_not_supported): + model_config.model_impl == ModelImpl.AUTO and vllm_not_supported): architectures = resolve_transformers_arch(model_config, architectures) logger.debug_once("Resolve transformers arch %s", str(architectures)) elif (model_config.quantization is not None diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b57130ec84c9..a85e8b0e7b1b 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -253,6 +253,7 @@ } _TRANSFORMERS_MODELS = { + "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), } # yapf: enable @@ -504,9 +505,14 @@ def _normalize_archs( if causal_lm_arch in self.models: normalized_arch.append(arch) - # make sure Transformers backend is put at the last as a fallback - if len(normalized_arch) != len(architectures): - normalized_arch.append("TransformersForCausalLM") + # NOTE(Isotr0py): Be careful of architectures' order! + # Make sure Transformers backend architecture is at the end of the + # list, otherwise pooling models automatic conversion will fail! + for arch in normalized_arch: + if arch.startswith("TransformersFor"): + normalized_arch.remove(arch) + normalized_arch.append(arch) + return normalized_arch def inspect_model_cls( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 04ee3a454f9d..47cff29caab0 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,8 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Wrapper around `transformers` models""" -from collections.abc import Iterable -from contextlib import nullcontext +from collections.abc import Iterable, Mapping +from contextlib import contextmanager, nullcontext from typing import Literal, Optional, Union import regex as re @@ -41,11 +41,21 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo) +from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of -from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant +from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, + SupportsQuant) from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, - is_pp_missing_parameter, + flatten_bn, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, maybe_prefix) logger = init_logger(__name__) @@ -112,6 +122,269 @@ def replace_linear_class( ) +# Copied from `accelerate` +@contextmanager +def init_on_device_without_buffers(device: torch.device): + """ + A context manager under which models are initialized with all + parameters on the specified device. However buffers are not + initialized on specified device. + + Args: + device (`torch.device`): + Device to initialize all parameters on. 
+ """ + + old_register_parameter = nn.Module.register_parameter + + def register_empty_parameter(module, name, param): + old_register_parameter(module, name, param) + if param is not None: + param_cls = type(module._parameters[name]) + kwargs = module._parameters[name].__dict__ + kwargs["requires_grad"] = param.requires_grad + module._parameters[name] = param_cls( + module._parameters[name].to(device), **kwargs) + + tensor_constructors_to_patch = {} + + def patch_tensor_constructor(fn): + + def wrapper(*args, **kwargs): + kwargs["device"] = device + return fn(*args, **kwargs) + + return wrapper + + try: + nn.Module.register_parameter = register_empty_parameter + for torch_function_name in tensor_constructors_to_patch: + setattr( + torch, torch_function_name, + patch_tensor_constructor(getattr(torch, torch_function_name))) + yield + finally: + nn.Module.register_parameter = old_register_parameter + for torch_function_name, old_torch_function in ( + tensor_constructors_to_patch.items()): + setattr(torch, torch_function_name, old_torch_function) + + +class MultiModalProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.model_config.hf_config + + def get_supported_mm_limits(self): + return {"image": None} + + def get_mm_max_tokens_per_item(self, seq_len, mm_counts): + return {"image": self.get_max_image_tokens()} + + def get_max_image_tokens(self) -> int: + width, height = self.get_max_image_size() + processor = self.get_hf_processor() + mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {} + mm_tokens = processor._get_num_multimodal_tokens( + image_sizes=([height, width], ), **mm_processor_kwargs) + image_tokens = mm_tokens["num_image_tokens"][0] + return image_tokens + + def get_hf_processor(self): + processor = cached_get_processor(self.ctx.model_config.model) + return processor + + def get_max_image_size(self): + return 10_000, 10_000 # hardcode for arbitrary very large size + + +class MultiModalDummyInputsBuilder( + BaseDummyInputsBuilder[MultiModalProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + if "gemma3" in processor.__class__.__name__.lower(): + image_token = processor.boi_token + else: + image_token = getattr(processor, "image_token", "") + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self.info.get_max_image_size() + + return { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + } + + +class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]): + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ): + """ + Given the original multi-modal items for this modality + and HF-processed data, output the updates to perform. + + The information returned by this method is used to update token inputs + which bypass the HF processor. It is also used to update the output of + HF processor if the HF process does not apply prompt updates to text + inputs. + + Moreover, this information is critical to determine the token positions + in order to construct :class:`~vllm-multimodal.input.PlaceholderRange` + for each multi-modal item. 
+ """ + return None + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs, + num_image_patches: torch.Tensor = None, + ): + # HF Processors always return a mask but vLLM doesn't need it + hf_inputs.pop("attention_mask", None) + mm_fields = { + key: MultiModalFieldConfig.flat_from_sizes("image", + num_image_patches) + for key in hf_inputs + } + mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes( + "image", num_image_patches) + mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image") + return mm_fields + + def _apply_hf_processor_text_mm( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ): + """ + Apply the HF processor on the prompt text and multi-modal data + together. + + In addition, return whether prompt replacements have been applied. + """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + processor_data["return_mm_token_type_ids"] = True + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + tok_kwargs=tokenization_kwargs, + ) + processed_data.update(passthrough_data) + + prompt_ids, = processed_data.pop("input_ids").tolist() + mm_token_type_ids = processed_data.pop( + "mm_token_type_ids" + ) if "mm_token_type_ids" in processed_data else processed_data.pop( + "token_type_ids") # for gemma3 only + + return prompt_ids, processed_data, mm_token_type_ids + + def apply( + self, + prompt: Union[str, list[int]], + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Optional[Mapping[str, object]] = None, + return_mm_hashes: bool = False, + ) -> MultiModalInputs: + """ + Process multi-modal inputs to be used in vLLM. + + Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + """ + if return_mm_hashes: + raise ValueError( + "TransformersForMultimodalLM doesn't support mm hashing yet! " + "Probably you didn't set `disable_mm_preprocessor_cache=True`") + + if tokenization_kwargs is None: + tokenization_kwargs = {} + + mm_items = self._to_mm_items(mm_data) + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + (prompt_ids, processed_data, + mm_token_type_ids) = self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + tokenization_kwargs=tokenization_kwargs, + ) + + # HF processor will return `mm_token_type_ids` from which + # we can infer mm_placeholders. Until then hardcode to make code run + # Below tested on Llava. 
Prompts and `mm_token_type_ids` are always bs=1 + mm_positions = torch.where(mm_token_type_ids == 1)[1] + images = mm_items.get_items("image", ImageProcessorItems) + mm_processor_kwargs = (self.info.ctx.model_config.mm_processor_kwargs + or {}) + image_sizes = [] + for item_idx in range(len(images)): + image_size = images.get_image_size(item_idx) + image_sizes.append((image_size.height, image_size.width)) + + mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens( + image_sizes=image_sizes, **mm_processor_kwargs) + + mm_placeholders = {} + split_sizes = mm_tokens_per_modality["num_image_tokens"] + if split_sizes: + chunked_mm_positions = torch.split(mm_positions, split_sizes) + mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] + chunked_mm_tokens = torch.split(mm_tokens, split_sizes) + ranges = [ + PlaceholderRange( + offset=positions[0].item(), + length=positions.shape[0], + is_embed=(mm_tokens == hf_processor.image_token_id).bool()) + for positions, mm_tokens in zip(chunked_mm_positions, + chunked_mm_tokens) + ] + mm_placeholders = {"image": ranges} + + num_image_patches = torch.tensor( + mm_tokens_per_modality["num_image_patches"] + ) if "num_image_patches" in mm_tokens_per_modality else None + processed_data['num_image_patches'] = num_image_patches + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, + num_image_patches), + ) + + return MultiModalInputs( + type="multimodal", + prompt=prompt, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_hashes=None, + mm_placeholders=mm_placeholders, + ) + + class ConfigOverride: """Context manager to temporarily override config attributes.""" @@ -153,6 +426,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config: QuantizationConfig = vllm_config.quant_config self.config = config + self.text_config = config.get_text_config() self.cache_config = cache_config self.device_config = device_config self.model_config = model_config @@ -173,14 +447,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config_override = ConfigOverride( config, sliding_window=config.interleaved_sliding_window) - # Use meta device to delay allocating GPU tensors - with torch.device("meta"), config_override: + # Set correct attn and init on "meta" to delay allocating GPU tensors + # TODO: @raushan, use the public `model.set_attn_implementation()` + # method after v4.54.0 is released + self.text_config._attn_implementation = "vllm" + with init_on_device_without_buffers("meta"), config_override: # FIXME(Isotr0py): We need to refactor this part in the future to # avoid registering an extra model layer, otherwise we will need a # weights mapper to rename weights. 
self.model: PreTrainedModel = AutoModel.from_config( config, - attn_implementation="vllm", torch_dtype=model_config.dtype, trust_remote_code=model_config.trust_remote_code, ) @@ -189,27 +465,25 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.tensor_parallel() # Input embeddings + text_config = config.get_text_config() if not isinstance(self.model.get_input_embeddings(), PPMissingLayer): self.model.set_input_embeddings( VocabParallelEmbedding( - config.vocab_size, - config.hidden_size, - org_num_embeddings=config.vocab_size, + text_config.vocab_size, + text_config.hidden_size, + org_num_embeddings=text_config.vocab_size, quant_config=quant_config, )) # Attention layers self.attention_instances = self.create_attention_instances() - # Initialize buffers (e.g. rotary embedding inverse frequency) - self.init_buffers(self.model) - # Initialize any parameters that have not had their modules replaced self.init_parameters(self.model) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], - config.hidden_size)) + text_config.hidden_size)) def pipeline_parallel(self): """ @@ -240,14 +514,15 @@ def pipeline_parallel(self): # Layers before module list for name in pp_plan[:module_list_idx]: - if self.pp_group.is_first_rank or (self.config.tie_word_embeddings - and self.pp_group.is_last_rank): + if self.pp_group.is_first_rank or ( + self.text_config.tie_word_embeddings + and self.pp_group.is_last_rank): continue setattr(self.model, name, PPMissingLayer()) # Module list - start_layer, end_layer = get_pp_indices(self.config.num_hidden_layers, - self.pp_rank, self.pp_size) + start_layer, end_layer = get_pp_indices( + self.text_config.num_hidden_layers, self.pp_rank, self.pp_size) layers_name = pp_plan[module_list_idx] layers = getattr(self.model, layers_name) for i in range(len(layers)): @@ -298,7 +573,7 @@ def create_attention_instances(self) -> dict[int, Attention]: self.parallel_config) head_size = self.model_config.get_head_size() num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - start, end = get_pp_indices(self.config.num_hidden_layers, + start, end = get_pp_indices(self.text_config.num_hidden_layers, self.pp_rank, self.pp_size) attention_instances = {} @@ -323,35 +598,6 @@ def create_attention_instances(self) -> dict[int, Attention]: prefix=f"{i}.attn") return attention_instances - def init_buffers(self, module: nn.Module): - """ - If a `buffer` is on the `meta` device, then its parent - `module` is the original module created by: - - ```python - with torch.device("meta"): - self.model: PreTrainedModel = AutoModel.from_config(...) - ``` - - This means that: - - `type(module)` is a class from `transformers` - - This class is constructed using a `PretrainedConfig` - """ - for name, buffer in module.named_buffers(recurse=False): - if buffer.device == torch.device("meta"): - if module == self.model: - logger.warning( - "To initialize buffers correctly, we instantiate the " - "parent module and and extract the value of the " - "buffer from it. In this case, the parent module is " - "the base model. Instantiating the entire model here " - "risks GPU OOM. 
Could this buffer be moved to a child " - "module?") - new_buffer = getattr(type(module)(self.config), name) - setattr(module, name, new_buffer) - for child in module.children(): - self.init_buffers(child) - def init_parameters(self, module: nn.Module): """ If a `parameter` is on the `meta` device, then its parent @@ -366,6 +612,7 @@ def init_parameters(self, module: nn.Module): if param.device == torch.device("meta"): new_param = nn.Parameter( torch.empty_like(param.data, + dtype=self.model_config.dtype, device=self.device_config.device)) setattr(module, name, new_param) for child in module.children(): @@ -391,11 +638,16 @@ def forward( if inputs_embeds is not None: inputs_embeds = inputs_embeds[None, ...] + if self.model_config.uses_mrope: + position_ids = positions[:, None] + else: + position_ids = positions[None, ...] + hidden_states = self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, use_cache=False, - position_ids=positions[None, ...], + position_ids=position_ids, attention_instances=self.attention_instances, return_dict=False)[0][0, ...] # we remove batch dimension for now @@ -507,3 +759,180 @@ def load_weights(self, weights: Iterable[tuple[str, if self.config.tie_word_embeddings else None), ) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + +@MULTIMODAL_REGISTRY.register_processor( + MultiModalProcessor, + info=MultiModalProcessingInfo, + dummy_inputs=MultiModalDummyInputsBuilder) +class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA, + SupportsPP, SupportsMultiModal): + embedding_padding_modules = ["lm_head"] + embedding_modules = ["embed_tokens"] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config: PretrainedConfig = vllm_config.model_config.hf_config + quant_config: QuantizationConfig = vllm_config.quant_config + + self.config = config + self.dtype = vllm_config.model_config.dtype + + self.model = TransformersModel(vllm_config=vllm_config, prefix=prefix) + text_config = config.get_text_config() + + if get_pp_group().is_last_rank: + self.unpadded_vocab_size = text_config.vocab_size + self.lm_head = ParallelLMHead( + text_config.vocab_size, + text_config.hidden_size, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "lm_head"), + ) + if text_config.tie_word_embeddings: + self.lm_head = self.lm_head.tie_weights( + self.model.get_input_embeddings()) + + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + text_config.vocab_size, + logit_scale) + else: + self.lm_head = PPMissingLayer() + + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) + + @property + def hf_to_vllm_mapper(self): + # Backwards compatibility for prev released models + # State dicts back then had different formats + # and cannot be loaded with `AutoModel` mapping + # as is + prefix_mapper = { + "language_model.model": "model.language_model", + "text_model.model": "model.text_model", + "vision_tower": "model.vision_tower", + "vqmodel": "model.vqmodel", + "vision_model": "model.vision_model", + "vision_embed_tokens": "model.vision_embed_tokens", + "image_newline": "model.image_newline", + "multi_modal_projector": "model.multi_modal_projector", + "text_model.lm_head": "lm_head", + "language_model.lm_head": "lm_head", + } + # Don't change the order for QwenVL + if 'Qwen2' in self.config.__class__.__name__: + prefix_mapper["model"] = "model.language_model" + prefix_mapper["visual"] = "model.visual" + + return 
WeightsMapper(orig_to_new_prefix=prefix_mapper, ) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. + if inputs_embeds is None: + multimodal_embeds = self.get_multimodal_embeddings(**kwargs) + if multimodal_embeds is not None: + inputs_embeds = self.get_input_embeddings( + input_ids, multimodal_embeds) + input_ids = None + + model_output = self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) + return model_output + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=([ + "lm_head." + ] if self.config.get_text_config().tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_multimodal_embeddings(self, **kwargs): + pixel_values = kwargs.pop("pixel_values", None) + pixel_values = pixel_values if pixel_values is not None else kwargs.pop( + "image_patches", None) + image_embeds = kwargs.pop("image_embeds", None) + + if image_embeds is not None: + return image_embeds + + if pixel_values is None and image_embeds is None: + return None + + num_image_patches = kwargs.pop("num_image_patches") + if pixel_values is not None: + if isinstance(pixel_values, torch.Tensor): + pixel_values = flatten_bn(pixel_values).to(self.dtype) + elif is_list_of(pixel_values, torch.Tensor): + pixel_values = flatten_bn(flatten_bn(pixel_values), + concat=True).to(self.dtype) + else: + raise ValueError( + f"Unsupported pixel_values type {type(pixel_values)}. " + "Expected `torch.Tensor` or list of `torch.Tensor`.") + + if isinstance(num_image_patches, list): + num_image_patches = torch.cat(num_image_patches) + + vision_embeddings = self.model.model.get_image_features( + pixel_values, + **{ + k: v.flatten(0, 1) + for k, v in kwargs.items() + }, + ) + + if isinstance(vision_embeddings, torch.Tensor): + if vision_embeddings.ndim == 2: + vision_embeddings = vision_embeddings.unsqueeze(0) + + # Embeddings have to be 2D tensors of length `num_images` + # but transformers returns concat tensors if each patch + # is of different size. 
We split it back to make vLLM happy + vision_embeddings = torch.split( + vision_embeddings, + num_image_patches.flatten().tolist()) + vision_embeddings = [ + embed.flatten(start_dim=0, end_dim=-2) + for embed in vision_embeddings + ] + + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings=None, + ) -> torch.Tensor: + inputs_embeds = self.model.model.get_input_embeddings()(input_ids) + if (multimodal_embeddings is not None + and len(multimodal_embeddings) != 0): + mask = (input_ids == self.config.image_token_id) + mask = mask.unsqueeze(-1).expand_as(inputs_embeds) + multimodal_embeddings = torch.cat(multimodal_embeddings) + + inputs_embeds = inputs_embeds.masked_scatter( + mask, multimodal_embeddings) + return inputs_embeds From 7ba34b1241ada58f8212f350a8b17382cb412cf2 Mon Sep 17 00:00:00 2001 From: Jiayi Yan <66017932+1195343015@users.noreply.github.com> Date: Mon, 21 Jul 2025 01:12:10 +0800 Subject: [PATCH 35/57] [bugfix] fix syntax warning caused by backslash (#21251) --- examples/offline_inference/neuron_eagle.py | 2 +- tests/v1/kv_connector/unit/test_nixl_connector.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py index 0b2070c8e253..8b1d235ff974 100644 --- a/examples/offline_inference/neuron_eagle.py +++ b/examples/offline_inference/neuron_eagle.py @@ -54,7 +54,7 @@ def main(): for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}") + print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}") if __name__ == "__main__": diff --git a/tests/v1/kv_connector/unit/test_nixl_connector.py b/tests/v1/kv_connector/unit/test_nixl_connector.py index a0dfd54fb825..99bde919c725 100644 --- a/tests/v1/kv_connector/unit/test_nixl_connector.py +++ b/tests/v1/kv_connector/unit/test_nixl_connector.py @@ -341,7 +341,7 @@ def test_abort_timeout_on_prefiller(monkeypatch, distributed_executor_backend): Test lifecycle of an aborted Remote Prefill request hitting the timeout. 
-----> P | {process request} - <-\--- | {result is NOT delivered, eg proxy is down} + <-/--- | {result is NOT delivered, eg proxy is down} | | | {eventually free blocks} From 8188196a1c8af26134d8e366ebe564c18fb95379 Mon Sep 17 00:00:00 2001 From: Kay Yan Date: Mon, 21 Jul 2025 11:13:02 +0800 Subject: [PATCH 36/57] [CI] Cleanup modelscope version constraint in Dockerfile (#21243) Signed-off-by: Kay Yan --- docker/Dockerfile | 2 +- docker/Dockerfile.xpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index b06c4d33626d..d1fa92ce6d19 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -510,7 +510,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ else \ BITSANDBYTES_VERSION="0.46.1"; \ fi; \ - uv pip install --system accelerate hf_transfer 'modelscope!=1.15.0' "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] + uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3] ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docker/Dockerfile.xpu b/docker/Dockerfile.xpu index 41b4c42e4c4b..3130435ca721 100644 --- a/docker/Dockerfile.xpu +++ b/docker/Dockerfile.xpu @@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer pytest 'modelscope!=1.15.0' + pip install accelerate hf_transfer pytest modelscope ENV VLLM_USAGE_SOURCE production-docker-image \ TRITON_XPU_PROFILE 1 From 92615d7fe80b68206f71b26b00583e6c530d4387 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Sun, 20 Jul 2025 21:58:07 -0700 Subject: [PATCH 37/57] [Docs] Add RFC Meeting to Issue Template (#21279) Signed-off-by: simon-mo --- .github/ISSUE_TEMPLATE/750-RFC.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml index e447c077473f..7ee57c42895c 100644 --- a/.github/ISSUE_TEMPLATE/750-RFC.yml +++ b/.github/ISSUE_TEMPLATE/750-RFC.yml @@ -46,7 +46,7 @@ body: - type: markdown attributes: value: > - Thanks for contributing 🎉! + Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit). - type: checkboxes id: askllm attributes: From 940af1f03a6d47415655ba32c0ba551b24161faa Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sun, 20 Jul 2025 22:29:18 -0700 Subject: [PATCH 38/57] Add the instruction to run e2e validation manually before release (#21023) Signed-off-by: Huy Do --- RELEASE.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 7f5270715212..9352e7ef706c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -52,3 +52,36 @@ After branch cut, we approach finalizing the release branch with clear criteria * Release branch specific changes (e.g. change version identifiers or CI fixes) Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes. 
+ +## Manual validations + +### E2E Performance Validation + +Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI. + +**Current Coverage:** +* Models: Llama3, Llama4, and Mixtral +* Hardware: NVIDIA H100 and AMD MI300x +* *Note: Coverage may change based on new model releases and hardware availability* + +**Performance Validation Process:** + +**Step 1: Get Access** +Request write access to the [pytorch/pytorch-integration-testing](https://github.com/pytorch/pytorch-integration-testing) repository to run the benchmark workflow. + +**Step 2: Review Benchmark Setup** +Familiarize yourself with the benchmark configurations: +* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda) +* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm) + +**Step 3: Run the Benchmark** +Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure: +* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`) +* **vLLM commit**: Set to the RC commit hash + +**Step 4: Review Results** +Once the workflow completes, benchmark results will be available on the [vLLM benchmark dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) under the corresponding branch and commit. + +**Step 5: Performance Comparison** +Compare the current results against the previous release to verify no performance regressions have occurred. Here is an +example of [v0.9.1 vs v0.9.2](https://hud.pytorch.org/benchmark/llms?startTime=Thu%2C%2017%20Apr%202025%2021%3A43%3A50%20GMT&stopTime=Wed%2C%2016%20Jul%202025%2021%3A43%3A50%20GMT&granularity=week&lBranch=releases/v0.9.1&lCommit=b6553be1bc75f046b00046a4ad7576364d03c835&rBranch=releases/v0.9.2&rCommit=a5dd03c1ebc5e4f56f3c9d3dc0436e9c582c978f&repoName=vllm-project%2Fvllm&benchmarkName=&modelName=All%20Models&backendName=All%20Backends&modeName=All%20Modes&dtypeName=All%20DType&deviceName=All%20Devices&archName=All%20Platforms). 
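As a concrete illustration of the comparison described in "Step 5: Performance Comparison" above, the sketch below flags throughput regressions between two benchmark result sets. This is not part of the patch: the `{model: requests/s}` schema and the 5% threshold are assumptions chosen for illustration and do not reflect the actual format of the vLLM benchmark dashboard.

```python
# Hypothetical regression check between two releases' benchmark results.
# The {model: requests-per-second} schema and the 5% tolerance are
# illustrative assumptions only.

def find_regressions(
    baseline: dict[str, float],   # e.g. results for releases/v0.9.1
    candidate: dict[str, float],  # e.g. results for the new release candidate
    tolerance: float = 0.05,
) -> dict[str, float]:
    """Return models whose throughput dropped by more than `tolerance`."""
    regressions: dict[str, float] = {}
    for model, base_tput in baseline.items():
        new_tput = candidate.get(model)
        if new_tput is None:
            continue  # model not covered in the candidate run
        drop = (base_tput - new_tput) / base_tput
        if drop > tolerance:
            regressions[model] = drop
    return regressions


if __name__ == "__main__":
    baseline = {"llama3-8b": 1250.0, "mixtral-8x7b": 430.0}
    candidate = {"llama3-8b": 1180.0, "mixtral-8x7b": 428.0}
    for model, drop in find_regressions(baseline, candidate).items():
        print(f"{model}: throughput dropped by {drop:.1%}")
```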
From 378d33c3929aab549282ebaab193fe43918e591a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 21 Jul 2025 13:50:06 +0800 Subject: [PATCH 39/57] [Bugfix] Fix missing placeholder in logger debug (#21280) Signed-off-by: DarkLight1337 --- vllm/transformers_utils/configs/mistral.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index e66f762eb809..8a9c660b882f 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -42,7 +42,7 @@ def adapt_config_dict(config_dict: dict[str, Any], config = PretrainedConfig.from_dict(config_dict) - logger.debug("Initialized config", config) + logger.debug("Initialized config %s", config) return config From 042af0c8d3f0b8b5319f34e4cb9b690981bb5da4 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 21 Jul 2025 17:22:21 +0800 Subject: [PATCH 40/57] [Model][1/N] Support multiple poolers at model level (#21227) Signed-off-by: DarkLight1337 --- docs/models/pooling_models.md | 53 ++- tests/models/test_transformers.py | 2 +- .../my_gemma_embedding.py | 15 +- vllm/config.py | 8 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/model_executor/layers/pooler.py | 346 +++++++++--------- vllm/model_executor/models/adapters.py | 108 +++--- vllm/model_executor/models/bert.py | 132 +++++-- vllm/model_executor/models/gpt2.py | 16 +- vllm/model_executor/models/gritlm.py | 39 +- vllm/model_executor/models/internlm2.py | 12 +- vllm/model_executor/models/jamba.py | 29 +- vllm/model_executor/models/jina_vl.py | 18 +- vllm/model_executor/models/modernbert.py | 50 ++- vllm/model_executor/models/qwen2_rm.py | 35 +- vllm/model_executor/models/roberta.py | 44 ++- vllm/model_executor/pooling_metadata.py | 7 + vllm/v1/pool/metadata.py | 8 + vllm/v1/worker/gpu_model_runner.py | 16 +- vllm/v1/worker/tpu_model_runner.py | 7 +- vllm/worker/model_runner_base.py | 7 +- vllm/worker/pooling_model_runner.py | 10 +- 22 files changed, 550 insertions(+), 414 deletions(-) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f0de84a66f8b..eef8f20e4e5c 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -11,26 +11,51 @@ before returning them. As shown in the [Compatibility Matrix](../features/compatibility_matrix.md), most vLLM features are not applicable to pooling models as they only work on the generation or decode stage, so performance may not improve as much. -For pooling models, we support the following `--task` options. -The selected option sets the default pooler used to extract the final hidden states: +If the model doesn't implement this interface, you can set `--task` which tells vLLM +to convert the model into a pooling model. -| Task | Pooling Type | Normalization | Softmax | -|---------------------------------|----------------|-----------------|-----------| -| Embedding (`embed`) | `LAST` | ✅︎ | ❌ | -| Classification (`classify`) | `LAST` | ❌ | ✅︎ | -| Sentence Pair Scoring (`score`) | \* | \* | \* | +| `--task` | Model type | Supported pooling tasks | +|------------|----------------------|-------------------------------| +| `embed` | Embedding model | `encode`, `embed` | +| `classify` | Classification model | `encode`, `classify`, `score` | +| `reward` | Reward model | `encode` | -\*The default pooler is always defined by the model. +## Pooling Tasks -!!! 
note - If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table. +In vLLM, we define the following pooling tasks and corresponding APIs: + +| Task | APIs | +|------------|--------------------| +| `encode` | `encode` | +| `embed` | `embed`, `score`\* | +| `classify` | `classify` | +| `score` | `score` | + +\*The `score` API falls back to `embed` task if the model does not support `score` task. + +Each pooling model in vLLM supports one or more of these tasks according to [Pooler.get_supported_tasks][vllm.model_executor.layers.Pooler.get_supported_tasks]. + +By default, the pooler assigned to each task has the following attributes: + +| Task | Pooling Type | Normalization | Softmax | +|------------|----------------|---------------|---------| +| `encode` | `ALL` | ❌ | ❌ | +| `embed` | `LAST` | ✅︎ | ❌ | +| `classify` | `LAST` | ❌ | ✅︎ | + +These defaults may be overridden by the model's implementation in vLLM. When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models, -we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`). +we attempt to override the defaults based on its Sentence Transformers configuration file (`modules.json`), +which takes priority over the model's defaults. + +You can further customize this via the `--override-pooler-config` option, +which takes priority over both the model's and Sentence Transformers's defaults. + +!!! note -!!! tip - You can customize the model's pooling method via the `--override-pooler-config` option, - which takes priority over both the model's and Sentence Transformers's defaults. + The above configuration may be disregarded if the model's implementation in vLLM defines its own pooler + that is not based on [PoolerConfig][vllm.config.PoolerConfig]. 
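As an illustration of the pooling tasks documented above (not part of the patch itself), a minimal offline sketch of the `embed` task together with a pooler override might look like the following. The model name is an arbitrary example, and the exact fields accepted by `PoolerConfig` may vary between vLLM versions; treat this as a sketch under those assumptions rather than canonical usage.

```python
from vllm import LLM
from vllm.config import PoolerConfig

# Any embedding model works here; the name is only an example.
llm = LLM(
    model="BAAI/bge-base-en-v1.5",
    task="embed",
    # Optional override of the default pooler; assumes MEAN pooling is desired.
    override_pooler_config=PoolerConfig(pooling_type="MEAN"),
)

# The `embed` task is served by the `embed` API.
outputs = llm.embed(["Follow the white rabbit."])
print(len(outputs[0].outputs.embedding))  # dimensionality of the embedding
```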
## Offline Inference diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index b87290e96a27..16b9bcffd265 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -144,7 +144,7 @@ def test_quantization( "model", ["jason9693/Qwen2.5-1.5B-apeach"], ) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", ["float"]) def test_classify( hf_runner, vllm_runner, diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py index 797353e4f7a8..fc654f20fff2 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py @@ -8,7 +8,7 @@ import torch.nn as nn from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.models.gemma2 import Gemma2Model from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix from vllm.sequence import IntermediateTensors @@ -26,12 +26,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Gemma2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) - self.pooler = Pooler.from_config_with_defaults( - vllm_config.model_config.pooler_config, - pooling_type=PoolingType.LAST, - normalize=True, - softmax=False, - ) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), + }) self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/config.py b/vllm/config.py index 44106dd279b6..4cafbc926052 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -94,7 +94,7 @@ TaskOption = Literal["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription", "draft"] -_ResolvedTask = Literal["generate", "transcription", "pooling", "embed", +_ResolvedTask = Literal["generate", "transcription", "encode", "embed", "classify", "reward", "draft"] RunnerOption = Literal["auto", "generate", "pooling", "draft"] @@ -103,7 +103,7 @@ _RUNNER_TASKS: dict[RunnerType, list[_ResolvedTask]] = { "generate": ["generate", "transcription"], - "pooling": ["pooling", "embed", "classify", "reward"], + "pooling": ["encode", "embed", "classify", "reward"], "draft": [], } @@ -579,7 +579,7 @@ def __post_init__(self) -> None: # user-selected task if runner_type == "pooling" and self.task == "auto": selected_task = all_supported_tasks[runner_type][-1] - assert selected_task != "pooling" + assert selected_task != "encode" self.task = selected_task self.supported_runner_types = supported_runner_types self.runner_type = runner_type @@ -884,7 +884,7 @@ def _get_supported_pooling_tasks( supported_tasks = list[_ResolvedTask]() if registry.is_pooling_model(architectures): - supported_tasks.append("pooling") + supported_tasks.append("encode") # For now, users must specify the task (other than "pooling") # to use for pooling models diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3f0c1c85dee6..57240bb4f333 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1668,7 +1668,7 @@ async def init_app_state( 
request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, - ) if "pooling" in model_config.supported_tasks else None + ) if "encode" in model_config.supported_tasks else None state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 6a474b8e73a3..c06cca080227 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import ABC, abstractmethod +from collections.abc import Mapping, Set from dataclasses import dataclass from enum import IntEnum +from itertools import groupby from typing import Callable, Optional, TypeVar, Union import torch import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig -from typing_extensions import assert_never from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.pooling_metadata import ( # noqa: E501 @@ -21,6 +22,10 @@ from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata] +PoolingFn = Callable[ + [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], + Union[torch.Tensor, list[torch.Tensor]]] +ClassifierFn = Callable[[torch.Tensor], torch.Tensor] class PoolingType(IntEnum): @@ -79,37 +84,81 @@ class Pooler(nn.Module, ABC): """The interface required for all poolers used in pooling models in vLLM.""" @staticmethod - def from_config_with_defaults( + def for_encode( pooler_config: PoolerConfig, - pooling_type: PoolingType, - normalize: bool, - softmax: bool, - step_tag_id: Optional[int] = None, - returned_token_ids: Optional[list[int]] = None, - ) -> "Pooler": + *, + default_pooling_type: PoolingType = PoolingType.ALL, + default_normalize: bool = False, + default_softmax: bool = False, + default_step_tag_id: Optional[int] = None, + default_returned_token_ids: Optional[list[int]] = None, + ): resolved_config = ResolvedPoolingConfig.from_config_with_defaults( pooler_config=pooler_config, - pooling_type=pooling_type, - normalize=normalize, - softmax=softmax, - step_tag_id=step_tag_id, - returned_token_ids=returned_token_ids, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + step_tag_id=default_step_tag_id, + returned_token_ids=default_returned_token_ids, ) - if pooling_type == PoolingType.STEP: + if resolved_config.pooling_type == PoolingType.STEP: return StepPooler.from_config(resolved_config) return SimplePooler.from_config(resolved_config) - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + @staticmethod + def for_embed( + pooler_config: PoolerConfig, + *, + default_pooling_type: PoolingType = PoolingType.LAST, + default_normalize: bool = True, + default_softmax: bool = False, + ): + resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + pooler_config=pooler_config, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + ) + + return SimplePooler.from_config(resolved_config) + + @staticmethod + def for_classify( + pooler_config: PoolerConfig, + classifier: Optional[ClassifierFn], + *, + default_pooling_type: PoolingType = PoolingType.LAST, + default_normalize: bool = False, + default_softmax: bool = True, + ): + 
resolved_config = ResolvedPoolingConfig.from_config_with_defaults( + pooler_config=pooler_config, + pooling_type=default_pooling_type, + normalize=default_normalize, + softmax=default_softmax, + ) + base_pooler = SimplePooler.from_config(resolved_config) + if classifier is None: + return base_pooler + + return ClassifierPooler( + pooling=base_pooler.pooling, + classifier=classifier, + act_fn=base_pooler.head.activation, + ) + + @abstractmethod + def get_supported_tasks(self) -> Set[PoolingTask]: + """Determine which pooling tasks are supported.""" + raise NotImplementedError + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: """ - Construct the pooling parameters to use for a task, - or `None` if the task is not supported. + Construct the updated pooling parameters to use for a supported task. """ - return None + return PoolingParamsUpdate() @abstractmethod def forward( @@ -127,9 +176,8 @@ def get_prompt_lens( if isinstance(pooling_metadata, V1PoolingMetadata): return pooling_metadata.prompt_lens - assert isinstance(hidden_states, torch.Tensor) return PoolingTensors.from_pooling_metadata( - pooling_metadata, hidden_states.device).prompt_lens + pooling_metadata, hidden_states[0].device).prompt_lens def get_prompt_token_ids( @@ -149,6 +197,21 @@ def get_prompt_token_ids( ] +def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]: + if isinstance(pooling_metadata, V0PoolingMetadata): + pooling_params = [p for _, p in pooling_metadata.seq_groups] + else: + pooling_params = pooling_metadata.pooling_params + + tasks: list[PoolingTask] = [ + task for pooling_param in pooling_params + if (task := pooling_param.task) is not None + ] + assert len(pooling_params) == len(tasks) + + return tasks + + def get_classification_activation_function(config: PretrainedConfig): return PoolerClassify() @@ -172,7 +235,8 @@ def get_cross_encoder_activation_function(config: PretrainedConfig): return PoolerScore() -def build_output(all_data: torch.Tensor) -> PoolerOutput: +def build_output( + all_data: Union[torch.Tensor, list[torch.Tensor]], ) -> PoolerOutput: all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data] return PoolerOutput(outputs=all_outputs) @@ -193,12 +257,12 @@ def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod": raise NotImplementedError(f"Unsupported method: {pooling_type}") @abstractmethod - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: raise NotImplementedError + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate() + @abstractmethod def forward_one( self, @@ -237,16 +301,8 @@ def forward( class CLSPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -270,16 +326,8 @@ def forward_all( class LastPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def 
get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -299,18 +347,8 @@ def forward_all( class AllPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - if task == "encode": - return PoolingParamsUpdate() - - # The equalities are split up to keep mypy happy - if task == "embed" or task == "classify" or task == "score": - return None - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode"} def forward_one( self, @@ -327,28 +365,13 @@ def forward_all( hidden_states: torch.Tensor, prompt_lens: torch.Tensor, ) -> Union[list[torch.Tensor], torch.Tensor]: - offset = 0 - pooled_data = list[torch.Tensor]() - - for prompt_len in prompt_lens: - pooled_data.append(hidden_states[offset:offset + prompt_len]) - offset += prompt_len - - return pooled_data + return list(hidden_states.split_with_sizes(prompt_lens.tolist())) class MeanPool(PoolingMethod): - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if (task == "encode" or task == "embed" or task == "classify" - or task == "score"): - return PoolingParamsUpdate() - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed", "classify", "score"} def forward_one( self, @@ -529,24 +552,6 @@ class SimplePooler(Pooler): 3. Returns structured results as `PoolerOutput`. """ - @classmethod - def from_config_with_defaults( # type: ignore[override] - cls, - pooler_config: PoolerConfig, - pooling_type: PoolingType, - normalize: bool, - softmax: bool, - ) -> "SimplePooler": - resolved_config = ResolvedPoolingConfig.from_config_with_defaults( - pooler_config=pooler_config, - pooling_type=pooling_type, - normalize=normalize, - softmax=softmax, - ) - assert resolved_config.pooling_type != PoolingType.STEP - - return cls.from_config(resolved_config) - @classmethod def from_config( cls, @@ -563,10 +568,10 @@ def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None: self.pooling = pooling self.head = head - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) def forward( @@ -627,18 +632,11 @@ def extract_states( return pooled_data - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - if task == "encode": - return PoolingParamsUpdate(requires_token_ids=True) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode"} - # The equalities are split up to keep mypy happy - if task == "embed" or task == "classify" or task == "score": - return None - - assert_never(task) + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) def forward( self, @@ -650,68 +648,43 @@ def forward( return build_output(pooled_data) -PoolingFn = Callable[ - [Union[torch.Tensor, list[torch.Tensor]], PoolingMetadata], - Union[torch.Tensor, list[torch.Tensor]]] -ClassifierFn = Callable[[torch.Tensor], torch.Tensor] - - -class ClassifierPooler(nn.Module): +class ClassifierPooler(Pooler): """A pooling layer for classification tasks. This layer does the following: 1. 
Applies a classification layer to the hidden states. 2. Optionally applies a pooler layer. - 3. Applies an activation function to the output. In the case of - classification models it is either sigmoid or softmax. In the - case of scoring models, the same behavior is configuration - dependent, as in the sentence-transformers library. + 3. Applies an activation function to the output. """ + @staticmethod + def act_fn_for_seq_cls(config: ModelConfig): + return get_classification_activation_function(config.hf_config) + + @staticmethod + def act_fn_for_cross_encoder(config: ModelConfig): + return get_cross_encoder_activation_function(config.hf_config) + def __init__( self, - config: ModelConfig, pooling: PoolingFn, classifier: ClassifierFn, - act_fn: Optional[PoolerActivation] = None, + act_fn: PoolerActivation, ) -> None: super().__init__() self.pooling = pooling self.classifier = classifier + self.act_fn = act_fn - self.classification_act_fn = get_classification_activation_function( - config.hf_config) if act_fn is None else act_fn - self.cross_encoder_act_fn = get_cross_encoder_activation_function( - config.hf_config) if act_fn is None else act_fn - - def _get_act_fn(self, task: PoolingTask): - if task == "encode" or task == "classify": - return self.classification_act_fn - if task == "score": - return self.cross_encoder_act_fn - - raise ValueError(f"Unsupported task: {task!r}") - - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if task == "encode" or task == "classify" or task == "score": - return PoolingParamsUpdate() - - if task == "embed": - return None - - assert_never(task) + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"classify", "score"} def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> PoolerOutput: - """Pools sentence pair scores from the hidden_states.""" pooled_data = self.pooling(hidden_states, pooling_metadata) # apply classifier once on the full batch if possible @@ -722,28 +695,59 @@ def forward( else: pooled_output = [self.classifier(data) for data in pooled_data] - task_list: list[PoolingTask] - if isinstance(pooling_metadata, V0PoolingMetadata): - task_list = [ - task for _, pooling_param in pooling_metadata.seq_groups - if (task := pooling_param.task) is not None - ] - else: - task_list = [ - task for pooling_param in pooling_metadata.pooling_params - if (task := pooling_param.task) is not None - ] + scores = self.act_fn(pooled_output) + + return build_output(scores) + + +class DispatchPooler(Pooler): + """Dispatches calls to a sub-pooler based on the pooling task.""" + + def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None: + super().__init__() + + for task, pooler in poolers_by_task.items(): + if task not in pooler.get_supported_tasks(): + raise ValueError( + f"{pooler=} does not support {task=}. 
" + f"Supported tasks: {pooler.get_supported_tasks()}") + + self.poolers_by_task = poolers_by_task + + def get_supported_tasks(self) -> Set[PoolingTask]: + return set(self.poolers_by_task) - assert len(task_list) == len(pooled_output) + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return self.poolers_by_task[task].get_pooling_updates(task) - # shape of scores: (batch_size, num_labels) - if len(set(task_list)) <= 1: - act_fn = self._get_act_fn(task_list[0]) - scores = act_fn(pooled_output) + def forward( + self, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], + pooling_metadata: PoolingMetadata, + ) -> PoolerOutput: + poolers_by_task = self.poolers_by_task + + if isinstance(hidden_states, list): + hidden_states_lst = hidden_states else: - scores = torch.stack([ - self._get_act_fn(task)(vecs) - for task, vecs in zip(task_list, pooled_output) - ]) + prompt_lens = get_prompt_lens(hidden_states, pooling_metadata) + hidden_states_lst = list(hidden_states.split(prompt_lens.tolist())) - return build_output(scores) + outputs = list[PoolingSequenceGroupOutput]() + offset = 0 + for task, group in groupby(get_tasks(pooling_metadata)): + if not (pooler := poolers_by_task.get(task)): + raise ValueError( + f"Unsupported task: {task} " + f"Supported tasks: {self.get_supported_tasks()}") + + num_items = len(list(group)) + group_output: PoolerOutput = pooler( + hidden_states_lst[offset:offset + num_items], + pooling_metadata[offset:offset + num_items], + ) + + outputs.extend(group_output.outputs) + offset += num_items + + return PoolerOutput(outputs) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 31b1d9a8b3c0..867de2c68b4c 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -13,7 +13,6 @@ if TYPE_CHECKING: from vllm.config import VllmConfig - from vllm.model_executor.layers.pooler import PoolingType _T = TypeVar("_T", bound=type[nn.Module]) @@ -34,16 +33,8 @@ def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str: return model_name + pooling_suffix -def _create_pooling_model_cls( - orig_cls: _T, - *, - default_pooling_type: "PoolingType", - default_normalize: bool, - default_softmax: bool, -) -> _T: +def _create_pooling_model_cls(orig_cls: _T) -> _T: # Lazy import - from vllm.model_executor.layers.pooler import Pooler - from .utils import AutoWeightsLoader, WeightsMapper class ModelForPooling(orig_cls, VllmModelForPooling): @@ -71,15 +62,7 @@ def __init__( self._init_pooler(vllm_config, prefix=prefix) def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): - pooler_config = vllm_config.model_config.pooler_config - assert pooler_config is not None - - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=default_pooling_type, - normalize=default_normalize, - softmax=default_softmax, - ) + raise NotImplementedError def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: Support uninitialized params tracking @@ -132,14 +115,20 @@ def as_embedding_model(cls: _T) -> _T: return cls # Lazy import - from vllm.model_executor.layers.pooler import PoolingType - - ModelForEmbedding = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.LAST, - default_normalize=True, - default_softmax=False, - ) + from vllm.model_executor.layers.pooler import DispatchPooler, Pooler + + class ModelForEmbedding(_create_pooling_model_cls(cls)): + + def _init_pooler(self, vllm_config: "VllmConfig", 
prefix: str = ""): + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + { + "encode": Pooler.for_encode(pooler_config), + "embed": Pooler.for_embed(pooler_config), + }, ) + ModelForEmbedding.__name__ = \ _get_pooling_model_name(cls.__name__, "ForEmbedding") @@ -165,20 +154,14 @@ def as_seq_cls_model(cls: _T) -> _T: # Lazy import from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.pooler import (ClassifierPooler, - PoolingType, SimplePooler) + DispatchPooler, Pooler, + PoolingMethod, PoolingType) from vllm.model_executor.models.interfaces import SupportsCrossEncoding from vllm.sequence import IntermediateTensors from .utils import maybe_prefix - ModelForPooling = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.LAST, - default_normalize=False, - default_softmax=True, - ) - - class ModelForSequenceClassification(ModelForPooling, + class ModelForSequenceClassification(_create_pooling_model_cls(cls), SupportsCrossEncoding): def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): @@ -198,19 +181,28 @@ def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - pooler = SimplePooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True, - ) - - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=pooler.pooling, - classifier=self._classifier, - act_fn=pooler.head.activation, - ) + pooling_type_str = pooler_config.pooling_type + pooling_type = (PoolingType.LAST if pooling_type_str is None else + PoolingType[pooling_type_str]) + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=PoolingMethod.from_pooling_type(pooling_type), + classifier=self._classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=PoolingMethod.from_pooling_type(pooling_type), + classifier=self._classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def _classifier(self, x: torch.Tensor): x, _ = self.score(x.float()) @@ -259,14 +251,16 @@ def as_reward_model(cls: _T) -> _T: return cls # Lazy import - from vllm.model_executor.layers.pooler import PoolingType - - ModelForReward = _create_pooling_model_cls( - cls, - default_pooling_type=PoolingType.ALL, - default_normalize=False, - default_softmax=False, - ) + from vllm.model_executor.layers.pooler import DispatchPooler, Pooler + + class ModelForReward(_create_pooling_model_cls(cls)): + + def _init_pooler(self, vllm_config: "VllmConfig", prefix: str = ""): + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) ModelForReward.__name__ = \ _get_pooling_model_name(cls.__name__, "ForReward") diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 006f547bb461..9dc6115f850e 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Iterable, Set from typing import Optional, Union import torch @@ -17,7 
+17,8 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, +from vllm.model_executor.layers.pooler import (ClassifierPooler, + DispatchPooler, Pooler, PoolingMethod, PoolingParamsUpdate, PoolingType) @@ -92,20 +93,29 @@ def __init__(self, config: BertConfig): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.activation = nn.Tanh() - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) + def _head(self, pooled_output: torch.Tensor): + pooled_output = self.dense(pooled_output) + pooled_output = self.activation(pooled_output) + return pooled_output + def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[torch.Tensor, list[torch.Tensor]]: pooled_output = self.pooling(hidden_states, pooling_metadata) - pooled_output = self.dense(pooled_output) - pooled_output = self.activation(pooled_output) + + if isinstance(pooled_output, list): + pooled_output = [self._head(output) for output in pooled_output] + else: + pooled_output = self._head(pooled_output) + return pooled_output @@ -333,18 +343,19 @@ class BertModel(nn.Module, SupportsQuant): packed_modules_mapping = {"qkv_proj": ["query", "key", "value"]} - def __init__(self, - *, - vllm_config: VllmConfig, - prefix: str = "", - embedding_class: type = BertEmbedding, - add_pooling_layer: bool = False): + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type[nn.Module] = BertEmbedding, + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config self.embeddings = embedding_class(config) self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") - self.pooler = BertPooler(config) if add_pooling_layer else None def forward( self, @@ -366,8 +377,7 @@ def forward( token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[tuple[str, - torch.Tensor]]) -> set[str]: + def _load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -395,10 +405,43 @@ def load_weights(self, weights: Iterable[tuple[str, if name in params_dict: other_weights.append((name, loaded_weight)) - loader = AutoWeightsLoader( - self, - skip_prefixes=(["pooler."] if self.pooler is None else []), + return other_weights, loaded_stacked_params + + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + other_weights, loaded_stacked_params = self._load_weights(weights) + + loader = AutoWeightsLoader(self, skip_prefixes=["pooler."]) + loaded_params = loader.load_weights(other_weights) + loaded_params.update(loaded_stacked_params) + return loaded_params + + +class BertPoolingModel(BertModel): + + is_pooling_model = True + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type[nn.Module] = BertEmbedding, + ) -> None: + super().__init__( + vllm_config=vllm_config, + prefix=prefix, + embedding_class=embedding_class, ) + + config = vllm_config.model_config.hf_config + self.pooler = BertPooler(config) + + def load_weights(self, 
weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: + other_weights, loaded_stacked_params = self._load_weights(weights) + + loader = AutoWeightsLoader(self) loaded_params = loader.load_weights(other_weights) loaded_params.update(loaded_stacked_params) return loaded_params @@ -421,6 +464,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + self.model = self._build_model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.pooler = self._build_pooler(pooler_config) @@ -456,10 +501,15 @@ def _build_model(self, embedding_class=BertEmbedding) def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: - return Pooler.from_config_with_defaults(pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) + return DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "embed": + Pooler.for_embed( + pooler_config, + default_pooling_type=PoolingType.CLS, + ), + }) class BertForSequenceClassification(nn.Module, SupportsV0Only, @@ -481,16 +531,32 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config self.num_labels = config.num_labels - self.bert = BertModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "bert"), - embedding_class=BertEmbedding, - add_pooling_layer=True) + self.bert = BertPoolingModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "bert"), + embedding_class=BertEmbedding) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=self.bert.pooler, - classifier=self.classifier, - ) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=self.bert.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=self.bert.pooler, + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 82883bfa890d..98d76337395b 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -43,7 +43,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from ..layers.pooler import Pooler, PoolingType +from ..layers.pooler import DispatchPooler, Pooler from .interfaces import SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, @@ -339,12 +339,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer = GPT2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "gpt2")) self.score = nn.Linear(config.n_embd, config.num_labels, bias=False) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True) + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + 
Pooler.for_classify(pooler_config, classifier=None), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 8443482119b0..8a3fbc6a49f0 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -1,17 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project - +from collections.abc import Set from typing import Optional, Union import numpy as np import torch import torch.nn as nn -from typing_extensions import assert_never from vllm.config import ModelConfig, VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import (Pooler, PoolerHead, - PoolerNormalize, +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolerHead, PoolerNormalize, PoolingParamsUpdate, build_output, get_prompt_lens, get_prompt_token_ids) @@ -135,18 +134,11 @@ def _get_instruction_len(self, prompt_token_ids: np.ndarray) -> int: return instruction_len - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: - # The equalities are split up to keep mypy happy - if task == "encode" or task == "embed": - return PoolingParamsUpdate(requires_token_ids=True) - - if task == "classify" or task == "score": - return None + def get_supported_tasks(self) -> Set[PoolingTask]: + return {"encode", "embed"} - assert_never(task) + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: + return PoolingParamsUpdate(requires_token_ids=True) def forward_one( self, @@ -207,10 +199,10 @@ def __init__(self, model_config: ModelConfig): self.pooling = GritLMMeanPool(model_config) self.head = PoolerHead(PoolerNormalize()) - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) def forward( @@ -262,4 +254,11 @@ def __init__( super().__init__(vllm_config=vllm_config, prefix=prefix, **kwargs) - self.pooler = GritLMPooler(vllm_config.model_config) + pooler_config = vllm_config.model_config.pooler_config + if pooler_config is not None: + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "embed": + GritLMPooler(vllm_config.model_config), + }) diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index d9bbee0a2463..d29779a35e5c 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -22,7 +22,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -429,12 +429,10 @@ def __init__( ) pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False, - ) + assert pooler_config is not None + + self.pooler = DispatchPooler( + 
{"encode": Pooler.for_encode(pooler_config)}, ) def forward( self, diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index e95f3491c6b6..34281b2e99ee 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -19,8 +19,8 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer -from vllm.model_executor.layers.pooler import (ClassifierPooler, PoolingType, - SimplePooler) +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) @@ -584,16 +584,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - pooler = SimplePooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=False, - ) - - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=pooler.pooling, - classifier=self.score, - act_fn=pooler.head.activation, - ) + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + Pooler.for_classify( + pooler_config, + classifier=self.score, + default_pooling_type=PoolingType.LAST, + default_normalize=False, + default_softmax=False, + ), + }) diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 6b191b09b4bf..0c4284f7daaa 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -12,7 +12,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.pooler import DispatchPooler, Pooler from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors @@ -96,11 +96,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.score = JinaVLScorer(config) - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.LAST, - normalize=False, - softmax=True) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + Pooler.for_classify(pooler_config, classifier=None), + "score": + Pooler.for_classify(pooler_config, classifier=None), + }) @classmethod def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 74986f9f5734..be1c3438d9db 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Iterable +from collections.abc import Iterable, Set from typing import Optional, Union import torch @@ -13,7 +13,8 @@ from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import (ClassifierPooler, Pooler, +from 
vllm.model_executor.layers.pooler import (ClassifierPooler, + DispatchPooler, Pooler, PoolingMethod, PoolingParamsUpdate, PoolingType) @@ -271,19 +272,27 @@ def __init__(self, config: ModernBertConfig): eps=config.norm_eps, bias=config.norm_bias) - def get_pooling_updates( - self, - task: PoolingTask, - ) -> Optional[PoolingParamsUpdate]: + def get_supported_tasks(self) -> Set[PoolingTask]: + return self.pooling.get_supported_tasks() + + def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return self.pooling.get_pooling_updates(task) + def _head(self, pooled_output: torch.Tensor): + return self.norm(self.act(self.dense(pooled_output))) + def forward( self, hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[torch.Tensor, list[torch.Tensor]]: pooled_output = self.pooling(hidden_states, pooling_metadata) - pooled_output = self.norm(self.act(self.dense(pooled_output))) + + if isinstance(pooled_output, list): + pooled_output = [self._head(output) for output in pooled_output] + else: + pooled_output = self._head(pooled_output) + return pooled_output @@ -299,11 +308,28 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = ModernBertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "modernbert")) self.classifier = nn.Linear(config.hidden_size, config.num_labels) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=ModernBertPooler(config), - classifier=self.classifier, - ) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=ModernBertPooler(config), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=ModernBertPooler(config), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 58f95d6eebfb..f12e9a041a94 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -15,7 +15,8 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) -from vllm.model_executor.layers.pooler import Pooler, PoolingType, SimplePooler +from vllm.model_executor.layers.pooler import (DispatchPooler, Pooler, + PoolingType) from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -26,7 +27,7 @@ class Qwen2RewardBaseModel(nn.Module, SupportsLoRA, SupportsPP): is_pooling_model = True - pooler: SimplePooler + pooler: Pooler packed_modules_mapping = { "qkv_proj": [ @@ -94,12 +95,12 @@ class Qwen2ForRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config.model_config.hf_config.num_labels = 1 super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.ALL, - normalize=False, - softmax=False) + assert pooler_config is not None + + self.pooler = DispatchPooler( + {"encode": Pooler.for_encode(pooler_config)}, ) class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): 
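The pooler hunks above all follow the same migration: a single pooler built with `from_config_with_defaults` becomes a task-keyed mapping of sub-poolers, with `ClassifierPooler.act_fn_for_seq_cls` and `act_fn_for_cross_encoder` supplying the activation per task. Below is a minimal, self-contained sketch of that dispatch-by-task idea; `SimpleDispatcher` and its lambda sub-poolers are illustrative stand-ins, not the real vLLM `DispatchPooler`/`Pooler` API.

```python
# Toy sketch of dispatch-by-task pooling (assumed names, not vLLM classes).
from typing import Callable

import torch


class SimpleDispatcher:
    """Routes each pooling task to its own sub-pooler callable."""

    def __init__(
        self,
        sub_poolers: dict[str, Callable[[torch.Tensor], torch.Tensor]],
    ) -> None:
        self.sub_poolers = sub_poolers

    def get_supported_tasks(self) -> set[str]:
        return set(self.sub_poolers)

    def __call__(self, task: str, hidden_states: torch.Tensor) -> torch.Tensor:
        if task not in self.sub_poolers:
            raise ValueError(f"Unsupported task: {task!r}. "
                             f"Supported tasks: {self.get_supported_tasks()}")
        return self.sub_poolers[task](hidden_states)


dispatcher = SimpleDispatcher({
    "encode": lambda h: h,                        # keep per-token hidden states
    "classify": lambda h: h[-1].softmax(dim=-1),  # last-token pool + softmax
})
print(dispatcher("classify", torch.randn(4, 8)).shape)  # torch.Size([8])
```

As in the real `DispatchPooler`, an unsupported task fails fast with the set of supported tasks, so a misconfigured model surfaces the error at dispatch time instead of producing silently wrong pooled outputs.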
@@ -107,11 +108,17 @@ class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): vllm_config.model_config.hf_config.num_labels = 2 super().__init__(vllm_config=vllm_config, prefix=prefix) + pooler_config = vllm_config.model_config.pooler_config - self.pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.STEP, - normalize=False, - softmax=True, - step_tag_id=151651, - ) + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode( + pooler_config, + default_pooling_type=PoolingType.STEP, + default_normalize=False, + default_softmax=True, + default_step_tag_id=151651, + ) + }) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 7d3b56ced5c4..c6b411644034 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -9,7 +9,8 @@ from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import ClassifierPooler, CLSPool +from vllm.model_executor.layers.pooler import (ClassifierPooler, CLSPool, + DispatchPooler, Pooler) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel @@ -63,16 +64,10 @@ def forward( # References: # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - + seq_lens_list = seq_lens.tolist() new_pos_list = [] - for positions, tokens in zip(pos_list, token_list): + for positions, tokens in zip(position_ids.split(seq_lens_list), + input_ids.split(seq_lens_list)): # Verify assumption that incoming position are # always a sequence from 0 to N. 
expected_pos = torch.arange(positions.size()[0], @@ -184,15 +179,30 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.num_labels = config.num_labels self.roberta = BertModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "bert"), - embedding_class=RobertaEmbedding, - add_pooling_layer=False) + embedding_class=RobertaEmbedding) self.classifier = RobertaClassificationHead(config) - self.pooler = ClassifierPooler( - vllm_config.model_config, - pooling=CLSPool(), - classifier=self.classifier, - ) + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + + self.pooler = DispatchPooler({ + "encode": + Pooler.for_encode(pooler_config), + "classify": + ClassifierPooler( + pooling=CLSPool(), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_seq_cls( + vllm_config.model_config), + ), + "score": + ClassifierPooler( + pooling=CLSPool(), + classifier=self.classifier, + act_fn=ClassifierPooler.act_fn_for_cross_encoder( + vllm_config.model_config), + ), + }) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) diff --git a/vllm/model_executor/pooling_metadata.py b/vllm/model_executor/pooling_metadata.py index 4dd443bc26ea..e6f1ca61dd29 100644 --- a/vllm/model_executor/pooling_metadata.py +++ b/vllm/model_executor/pooling_metadata.py @@ -38,6 +38,13 @@ def __repr__(self) -> str: f"seq_data={self.seq_data}, " f"prompt_lens={self.prompt_lens})") + def __getitem__(self, indices: slice): + return PoolingMetadata( + seq_groups=self.seq_groups[indices], + seq_data=dict(list(self.seq_data.items())[indices]), + prompt_lens=self.prompt_lens[indices], + ) + @dataclass class PoolingTensors: diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py index 5f321cd87c52..28af720d05fd 100644 --- a/vllm/v1/pool/metadata.py +++ b/vllm/v1/pool/metadata.py @@ -15,3 +15,11 @@ class PoolingMetadata: prompt_lens: torch.Tensor prompt_token_ids: Optional[torch.Tensor] pooling_params: list[PoolingParams] + + def __getitem__(self, indices: slice): + return PoolingMetadata( + prompt_lens=self.prompt_lens[indices], + prompt_token_ids=None if self.prompt_token_ids is None else + self.prompt_token_ids[indices], + pooling_params=self.pooling_params[indices], + ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 670e653929ce..cd66d8bcd634 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import gc import time from contextlib import contextmanager -from typing import TYPE_CHECKING, Any, Optional, Union, cast, get_args +from typing import TYPE_CHECKING, Any, Optional, Union, cast import numpy as np import torch @@ -415,15 +415,11 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: generator = None if pooling_params: - assert pooling_params.task is not None, ( + assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") model = cast(VllmModelForPooling, self.model) - to_update = (model.pooler.get_pooling_updates( - pooling_params.task)) - assert to_update is not None, ( - f"{pooling_params.task=} is not supported by the model") - + to_update = model.pooler.get_pooling_updates(task) to_update.apply(pooling_params) self.requests[req_id] = CachedRequestState( @@ -1122,10 +1118,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if 
model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def apply_grammar_bitmask( self, @@ -2247,7 +2240,6 @@ def _dummy_pooler_run( dummy_pooling_params = PoolingParams(task=dummy_task) to_update = model.pooler.get_pooling_updates(dummy_task) - assert to_update is not None to_update.apply(dummy_pooling_params) dummy_metadata = PoolingMetadata( diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 7ed1cf41011b..aad45b6abd12 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -3,7 +3,7 @@ import bisect import gc import time -from typing import TYPE_CHECKING, Any, Optional, cast, get_args +from typing import TYPE_CHECKING, Any, Optional, cast from unittest.mock import patch import numpy as np @@ -491,10 +491,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index b0737dfe3197..62f26ac57a98 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -4,7 +4,7 @@ import dataclasses from abc import ABC, abstractmethod from typing import (TYPE_CHECKING, Any, Dict, Generic, List, Optional, Type, - TypeVar, get_args) + TypeVar) import torch import torch.nn as nn @@ -230,10 +230,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - return [ - task for task in get_args(PoolingTask) - if model.pooler.get_pooling_updates(task) - ] + return list(model.pooler.get_supported_tasks()) def execute_model( self, diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index 2c3f4eb3ad4d..d91b16be83d7 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -199,15 +199,11 @@ def _prepare_pooling( pooling_params = seq_group_metadata.pooling_params assert pooling_params is not None - assert pooling_params.task is not None, ( + assert (task := pooling_params.task) is not None, ( "You did not set `task` in the API") - to_update = (cast(VllmModelForPooling, - self.model).pooler.get_pooling_updates( - pooling_params.task)) - assert to_update is not None, ( - f"{pooling_params.task=} is not supported by the model") - + model = cast(VllmModelForPooling, self.model) + to_update = model.pooler.get_pooling_updates(task) to_update.apply(pooling_params) seq_groups.append((seq_ids, pooling_params)) From be54a951a3bddedc98db3afdacc2382431a2e3d0 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:23:57 +0100 Subject: [PATCH 41/57] [Docs] Fix hardcoded links in docs (#21287) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/design/v1/metrics.md | 5 ++--- docs/features/multimodal_inputs.md | 2 +- docs/features/quantization/bitblas.md | 2 +- docs/features/tool_calling.md | 2 +- docs/models/extensions/tensorizer.md | 2 +- 5 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/design/v1/metrics.md b/docs/design/v1/metrics.md index eec42d79d820..e23308f2637c 100644 --- a/docs/design/v1/metrics.md +++ b/docs/design/v1/metrics.md @@ -61,7 +61,7 @@ These are documented under [Inferencing and Serving -> Production Metrics](../.. 
### Grafana Dashboard -vLLM also provides [a reference example](https://docs.vllm.ai/en/stable/examples/online_serving/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. +vLLM also provides [a reference example](../../examples/online_serving/prometheus_grafana.md) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard. The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important: @@ -672,8 +672,7 @@ v0 has support for OpenTelemetry tracing: `--collect-detailed-traces` - [OpenTelemetry blog post](https://opentelemetry.io/blog/2024/llm-observability/) -- [User-facing - docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html) +- [User-facing docs](../../examples/online_serving/opentelemetry.md) - [Blog post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f) - [IBM product diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index f9df2c89c600..e820ace4f8fe 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -98,7 +98,7 @@ To substitute multiple images inside the same text prompt, you can pass in a lis Full example: -If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: +If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: ```python from vllm import LLM diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index ba014d28cde4..6f53a448ee36 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -5,7 +5,7 @@ vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more effic !!! note Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`). Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper. - For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html). + For details see [supported hardware](supported_hardware.md). Below are the steps to utilize BitBLAS with vLLM. diff --git a/docs/features/tool_calling.md b/docs/features/tool_calling.md index 9b9d6e1360e9..8d89dc4c8d8e 100644 --- a/docs/features/tool_calling.md +++ b/docs/features/tool_calling.md @@ -95,7 +95,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha ## Required Function Calling -vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine. +vLLM supports the `tool_choice='required'` option in the chat completion API. 
Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](../usage/v1_guide.md#features) for the V1 engine. When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter. diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 5aa647b19927..6ea61b080cda 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -7,7 +7,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor For more information on CoreWeave's Tensorizer, please refer to [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html). +the [vLLM example script](../../examples/others/tensorize_vllm_model.md). !!! note Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. From e6b90a2805e809022580f2c1f4928c64b5f531f1 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:25:02 +0100 Subject: [PATCH 42/57] [Docs] Make tables more space efficient in `supported_models.md` (#21291) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- docs/models/supported_models.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0a2f69bd7711..33b297ef2d7d 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -314,6 +314,13 @@ See [this page](generative_models.md) for more information on how to use generat Specified using `--task generate`. + + | Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/distributed_serving.md) | [V1](gh-issue:8779) | |--------------|--------|-------------------|----------------------|---------------------------|---------------------| | `AquilaForCausalLM` | Aquila, Aquila2 | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. 
| ✅︎ | ✅︎ | ✅︎ | From d97841078b6e0dde8da36d5a2b8e8857a2c37944 Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Mon, 21 Jul 2025 19:18:33 +0800 Subject: [PATCH 43/57] [Misc] unify variable for LLM instance (#20996) Signed-off-by: Andy Xie --- docs/configuration/model_resolution.md | 2 +- docs/features/lora.md | 4 +- docs/features/quantization/fp8.md | 10 ++- docs/features/quantization/int4.md | 3 +- docs/features/quantization/int8.md | 3 +- docs/models/pooling_models.md | 10 +-- examples/offline_inference/basic/classify.py | 4 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 4 +- .../embed_jina_embeddings_v3.py | 4 +- .../offline_inference/embed_matryoshka_fy.py | 4 +- .../offline_inference/neuron_speculation.py | 12 +-- .../prithvi_geospatial_mae.py | 4 +- examples/offline_inference/qwen3_reranker.py | 8 +- .../test_basic_correctness.py | 4 +- tests/basic_correctness/test_preemption.py | 10 +-- tests/conftest.py | 32 ++++---- tests/core/test_num_computed_tokens_update.py | 2 +- tests/detokenizer/test_stop_reason.py | 2 +- tests/detokenizer/test_stop_strings.py | 42 +++++------ tests/lora/test_llama_tp.py | 20 ++--- tests/metrics/test_metrics.py | 14 ++-- .../test_model_load_with_params.py | 10 +-- .../models/language/generation/test_hybrid.py | 2 +- .../language/generation/test_mistral.py | 14 ++-- tests/models/language/pooling/mteb_utils.py | 18 ++--- tests/models/language/pooling/test_gritlm.py | 4 +- tests/models/language/pooling/test_jina.py | 4 +- .../pooling/test_nomic_max_model_len.py | 6 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_pixtral.py | 5 +- .../multimodal/generation/test_whisper.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 2 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- tests/models/quantization/test_modelopt.py | 6 +- tests/models/quantization/test_nvfp4.py | 6 +- .../test_disable_sliding_window.py | 22 +++--- tests/prefix_caching/test_prefix_caching.py | 6 +- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_quark.py | 4 +- .../test_register_quantization_config.py | 2 +- tests/samplers/test_ignore_eos.py | 2 +- tests/samplers/test_logits_processor.py | 10 +-- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_no_bad_words.py | 12 +-- tests/samplers/test_seeded_generate.py | 2 +- tests/tokenization/test_detokenize.py | 2 +- tests/v1/core/test_scheduler_e2e.py | 12 +-- tests/v1/engine/test_llm_engine.py | 14 ++-- tests/v1/sample/test_logprobs.py | 8 +- tests/v1/sample/test_sampling_params_e2e.py | 74 +++++++++---------- tests/v1/test_oracle.py | 6 +- 53 files changed, 237 insertions(+), 236 deletions(-) diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index d98142a835c7..49576a8217d0 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -14,7 +14,7 @@ For example: ```python from vllm import LLM -model = LLM( +llm = LLM( model="cerebras/Cerebras-GPT-1.3B", hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 ) diff --git a/docs/features/lora.md b/docs/features/lora.md index 6acfdcce4458..ea1b495138c1 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au return tokenizer.apply_chat_template(chat, tokenize=False) - model = LLM( + llm = LLM( model=model_id, enable_lora=True, max_lora_rank=64, @@ -329,7 
+329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au } - outputs = model.generate( + outputs = llm.generate( inputs, sampling_params=SamplingParams( temperature=0.2, diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index a6c0fd78e76b..0661933acd61 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -86,8 +86,9 @@ Load and run the model in `vllm`: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -result = model.generate("Hello my name is") + +llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +result = llm.generate("Hello my name is") print(result[0].outputs[0].text) ``` @@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei ```python from vllm import LLM -model = LLM("facebook/opt-125m", quantization="fp8") + +llm = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB -result = model.generate("Hello, my name is") +result = llm.generate("Hello, my name is") print(result[0].outputs[0].text) ``` diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index f26de73c2f0f..1df32a11ed9d 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 7e1cb3fee94a..45fae58a6486 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index eef8f20e4e5c..741ae2d79c1e 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -174,11 +174,11 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -model = LLM(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) -outputs = model.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = llm.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) print(outputs[0].outputs) ``` diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 219064e97429..aaf0e83c9dee 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -28,10 +28,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="classify" for classification models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate logits. The output is a list of ClassificationRequestOutputs. - outputs = model.classify(prompts) + outputs = llm.classify(prompts) # Print the outputs. 
print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 1114033d5cea..7ff9c7f5e0eb 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -31,10 +31,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 6a08de2d2c38..d37527b0a131 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -27,10 +27,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="score" for cross-encoder models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = model.score(text_1, texts_2) + outputs = llm.score(text_1, texts_2) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index e68128399ba2..7d78b8c63c63 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -30,11 +30,11 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. # Only text matching task is supported for now. See #16120 - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:") diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7f5d74d9a3ae..50a645ba8270 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -30,10 +30,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32)) # Print the outputs. 
print("\nGenerated Outputs:") diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 2ef69f29863d..26276cba202b 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -25,7 +25,7 @@ def config_buckets(): os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -def initialize_model(): +def initialize_llm(): """Create an LLM with speculative decoding.""" return LLM( model="openlm-research/open_llama_7b", @@ -43,9 +43,9 @@ def initialize_model(): ) -def process_requests(model: LLM, sampling_params: SamplingParams): +def process_requests(llm: LLM, sampling_params: SamplingParams): """Generate texts from prompts and print them.""" - outputs = model.generate(prompts, sampling_params) + outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text @@ -53,12 +53,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams): def main(): - """Main function that sets up the model and processes prompts.""" + """Main function that sets up the llm and processes prompts.""" config_buckets() - model = initialize_model() + llm = initialize_llm() # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, top_k=1) - process_requests(model, sampling_params) + process_requests(llm, sampling_params) if __name__ == "__main__": diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 567c448a8c97..6dc03e85baa9 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -140,7 +140,7 @@ class PrithviMAE: def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM( + self.llm = LLM( model=os.path.join(os.path.dirname(__file__), "./model"), skip_tokenizer_init=True, dtype="float32", @@ -158,7 +158,7 @@ def run(self, input_data, location_coords): prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} - outputs = self.model.encode(prompt, use_tqdm=False) + outputs = self.llm.encode(prompt, use_tqdm=False) print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/qwen3_reranker.py index fe3cebc348f1..b0fd57237d47 100644 --- a/examples/offline_inference/qwen3_reranker.py +++ b/examples/offline_inference/qwen3_reranker.py @@ -17,13 +17,13 @@ # Models converted offline using this method can not only be more efficient # and support the vllm score API, but also make the init parameters more # concise, for example. -# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") +# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") # If you want to load the official original version, the init parameters are # as follows. 
-def get_model() -> LLM: +def get_llm() -> LLM: """Initializes and returns the LLM model for Qwen3-Reranker.""" return LLM( model=model_name, @@ -77,8 +77,8 @@ def main() -> None: ] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] - model = get_model() - outputs = model.score(queries, documents) + llm = get_llm() + outputs = llm.score(queries, documents) print("-" * 30) print([output.outputs.score for output in outputs]) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2e103019f7af..13ddf035a55e 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None: monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model: - if isinstance(vllm_model.model.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): v1_test_failed_model_execution(vllm_model) def v1_test_failed_model_execution(vllm_model): - engine = vllm_model.model.llm_engine + engine = vllm_model.llm.llm_engine mocked_execute_model = Mock( side_effect=RuntimeError("Mocked Critical Error")) engine.engine_core.engine_core.model_executor.execute_model =\ diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 341a39a42b85..db2fa2f6bef6 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -81,7 +81,7 @@ def test_chunked_prefill_recompute( disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): @@ -118,10 +118,10 @@ def test_preemption( distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) + vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -174,12 +174,12 @@ def test_preemption_infeasible( ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( + req_outputs = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params, ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) # Verify the request is ignored and not hang. 
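For reference, the concise loading path that the `qwen3_reranker.py` comments earlier in this patch allude to looks roughly like the sketch below; the model name and `task="score"` come from those comments, the query/document strings are placeholders, and the snippet is an illustration rather than a tested addition to the example.

```python
from vllm import LLM

# Converted sequence-classification checkpoint named in the example's comments;
# with it, initialization reduces to the model name plus the scoring task.
llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

outputs = llm.score(
    ["how do reranker models work?"],                           # placeholder query
    ["Rerankers score how well a document answers a query."],   # placeholder document
)
print([output.outputs.score for output in outputs])
```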
diff --git a/tests/conftest.py b/tests/conftest.py index f3524d1fe2a6..a18dbf58c803 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -784,7 +784,7 @@ def __init__( enforce_eager: Optional[bool] = False, **kwargs, ) -> None: - self.model = LLM( + self.llm = LLM( model=model_name, task=task, tokenizer=tokenizer_name, @@ -854,9 +854,9 @@ def generate( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: @@ -902,9 +902,9 @@ def generate_w_logprobs( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -924,8 +924,8 @@ def generate_encoder_decoder_w_logprobs( ''' assert sampling_params.logprobs is not None - req_outputs = self.model.generate(encoder_decoder_prompts, - sampling_params=sampling_params) + req_outputs = self.llm.generate(encoder_decoder_prompts, + sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params @@ -1018,7 +1018,7 @@ def generate_beam_search( videos=videos, audios=audios) - outputs = self.model.beam_search( + outputs = self.llm.beam_search( inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] @@ -1029,7 +1029,7 @@ def generate_beam_search( return returned_outputs def classify(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.classify(prompts) + req_outputs = self.llm.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def embed(self, @@ -1044,11 +1044,11 @@ def embed(self, videos=videos, audios=audios) - req_outputs = self.model.embed(inputs, *args, **kwargs) + req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.encode(prompts) + req_outputs = self.llm.encode(prompts) return [req_output.outputs.data for req_output in req_outputs] def score( @@ -1058,18 +1058,18 @@ def score( *args, **kwargs, ) -> list[float]: - req_outputs = self.model.score(text_1, text_2, *args, **kwargs) + req_outputs = self.llm.score(text_1, text_2, *args, **kwargs) return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - executor = self.model.llm_engine.model_executor + executor = self.llm.llm_engine.model_executor return executor.apply_model(func) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - del self.model + del self.llm cleanup_dist_env_and_memory() diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 1b958e34df87..9e1b7913dfb9 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) - engine: LLMEngine = 
runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine # In multi-step + chunked-prefill there is no separate single prompt step. # What is scheduled will run for num_scheduler_steps always. diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 9716f7d72a58..1ff679789c95 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -28,7 +28,7 @@ def vllm_model(vllm_runner): def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) - llm = vllm_model.model + llm = vllm_model.llm # test stop token outputs = llm.generate(example_prompts, diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index efe938a20c4f..cb87c44cc399 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -101,42 +101,42 @@ def _stop_token_id(llm): def test_stop_strings(): # If V0, must set enforce_eager=False since we use # async output processing below. - vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) if envs.VLLM_USE_V1: - _stop_basic(vllm_model) + _stop_basic(llm) else: - _set_async_mode(vllm_model, True) - _stop_basic(vllm_model) + _set_async_mode(llm, True) + _stop_basic(llm) - _set_async_mode(vllm_model, False) - _stop_basic(vllm_model) + _set_async_mode(llm, False) + _stop_basic(llm) if envs.VLLM_USE_V1: - _stop_multi_tokens(vllm_model) + _stop_multi_tokens(llm) else: - _set_async_mode(vllm_model, True) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, True) + _stop_multi_tokens(llm) - _set_async_mode(vllm_model, False) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, False) + _stop_multi_tokens(llm) if envs.VLLM_USE_V1: - _stop_partial_token(vllm_model) + _stop_partial_token(llm) else: - _set_async_mode(vllm_model, True) - _stop_partial_token(vllm_model) + _set_async_mode(llm, True) + _stop_partial_token(llm) - _set_async_mode(vllm_model, False) - _stop_partial_token(vllm_model) + _set_async_mode(llm, False) + _stop_partial_token(llm) if envs.VLLM_USE_V1: # FIXME: this does not respect include_in_output=False - # _stop_token_id(vllm_model) + # _stop_token_id(llm) pass else: - _set_async_mode(vllm_model, True) - _stop_token_id(vllm_model) + _set_async_mode(llm, True) + _stop_token_id(llm) - _set_async_mode(vllm_model, False) - _stop_token_id(vllm_model) + _set_async_mode(llm, False) + _stop_token_id(llm) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index bebf44b6dfd7..b1ad1fdd0606 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, model_uri = tmp_path / "vllm" / model_ref / suffix / model_name tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) - loaded_vllm_model = LLM(model=model_ref, - load_format="tensorizer", - enable_lora=True, - enforce_eager=True, - model_loader_extra_config=tensorizer_config, - max_num_seqs=13, - tensor_parallel_size=2, - max_loras=2) + loaded_llm = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_vllm_model, + assert 
do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(loaded_vllm_model, + assert do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 54dbb747de09..8cae8a80d38e 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -41,7 +41,7 @@ def test_metric_counter_prompt_tokens( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() prompt_token_counts = [ len(tokenizer.encode(p)) for p in example_prompts ] @@ -53,7 +53,7 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_prompt_tokens.labels( **stat_logger.labels)._value.get() @@ -77,8 +77,8 @@ def test_metric_counter_generation_tokens( disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -113,8 +113,8 @@ def test_metric_counter_generation_tokens_multi_step( disable_async_output_proc=disable_async_output_proc, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -145,7 +145,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, disable_log_stats=False, gpu_memory_utilization=0.3, served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metrics_tag_content = stat_logger.labels["model_name"] if envs.VLLM_CI_USE_S3: diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 1d2d9f9a65bb..273747630215 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -70,8 +70,8 @@ def 
test_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.llm.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name def check_model(model): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index e4294512338b..2238924c1b50 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -274,7 +274,7 @@ def test_models_preemption_recompute( Tests that outputs are identical with and w/o preemptions (recompute). """ with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.model.llm_engine.scheduler[0] + scheduler = vllm_model.llm.llm_engine.scheduler[0] scheduler.ENABLE_ARTIFICIAL_PREEMPT = True preempt_vllm_outputs = vllm_model.generate_greedy( example_prompts, max_tokens) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c70698ede37a..81a88f2d485e 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, load_format="mistral") as vllm_model: for prompt in SYMBOLIC_LANG_PROMPTS: msg = {"role": "user", "content": prompt} - outputs = vllm_model.model.chat([msg], - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat([msg], + sampling_params=SAMPLING_PARAMS) assert "�" not in outputs[0].outputs[0].text.strip() @@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: load_format="mistral") as vllm_model: msgs = copy.deepcopy(MSGS) - outputs = vllm_model.model.chat(msgs, - tools=TOOLS, - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat(msgs, + tools=TOOLS, + sampling_params=SAMPLING_PARAMS) - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() tool_parser = MistralToolParser(tokenizer) model_output = outputs[0].outputs[0].text.strip() @@ -308,7 +308,7 @@ def test_mistral_guided_decoding( f"Give an example JSON for an employee profile that " f"fits this schema: {SAMPLE_JSON_SCHEMA}" }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + outputs = vllm_model.llm.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 6c4fde5fdfa9..97362f641665 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): super().__init__() - self.model = vllm_model + self.llm = vllm_model self.rng = np.random.default_rng(seed=42) 
def encode( @@ -43,7 +43,7 @@ def encode( # issues by randomizing the order. r = self.rng.permutation(len(sentences)) sentences = [sentences[i] for i in r] - outputs = self.model.embed(sentences, use_tqdm=False) + outputs = self.llm.embed(sentences, use_tqdm=False) embeds = np.array(outputs) embeds = embeds[np.argsort(r)] return embeds @@ -61,10 +61,10 @@ def predict( queries = [s[0] for s in sentences] corpus = [s[1] for s in sentences] - outputs = self.model.score(queries, - corpus, - truncate_prompt_tokens=-1, - use_tqdm=False) + outputs = self.llm.score(queries, + corpus, + truncate_prompt_tokens=-1, + use_tqdm=False) scores = np.array(outputs) scores = scores[np.argsort(r)] return scores @@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner, if model_info.architecture: assert (model_info.architecture - in vllm_model.model.llm_engine.model_config.architectures) + in vllm_model.llm.llm_engine.model_config.architectures) vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model), MTEB_EMBED_TASKS) - vllm_dtype = vllm_model.model.llm_engine.model_config.dtype + vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype with hf_runner(model_info.name, is_sentence_transformer=True, @@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner, max_num_seqs=8, **vllm_extra_kwargs) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config if model_info.architecture: assert (model_info.architecture in model_config.architectures) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 1274657991bf..efa119bb7659 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -120,7 +120,7 @@ def test_gritlm_offline_embedding(vllm_runner): task="embed", max_model_len=MAX_MODEL_LEN, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm d_rep = run_llm_encode( llm, @@ -167,7 +167,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner): task="generate", max_model_len=MAX_MODEL_LEN, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=256) outputs = llm.generate(input, sampling_params=sampling_params) diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 9bfe7411e16b..16c711407aea 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -87,10 +87,10 @@ def test_matryoshka( task="embed", dtype=dtype, max_model_len=None) as vllm_model: - assert vllm_model.model.llm_engine.model_config.is_matryoshka + assert vllm_model.llm.llm_engine.model_config.is_matryoshka matryoshka_dimensions = ( - vllm_model.model.llm_engine.model_config.matryoshka_dimensions) + vllm_model.llm.llm_engine.model_config.matryoshka_dimensions) assert matryoshka_dimensions is not None if dimensions not in matryoshka_dimensions: diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py index 250b3a52835a..7413ef578e38 100644 --- a/tests/models/language/pooling/test_nomic_max_model_len.py +++ b/tests/models/language/pooling/test_nomic_max_model_len.py @@ -23,7 +23,7 @@ def test_default(model_info, vllm_runner): with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = 
vllm_model.llm.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. @@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 256 # set 512 < max_model_len <= 2048 @@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): else: with vllm_runner(model_info.name, task="embed", max_model_len=1024) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 1024 diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 33aff1c873fc..c7399e01c735 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, model_name, task="embed", max_model_len=max_model_len) as vllm_model: - llm_output = vllm_model.model.encode( + llm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) assert llm_output == f"""truncate_prompt_tokens value diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 1def825ab087..e157d6f4a79d 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -180,8 +180,7 @@ def test_chat( ) as vllm_model: outputs = [] for msg in MSGS: - output = vllm_model.model.chat(msg, - sampling_params=SAMPLING_PARAMS) + output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: - outputs = vllm_model.model.generate(prompt) + outputs = vllm_model.llm.generate(prompt) assert len(outputs) == 1, f"{len(outputs)=}" output: RequestOutput = outputs[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 363d55153aac..4a65e8c95204 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -106,7 +106,7 @@ def run_test( tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams( temperature=0, diff --git 
a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8c83d8f8a8a2..cf8962ce4975 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -85,7 +85,7 @@ def run_test( enforce_eager=enforce_eager, task=task, **vllm_runner_kwargs_) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index f889eea5e839..a6f5aeccf94e 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -96,7 +96,7 @@ def _run_test( dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() texts = [ # this is necessary because vllm_model.embed will not apply any # templating to the prompt, and therefore lacks an image_pad diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py index 50c91f1f81ca..712b6801de45 100644 --- a/tests/models/multimodal/pooling/test_jinavl_reranker.py +++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py @@ -56,7 +56,7 @@ def create_image_param(url: str) -> ChatCompletionContentPartImageParam: mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, ) as vllm_model: - outputs = vllm_model.model.score(query, documents) + outputs = vllm_model.llm.score(query, documents) return [output.outputs.score for output in outputs] diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 6ad526cc893f..e23d4d9d211d 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -45,7 +45,7 @@ reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index b95dad9a4eff..b3c217e729e4 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -46,7 +46,7 @@ reason="modelopt_fp4 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index f00a8f6998cb..b940ab416e67 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -25,25 +25,25 @@ @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len - vllm_disabled_model = LLM(model, disable_sliding_window=True) - vllm_disabled_model.generate("Hi my name is") - model_config = vllm_disabled_model.llm_engine.model_config + disabled_llm = LLM(model, disable_sliding_window=True) + disabled_llm.generate("Hi my name is") + model_config = disabled_llm.llm_engine.model_config assert model_config.max_model_len == sliding_len, ( "Max len expected to equal sliding_len of %s, but got %s", sliding_len, model_config.max_model_len) - del vllm_disabled_model + del disabled_llm cleanup_dist_env_and_memory() - vllm_enabled_model = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - vllm_enabled_model.generate("Hi my name is") - model_config = vllm_enabled_model.llm_engine.model_config + enabled_llm = LLM(model, + enforce_eager=True, + disable_sliding_window=False, + enable_prefix_caching=False) + enabled_llm.generate("Hi my name is") + model_config = enabled_llm.llm_engine.model_config assert model_config.max_model_len == full_len, ( "Max len expected to equal full_len of %s, but got %s", full_len, model_config.max_model_len) - del vllm_enabled_model + del enabled_llm cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index a65fc934b16a..5bf6ed957c74 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -93,8 +93,8 @@ def test_mixed_requests( # Run all the promopts greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, - greedy_params) + req_outputs = vllm_model.llm.generate(example_prompts, + greedy_params) # Verify number of cached tokens for i in range(len(req_outputs)): @@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model): max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_batched_tokens, ) - engine: LLMEngine = runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore engine.scheduler[0] = scheduler diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 23b999e7c679..aea50e99c1dd 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.model.llm_engine.model_executor. + for name, submodule in (vllm_model.llm.llm_engine.model_executor. 
driver_worker.model_runner.model.named_modules()): if name == "lm_head": assert isinstance(submodule.quant_method, linear_method_cls) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 2db11cb997d1..4a0c8ba4d8a9 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.model.llm_engine.model_executor. + quark_model = (quark_handle.llm.llm_engine.model_executor. driver_worker.model_runner.model) quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker. + fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. model_runner.model) fp8_state_dict = fp8_model.state_dict() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 6c541fdbeeae..84705e92c85b 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch): quantization="custom_quant", enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 7eb9c0b5fb8c..ea4a17dd2306 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -36,7 +36,7 @@ def test_ignore_eos( ignore_eos=True) for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( + ignore_eos_output = vllm_model.llm.generate( prompt, sampling_params=sampling_params) output_length = len(ignore_eos_output[0].outputs[0].token_ids) assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 901c87591264..123f9595e97b 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -26,7 +26,7 @@ def test_logits_processor_force_generate( dtype: str, ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() repeat_times = 2 enforced_answers = " vLLM" vllm_token_ids = tokenizer.encode(enforced_answers, @@ -45,13 +45,13 @@ def pick_vllm(token_ids, logits): ) # test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[0], params=params_with_logprobs, ) # test prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[1], params=SamplingParams( prompt_logprobs=3, @@ -60,11 +60,11 @@ def pick_vllm(token_ids, logits): ) # test grouped requests - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[2], params=SamplingParams(max_tokens=max_tokens), ) - outputs = vllm_model.model._run_engine(use_tqdm=False) + outputs = vllm_model.llm._run_engine(use_tqdm=False) assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 
86c8a03eee10..87f40b100531 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -64,7 +64,7 @@ def test_get_prompt_logprobs( prompt_logprobs=num_top_logprobs, temperature=0.0, detokenize=detokenize) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. @@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, logprobs=None, temperature=0.0, detokenize=detokenize) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none) for i in range(len(results_logprobs_none)): diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42b529ae169d..11803b8d7a5e 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -20,7 +20,7 @@ def v1(run_with_both_engines): def _generate( - model: LLM, + llm: LLM, prompt: str, num_prompt_tokens: int, temperature: float = 0, @@ -32,7 +32,7 @@ def _generate( ) # [([output_token_ids, ], [output_text, ]), ] - output = model.generate([prompt], sampling_params=sampling_params) + output = llm.generate([prompt], sampling_params=sampling_params) output_token_ids = output[0][0][0][num_prompt_tokens:] # [0] first (and only) request output @@ -66,10 +66,10 @@ def test_one_token_bad_word(self, vllm_runner): assert self.target_token_id not in output_token_ids def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, @@ -156,10 +156,10 @@ def test_two_token_bad_word(self, vllm_runner): or (self.neighbour_token_id2 in output_token_ids)) def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index b339b4b2ddf3..5a0efd98acc1 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -49,7 +49,7 @@ def test_random_sample_with_seed( sampling_params_seed_2 = copy.deepcopy(sampling_params) sampling_params_seed_2.seed = 200 - llm = vllm_model.model + llm = vllm_model.llm for prompt in example_prompts: for params in ( diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f8aeba8301b1..ccafc8846127 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill( logprobs=5, prompt_logprobs=5, temperature=0.0) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) for idx, result in enumerate(vllm_results): diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 85415f6ad4b6..bd0320baef87 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -14,7 +14,7 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: return LLM(MODEL, enforce_eager=True, enable_prefix_caching=True, @@ -24,16 +24,16 @@ def model() -> LLM: 
block_size=16) -def test_concurrent_partial_prefill(model): - outputs = model.generate([PROMPT] * 3) +def test_concurrent_partial_prefill(llm): + outputs = llm.generate([PROMPT] * 3) assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 -def test_prefix_cache_stats_is_recorded(model): +def test_prefix_cache_stats_is_recorded(llm): # 17 tokens will make sure first 16 tokens are cached in a block input_tokens = {"prompt_token_ids": [101] * 17} - _ = model.generate([input_tokens]) - outputs = model.generate([input_tokens]) + _ = llm.generate([input_tokens]) + outputs = llm.generate([input_tokens]) assert outputs[0].num_cached_tokens == 16 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 059106c62a20..f37686317fd1 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init( example_prompts, structured_outputs=True, ) - model: LLM = vllm_model_skip_tokenizer_init.model + llm: LLM = vllm_model_skip_tokenizer_init.llm with pytest.raises(ValueError): - _ = model.generate(example_prompts, sampling_params_list) + _ = llm.generate(example_prompts, sampling_params_list) def test_parallel_sampling(vllm_model, example_prompts) -> None: @@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: example_prompt: test fixture providing prompts for testing. """ sampling_params_list, n_list = _get_test_sampling_params(example_prompts) - model: LLM = vllm_model.model - outputs = model.generate(example_prompts, sampling_params_list) + llm: LLM = vllm_model.llm + outputs = llm.generate(example_prompts, sampling_params_list) # Validate each request response for out, n in zip(outputs, n_list): @@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): speculative_config=speculative_config, disable_log_stats=False, ) as vllm_model: - model: LLM = vllm_model.model + llm: LLM = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = model.generate(example_prompts, sampling_params) + outputs = llm.generate(example_prompts, sampling_params) n_prompts = len(example_prompts) assert len(outputs) == n_prompts @@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): total_tokens += len(out.outputs[0].token_ids) assert total_tokens == max_tokens * n_prompts - metrics = model.get_metrics() + metrics = llm.get_metrics() def find_metric(name) -> list[Metric]: found = [] diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 69180e6e5db4..4f1f340a4ccb 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -112,7 +112,7 @@ def _run_and_validate( max_tokens: int, do_apc: bool, ) -> None: - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( test_prompts, sampling_params=vllm_sampling_params) for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( @@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs( """ with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): # Skip some test-cases to save time. 
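The hunks above and below follow the same mechanical rename: test code now reaches the wrapped entrypoint through the runner's .llm attribute instead of .model. A minimal illustrative sketch of the resulting pattern, assuming the runner class in tests/conftest.py is importable as VllmRunner and using a placeholder model name:

    from vllm import SamplingParams
    from tests.conftest import VllmRunner

    with VllmRunner("facebook/opt-125m") as vllm_model:
        llm = vllm_model.llm                 # vllm.LLM instance (previously vllm_model.model)
        engine = llm.llm_engine              # underlying engine, as the metrics and scheduler tests use it
        outputs = llm.generate(["Hello"], sampling_params=SamplingParams(max_tokens=8))
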
@@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts, prompt_logprobs=None, temperature=0.0, ) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none, ) @@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts, logprobs=0, prompt_logprobs=0, temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( + results_logprobs_zero = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_zero) for i in range(len(results_logprobs_zero)): diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index ac0f3eb58836..f53e1e1c485d 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -14,30 +14,30 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: # Disable prefix caching so that we can test prompt logprobs. # TODO remove this after https://github.com/vllm-project/vllm/pull/13949 # is merged return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False) -def test_n_gt_1(model): +def test_n_gt_1(llm): """ParallelSampling is supported.""" params = SamplingParams(n=3) - outputs = model.generate(PROMPT, params) + outputs = llm.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 -def test_best_of(model): +def test_best_of(llm): """Raise a ValueError since best_of is deprecated.""" params = SamplingParams(n=2, best_of=3) with pytest.raises(ValueError): - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_penalties(model): +def test_penalties(llm): """Check that we do not get errors if applied.""" params = SamplingParams( @@ -49,18 +49,18 @@ def test_penalties(model): top_p=0.5, top_k=3, ) - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_stop(model): +def test_stop(llm): """Check that we respect the stop words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() STOP_IDX = 5 params = SamplingParams(temperature=0, stop=split_text[STOP_IDX]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should not contain the stop word. @@ -69,40 +69,40 @@ def test_stop(model): params = SamplingParams(temperature=0, stop=split_text[STOP_IDX], include_stop_str_in_output=True) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should contain the stop word. 
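    # With include_stop_str_in_output=True the stop string itself is kept, so the
    # split output is exactly one word longer than in the exclusive case above.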
assert len(new_split_text) == STOP_IDX + 1 -def test_stop_token_ids(model): +def test_stop_token_ids(llm): """Check that we respect the stop token ids.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) stop_token_id_0 = output[0].outputs[0].token_ids[5] stop_token_id_1 = output[0].outputs[0].token_ids[6] stop_token_ids = [stop_token_id_1, stop_token_id_0] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 stop_token_ids = [stop_token_id_0, stop_token_id_1] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 -def test_detokenize_false(model): +def test_detokenize_false(llm): """Check that detokenize=False option works.""" - output = model.generate(PROMPT, SamplingParams(detokenize=False)) + output = llm.generate(PROMPT, SamplingParams(detokenize=False)) assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].text) == 0 - output = model.generate( + output = llm.generate( PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)) assert len(output[0].outputs[0].token_ids) > 0 @@ -118,28 +118,28 @@ def test_detokenize_false(model): assert all(lp.decoded_token is None for lp in logprobs.values()) -def test_bad_words(model): +def test_bad_words(llm): """Check that we respect bad words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() bad_words_1 = " ".join(split_text[:2]) params = SamplingParams(temperature=0, bad_words=[bad_words_1]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text bad_words_2 = new_text.split()[-1] params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text assert bad_words_2 not in new_text -def test_logits_processor(model): +def test_logits_processor(llm): """Check that we reject logits processor.""" # This sample logits processor gives infinite score to the i-th token, @@ -150,47 +150,45 @@ def pick_ith(token_ids, logits): return logits with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(logits_processors=[pick_ith])) + _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) -def test_allowed_token_ids(model): +def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" TOKEN_ID = 10 allowed_token_ids = [TOKEN_ID] - output = model.generate( - PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids)) + output = llm.generate(PROMPT, + SamplingParams(allowed_token_ids=allowed_token_ids)) assert output[0].outputs[0].token_ids[-1] == TOKEN_ID # Reject empty allowed_token_ids. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[])) # Reject negative token id. 
with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) # Reject out of vocabulary. with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(allowed_token_ids=[10000000])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(model): +def test_priority(llm): """Check that we reject requests with priority.""" # Reject all allowed token ids with pytest.raises(ValueError): - _ = model.generate(PROMPT, priority=[1]) + _ = llm.generate(PROMPT, priority=[1]) -def test_seed(model): +def test_seed(llm): """Check that seed impacts randomness.""" - out_1 = model.generate(PROMPT, SamplingParams(seed=42)) - out_2 = model.generate(PROMPT, SamplingParams(seed=42)) - out_3 = model.generate(PROMPT, SamplingParams(seed=43)) + out_1 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_2 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_3 = llm.generate(PROMPT, SamplingParams(seed=43)) assert out_1[0].outputs[0].text == out_2[0].outputs[0].text assert out_1[0].outputs[0].text != out_3[0].outputs[0].text diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 39515d710e81..b4d4348c7fd9 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -106,9 +106,9 @@ def test_v1_llm_by_default(monkeypatch): m.delenv("VLLM_USE_V1") # Should default to V1 for supported config. - model = LLM(MODEL, enforce_eager=True, enable_lora=True) - print(model.generate("Hello my name is")) - assert hasattr(model.llm_engine, "engine_core") + llm = LLM(MODEL, enforce_eager=True, enable_lora=True) + print(llm.generate("Hello my name is")) + assert hasattr(llm.llm_engine, "engine_core") m.delenv("VLLM_USE_V1") From 6b46c4b653d1d730a9b75d32b59b9d60f879b9d7 Mon Sep 17 00:00:00 2001 From: Zhiyu Date: Mon, 21 Jul 2025 07:02:58 -0700 Subject: [PATCH 44/57] Add Nvidia ModelOpt config adaptation (#19815) Signed-off-by: Zhiyu Cheng --- tests/quantization/test_modelopt.py | 91 ++++++++ vllm/config.py | 20 +- .../layers/quantization/modelopt.py | 208 +++++++++++++++--- 3 files changed, 287 insertions(+), 32 deletions(-) create mode 100644 tests/quantization/test_modelopt.py diff --git a/tests/quantization/test_modelopt.py b/tests/quantization/test_modelopt.py new file mode 100644 index 000000000000..fcbfa681d75c --- /dev/null +++ b/tests/quantization/test_modelopt.py @@ -0,0 +1,91 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Test ModelOpt quantization method setup and weight loading. + +Run `pytest tests/quantization/test_modelopt.py`. +""" + +import os + +import pytest +import torch + +from tests.quantization.utils import is_quant_method_supported +from vllm.platforms import current_platform + + +@pytest.fixture(scope="function", autouse=True) +def use_v0_only(monkeypatch): + """ + This module relies on V0 internals, so set VLLM_USE_V1=0. 
+ """ + if not current_platform.is_cpu(): + monkeypatch.setenv('VLLM_USE_V1', '0') + + +@pytest.mark.skipif(not is_quant_method_supported("modelopt"), + reason="ModelOpt FP8 is not supported on this GPU type.") +def test_modelopt_fp8_checkpoint_setup(vllm_runner): + """Test ModelOpt FP8 checkpoint loading and structure validation.""" + # TODO: provide a small publically available test checkpoint + model_path = ("/home/scratch.omniml_data_1/zhiyu/ckpts/test_ckpts/" + "TinyLlama-1.1B-Chat-v1.0-fp8-0710") + + # Skip test if checkpoint doesn't exist + if not os.path.exists(model_path): + pytest.skip(f"Test checkpoint not found at {model_path}. " + "This test requires a local ModelOpt FP8 checkpoint.") + + with vllm_runner(model_path, quantization="modelopt", + enforce_eager=True) as llm: + + def check_model(model): + layer = model.model.layers[0] + + qkv_proj = layer.self_attn.qkv_proj + o_proj = layer.self_attn.o_proj + gate_up_proj = layer.mlp.gate_up_proj + down_proj = layer.mlp.down_proj + + # Check that ModelOpt quantization method is properly applied + from vllm.model_executor.layers.quantization.modelopt import ( + ModelOptFp8LinearMethod) + assert isinstance(qkv_proj.quant_method, ModelOptFp8LinearMethod) + assert isinstance(o_proj.quant_method, ModelOptFp8LinearMethod) + assert isinstance(gate_up_proj.quant_method, + ModelOptFp8LinearMethod) + assert isinstance(down_proj.quant_method, ModelOptFp8LinearMethod) + + # Check weight dtype is FP8 + assert qkv_proj.weight.dtype == torch.float8_e4m3fn + assert o_proj.weight.dtype == torch.float8_e4m3fn + assert gate_up_proj.weight.dtype == torch.float8_e4m3fn + assert down_proj.weight.dtype == torch.float8_e4m3fn + + # Check scales are present and have correct dtype + assert hasattr(qkv_proj, 'weight_scale') + assert hasattr(qkv_proj, 'input_scale') + assert qkv_proj.weight_scale.dtype == torch.float32 + assert qkv_proj.input_scale.dtype == torch.float32 + + assert hasattr(o_proj, 'weight_scale') + assert hasattr(o_proj, 'input_scale') + assert o_proj.weight_scale.dtype == torch.float32 + assert o_proj.input_scale.dtype == torch.float32 + + assert hasattr(gate_up_proj, 'weight_scale') + assert hasattr(gate_up_proj, 'input_scale') + assert gate_up_proj.weight_scale.dtype == torch.float32 + assert gate_up_proj.input_scale.dtype == torch.float32 + + assert hasattr(down_proj, 'weight_scale') + assert hasattr(down_proj, 'input_scale') + assert down_proj.weight_scale.dtype == torch.float32 + assert down_proj.input_scale.dtype == torch.float32 + + llm.apply_model(check_model) + + # Run a simple generation test to ensure the model works + output = llm.generate_greedy(["Hello my name is"], max_tokens=20) + assert output + print(f"ModelOpt FP8 output: {output}") diff --git a/vllm/config.py b/vllm/config.py index 4cafbc926052..3e6aa2a93e6a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -346,11 +346,11 @@ class ModelConfig: """Maximum number of data items per modality per prompt. Only applicable for multimodal models.""" interleave_mm_strings: bool = False - """Enable fully interleaved support for multimodal prompts, while using + """Enable fully interleaved support for multimodal prompts, while using --chat-template-content-format=string. Defaults to False.""" media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set + """Additional args passed to process media inputs, keyed by modalities. 
+ For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ use_async_output_proc: bool = True """Whether to use async output processor.""" @@ -1000,9 +1000,13 @@ def _verify_quantization(self) -> None: quant_cfg = self._parse_quant_hf_config() if quant_cfg is not None: + # Use the community standard 'quant_method' quant_method = quant_cfg.get("quant_method", "").lower() + + # Normalize library names quant_method = quant_method.replace("compressed_tensors", "compressed-tensors") + quant_cfg["quant_method"] = quant_method # Quantization methods which are overrides (i.e. they have a @@ -1017,6 +1021,8 @@ def _verify_quantization(self) -> None: "awq_marlin", "ipex", "moe_wna16", + "modelopt", + "modelopt_fp4", ] quantization_methods = [ q for q in supported_quantization if q not in overrides @@ -3185,8 +3191,8 @@ class MultiModalConfig: """ media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. - For example, to set num_frames for video, set + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set `--media-io-kwargs '{"video": {"num_frames": 40} }'` """ mm_processor_kwargs: Optional[dict[str, object]] = None @@ -4086,7 +4092,7 @@ class CompilationConfig: - True: inductor compilation is used (custom_ops disabled by default). One graph for symbolic shape and one graph per size in compile_sizes are compiled using configurations in inductor_compile_config. - + This setting is ignored if level` can be used to directly specify the compilation level `n`: `-O3` is equivalent to `-O.level=3` (same as `-O='{"level":3}'`). - Currently, -O and -O= are supported as well but this will likely be + Currently, -O and -O= are supported as well but this will likely be removed in favor of clearer -O syntax in the future. NOTE: level 0 is the default level without any optimization. 
level 1 and 2 diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 20def70d1976..460334d77f0a 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -75,20 +75,64 @@ def get_min_capability(cls) -> int: def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: + """Detect if this ModelOpt config should be used based on + quantization config.""" + + if hf_quant_cfg is None: + return None + + # Use the community standard 'quant_method' + quant_method = hf_quant_cfg.get("quant_method", "").lower() + + # Only proceed if the method is explicitly "modelopt" + if quant_method != "modelopt": + return None + + # Look for ModelOpt-specific config structure + if "quantization" in hf_quant_cfg: + quant_config = hf_quant_cfg["quantization"] + if isinstance(quant_config, dict): + quant_algo = quant_config.get("quant_algo", "") + if "FP8" in quant_algo: + return "modelopt" + else: + # Check for compressed-tensors style config with specific quant_algo + quant_algo = hf_quant_cfg.get("quant_algo", "") + if isinstance(quant_algo, str) and "FP8" in quant_algo: + return "modelopt" + + return None + @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptFp8Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] - kv_cache_quant_method = cls.get_from_keys( - config, ["quantization"]).get("kv_cache_quant_algo") - exclude_modules = cls.get_from_keys( - config, ["quantization"]).get("exclude_modules") + # Handle both ModelOpt format and compressed-tensors style format + if "quantization" in config: + # ModelOpt format: {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError( + "Expected 'quantization' to be a dictionary in config") + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + kv_cache_quant_method = quant_config.get("kv_cache_quant_algo") + exclude_modules = quant_config.get("exclude_modules") + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo", "") + kv_cache_quant_method = config.get("kv_cache_quant_algo") + exclude_modules = config.get("exclude_modules") if quant_method not in QUANT_ALGOS: - raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" - " quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration.") + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. 
Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration.") is_checkpoint_fp8_serialized = ("FP8" in quant_method) return cls(is_checkpoint_fp8_serialized, kv_cache_quant_method, @@ -434,7 +478,7 @@ class ModelOptNvFp4Config(QuantizationConfig): def __init__( self, is_checkpoint_nvfp4_serialized: bool, - kv_cache_quant_algo: str, + kv_cache_quant_algo: Optional[str], exclude_modules: list[str], group_size: int = 16, ) -> None: @@ -465,24 +509,138 @@ def get_min_capability(cls) -> int: def get_config_filenames(cls) -> list[str]: return ["hf_quant_config.json"] + @classmethod + def override_quantization_method( + cls, hf_quant_cfg, user_quant) -> Optional[QuantizationMethods]: + """Detect if this ModelOpt FP4 config should be used based on + quantization config.""" + if hf_quant_cfg is None: + return None + + # Use the community standard 'quant_method' + quant_method = hf_quant_cfg.get("quant_method", "").lower() + + # Only proceed if the method is explicitly "modelopt" + if quant_method != "modelopt": + return None + + # Look for ModelOpt-specific config structure + if "quantization" in hf_quant_cfg: + quant_config = hf_quant_cfg["quantization"] + if isinstance(quant_config, dict): + quant_algo = quant_config.get("quant_algo", "") + if "NVFP4" in quant_algo: + return "modelopt_fp4" + else: + # Check for compressed-tensors style config with specific + # quant_algo field + quant_algo = hf_quant_cfg.get("quant_algo", "") + if isinstance(quant_algo, str) and "FP4" in quant_algo.upper(): + return "modelopt_fp4" + + return None + @classmethod def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config": - quant_config = cls.get_from_keys(config, ["quantization"]) - quant_method = quant_config["quant_algo"] + # Handle both traditional ModelOpt format and compressed-tensors + # style format + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError( + "Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got " + f"{type(group_size_raw)}") from None + + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got " + f"{type(exclude_modules)}") + else: + # Compressed-tensors style format: + # {"quant_algo": "...", "quant_method": "modelopt"} + quant_method = config.get("quant_algo", "") + + # Handle kv_cache_quant_algo with proper type validation + 
kv_cache_quant_algo_raw = config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " + f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got " + f"{type(group_size_raw)}") from None + + exclude_modules = config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got " + f"{type(exclude_modules)}") + if quant_method not in QUANT_ALGOS: - raise ValueError(f"ModelOpt currently only supports: {QUANT_ALGOS}" - " quantizations in vLLM. Please check the " - "`hf_quant_config.json` file for your model's " - "quant configuration.") + raise ValueError( + f"ModelOpt currently only supports: {QUANT_ALGOS} " + "quantizations in vLLM. Please check the " + "`hf_quant_config.json` file for your model's " + "quant configuration.") is_checkpoint_nvfp4_serialized = ("NVFP4" in quant_method) - if ("group_size" and "kv_cache_quant_algo" - and "exclude_modules") not in quant_config: - raise ValueError("NVFP4 quantization requires group size and " - "kv_cache_quant_algo specified in " - "hf_quant_config.json") - kv_cache_quant_algo = quant_config["kv_cache_quant_algo"] - group_size = quant_config["group_size"] - exclude_modules = quant_config["exclude_modules"] + + # For FP4, these fields are required + if is_checkpoint_nvfp4_serialized and "quantization" in config: + # Check if required fields are present in the quantization config + quant_config = config["quantization"] + required_fields = [ + "group_size", "kv_cache_quant_algo", "exclude_modules" + ] + missing_fields = [ + field for field in required_fields if field not in quant_config + ] + if missing_fields: + raise ValueError( + f"NVFP4 quantization requires the following fields in " + f"hf_quant_config.json: {missing_fields}") + return cls(is_checkpoint_nvfp4_serialized, kv_cache_quant_algo, exclude_modules, group_size) From 6dda13c86ba17ca6bc054293d135bad2d1ab7129 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 21 Jul 2025 08:37:49 -0700 Subject: [PATCH 45/57] [Misc] Add sliding window to flashinfer test (#21282) Signed-off-by: Woosuk Kwon --- tests/kernels/attention/test_flashinfer.py | 49 ++++++++++++++-------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index 3ad6e1d32911..8f9b4eceaa72 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -77,6 +77,7 @@ def ref_paged_attn( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("sliding_window", [None, 64]) @torch.inference_mode def test_flashinfer_decode_with_paged_kv( kv_lens: list[int], @@ -85,6 +86,7 @@ def test_flashinfer_decode_with_paged_kv( dtype: torch.dtype, block_size: int, soft_cap: Optional[float], + sliding_window: Optional[int], ) -> None: 
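    # sliding_window=None exercises the default full-attention path; a concrete
    # value (e.g. 64) is passed to FlashInfer's plan() below as
    # window_left = sliding_window - 1.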
torch.set_default_device("cuda") current_platform.seed_everything(0) @@ -136,17 +138,20 @@ def test_flashinfer_decode_with_paged_kv( use_tensor_cores=( (num_query_heads//num_kv_heads) > 4) ) - wrapper.plan(kv_indptr, - kv_indices, - kv_last_page_lens, - num_query_heads, - num_kv_heads, - head_size, - block_size, - "NONE", - q_data_type=dtype, - kv_data_type=dtype, - logits_soft_cap=soft_cap) + wrapper.plan( + kv_indptr, + kv_indices, + kv_last_page_lens, + num_query_heads, + num_kv_heads, + head_size, + block_size, + "NONE", + window_left=sliding_window - 1 if sliding_window is not None else -1, + q_data_type=dtype, + kv_data_type=dtype, + logits_soft_cap=soft_cap, + ) output = wrapper.run(query, key_value_cache) @@ -157,7 +162,8 @@ def test_flashinfer_decode_with_paged_kv( kv_lens=kv_lens, block_tables=block_tables, scale=scale, - soft_cap=soft_cap) + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" @@ -168,12 +174,17 @@ def test_flashinfer_decode_with_paged_kv( @pytest.mark.parametrize("block_size", BLOCK_SIZES) @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("soft_cap", [None, 30.0, 50.0]) +@pytest.mark.parametrize("sliding_window", [None, 64]) @torch.inference_mode -def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], - num_heads: tuple[int, int], - head_size: int, dtype: torch.dtype, - block_size: int, - soft_cap: Optional[float]) -> None: +def test_flashinfer_prefill_with_paged_kv( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], + sliding_window: Optional[int], +) -> None: torch.set_default_device("cuda") current_platform.seed_everything(0) num_seqs = len(seq_lens) @@ -242,6 +253,7 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], num_kv_heads, head_size, block_size, + window_left=sliding_window - 1 if sliding_window is not None else -1, q_data_type=dtype, kv_data_type=dtype, logits_soft_cap=soft_cap, @@ -259,7 +271,8 @@ def test_flashinfer_prefill_with_paged_kv(seq_lens: list[tuple[int, int]], kv_lens=kv_lens, block_tables=block_tables, scale=scale, - soft_cap=soft_cap) + soft_cap=soft_cap, + sliding_window=sliding_window) torch.testing.assert_close(output, ref_output, atol=5e-2, rtol=1e-2), \ f"{torch.max(torch.abs(output - ref_output))}" From a15a50fc17f9918d2cc457e5ef50310b38c28f5f Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Tue, 22 Jul 2025 00:07:08 +0800 Subject: [PATCH 46/57] [CPU] Enable shared-memory based pipeline parallel for CPU backend (#21289) Signed-off-by: jiang1.li --- .../scripts/hardware_ci/run-cpu-test.sh | 18 ++--- csrc/cpu/shm.cpp | 69 +++++++++++++------ docs/getting_started/installation/cpu.md | 14 ++++ .../device_communicators/cpu_communicator.py | 60 +++++++++++++++- vllm/distributed/parallel_state.py | 12 ++++ vllm/engine/arg_utils.py | 9 +-- vllm/envs.py | 7 +- vllm/platforms/cpu.py | 35 ++++------ 8 files changed, 165 insertions(+), 59 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test.sh b/.buildkite/scripts/hardware_ci/run-cpu-test.sh index e3d47a0e6c16..90cc9c844622 100644 --- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh @@ -6,6 +6,7 @@ set -ex # allow to bind to different cores CORE_RANGE=${CORE_RANGE:-48-95} +# used for TP/PP E2E test OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95} 
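# NUMA node that numactl pinning and the containers' cpuset-mems are bound to below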
NUMA_NODE=${NUMA_NODE:-1} @@ -24,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu . # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" -docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE" +docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2 function cpu_tests() { set -e @@ -78,17 +79,16 @@ function cpu_tests() { # tests/quantization/test_ipex_quant.py" # online serving - docker exec cpu-test-"$NUMA_NODE" bash -c " + docker exec cpu-test-"$NUMA_NODE" bash -c ' set -e - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \ + VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 & + timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1 + python3 benchmarks/benchmark_serving.py \ --backend vllm \ --dataset-name random \ - --model facebook/opt-125m \ + --model meta-llama/Llama-3.2-3B-Instruct \ --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" + --endpoint /v1/completions' # Run multi-lora tests docker exec cpu-test-"$NUMA_NODE" bash -c " diff --git a/csrc/cpu/shm.cpp b/csrc/cpu/shm.cpp index 9adb6f27ec41..7e64e1c52198 100644 --- a/csrc/cpu/shm.cpp +++ b/csrc/cpu/shm.cpp @@ -7,7 +7,7 @@ namespace { #define MAX_SHM_RANK_NUM 8 -#define PER_THREAD_SHM_BUFFER_BYTES (2 * 1024 * 1024) +#define PER_THREAD_SHM_BUFFER_BYTES (4 * 1024 * 1024) static_assert(PER_THREAD_SHM_BUFFER_BYTES % 2 == 0); #define PER_THREAD_SHM_BUFFER_OFFSET (PER_THREAD_SHM_BUFFER_BYTES >> 1) #define MIN_THREAD_PROCESS_SIZE (256) @@ -34,9 +34,10 @@ struct KernelVecType { }; struct ThreadSHMContext { - volatile char _curr_thread_stamp; - volatile char _ready_thread_stamp; - char _padding1[6]; + volatile char _curr_thread_stamp[2]; + volatile char _ready_thread_stamp[2]; + int local_stamp_buffer_idx; + int remote_stamp_buffer_idx; int thread_id; int thread_num; int rank; @@ -45,23 +46,28 @@ struct ThreadSHMContext { int swizzled_ranks[MAX_SHM_RANK_NUM]; void* 
thread_shm_ptrs[MAX_SHM_RANK_NUM]; ThreadSHMContext* shm_contexts[MAX_SHM_RANK_NUM]; - size_t _thread_buffer_mask; - char _padding2[56]; + size_t _thread_buffer_mask[2]; + char _padding2[40]; ThreadSHMContext(const int thread_id, const int thread_num, const int rank, const int group_size, void* thread_shm_ptr) - : _curr_thread_stamp(1), - _ready_thread_stamp(0), + : local_stamp_buffer_idx(0), + remote_stamp_buffer_idx(0), thread_id(thread_id), thread_num(thread_num), rank(rank), group_size(group_size), - _spinning_count(0), - _thread_buffer_mask(0) { + _spinning_count(0) { static_assert(sizeof(ThreadSHMContext) % 64 == 0); TORCH_CHECK(group_size <= MAX_SHM_RANK_NUM); TORCH_CHECK((size_t)this % 64 == 0); TORCH_CHECK((size_t)thread_shm_ptr % 64 == 0); + _curr_thread_stamp[0] = 1; + _curr_thread_stamp[1] = 1; + _ready_thread_stamp[0] = 0; + _ready_thread_stamp[1] = 0; + _thread_buffer_mask[0] = 0; + _thread_buffer_mask[1] = 0; for (int i = 0; i < MAX_SHM_RANK_NUM; ++i) { shm_contexts[i] = nullptr; thread_shm_ptrs[i] = nullptr; @@ -70,6 +76,11 @@ struct ThreadSHMContext { set_context(rank, this, thread_shm_ptr); } + void set_stamp_buffer_idx(int local, int remote) { + local_stamp_buffer_idx = local; + remote_stamp_buffer_idx = remote; + } + void set_context(int rank, ThreadSHMContext* ptr, void* thread_shm_ptr) { TORCH_CHECK(rank < MAX_SHM_RANK_NUM); TORCH_CHECK(ptr); @@ -84,23 +95,27 @@ struct ThreadSHMContext { T* get_thread_shm_ptr(int rank) { return reinterpret_cast( reinterpret_cast(thread_shm_ptrs[rank]) + - (PER_THREAD_SHM_BUFFER_OFFSET & _thread_buffer_mask)); + (PER_THREAD_SHM_BUFFER_OFFSET & + _thread_buffer_mask[local_stamp_buffer_idx])); } - void next_buffer() { _thread_buffer_mask ^= 0xFFFFFFFFFFFFFFFF; } + void next_buffer() { + _thread_buffer_mask[local_stamp_buffer_idx] ^= 0xFFFFFFFFFFFFFFFF; + } - char get_curr_stamp() const { return _curr_thread_stamp; } + char get_curr_stamp(int idx) const { return _curr_thread_stamp[idx]; } - char get_ready_stamp() const { return _ready_thread_stamp; } + char get_ready_stamp(int idx) const { return _ready_thread_stamp[idx]; } void next_stamp() { _mm_mfence(); - _curr_thread_stamp += 1; + _curr_thread_stamp[local_stamp_buffer_idx] += 1; } void commit_ready_stamp() { _mm_mfence(); - _ready_thread_stamp = _curr_thread_stamp; + _ready_thread_stamp[local_stamp_buffer_idx] = + _curr_thread_stamp[local_stamp_buffer_idx]; } int get_swizzled_rank(int idx) { return swizzled_ranks[idx]; } @@ -117,10 +132,11 @@ struct ThreadSHMContext { void wait_for_one(int rank, Cond&& cond) { ThreadSHMContext* rank_ctx = shm_contexts[rank]; for (;;) { - char local_curr_stamp = get_curr_stamp(); - char local_ready_stamp = get_ready_stamp(); - char rank_curr_stamp = rank_ctx->get_curr_stamp(); - char rank_ready_stamp = rank_ctx->get_ready_stamp(); + char local_curr_stamp = get_curr_stamp(local_stamp_buffer_idx); + char local_ready_stamp = get_ready_stamp(local_stamp_buffer_idx); + char rank_curr_stamp = rank_ctx->get_curr_stamp(remote_stamp_buffer_idx); + char rank_ready_stamp = + rank_ctx->get_ready_stamp(remote_stamp_buffer_idx); if (cond(local_curr_stamp, local_ready_stamp, rank_curr_stamp, rank_ready_stamp)) { break; @@ -361,6 +377,15 @@ void shm_cc_loop(ThreadSHMContext* ctx, int64_t elem_num, F&& inner_func) { } } } + +void reset_threads_stamp_buffer_idx(ThreadSHMContext* ctx, int local, + int remote) { + int thread_num = ctx->thread_num; + for (int i = 0; i < thread_num; ++i) { + ThreadSHMContext* thread_ctx = ctx + i; + thread_ctx->set_stamp_buffer_idx(local, 
remote);
+  }
+}
 };  // namespace shm_cc_ops
 
 namespace shm_cc_ops {
@@ -632,6 +657,7 @@ void shm_send_tensor_list_impl(ThreadSHMContext* ctx, int64_t dst,
   TensorListMeta* metadata = new (metadata_tensor.data_ptr()) TensorListMeta();
   metadata->bind_tensor_list(tensor_list_with_metadata);
 
+  shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 0, 1);
   shm_cc_ops::shm_cc_loop(
       ctx, metadata->total_bytes,
       [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
@@ -659,6 +685,7 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
   torch::Tensor metadata_tensor =
       torch::empty({sizeof(TensorListMeta)}, options);
 
+  shm_cc_ops::reset_threads_stamp_buffer_idx(ctx, 1, 0);
   ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
   shm_cc_ops::memcpy(metadata_tensor.data_ptr(),
                      ctx->get_thread_shm_ptr(src),
@@ -677,7 +704,7 @@ std::vector shm_recv_tensor_list_impl(ThreadSHMContext* ctx,
       ctx, metadata.total_bytes,
       [&](ThreadSHMContext* thread_ctx, int64_t data_offset,
           int64_t data_elem_num, bool fast_mode) {
-        ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
+        thread_ctx->wait_for_one(src, ThreadSHMContext::check_stamp_ready);
         int64_t curr_shm_offset = 0;
         while (curr_shm_offset < data_elem_num) {
           MemPiece frag = metadata.get_data(data_offset + curr_shm_offset);
diff --git a/docs/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
index d77e7383650c..5721195172dc 100644
--- a/docs/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -166,6 +166,20 @@ Note, it is recommended to manually reserve 1 CPU for vLLM front-end process whe
 
   - This value is 4GB by default. Larger space can support more concurrent requests, longer context length. However, users should take care of memory capacity of each NUMA node. The memory usage of each TP rank is the sum of `weight shard size` and `VLLM_CPU_KVCACHE_SPACE`, if it exceeds the capacity of a single NUMA node, the TP worker will be killed with `exitcode 9` due to out-of-memory.
 
+### How to do performance tuning for vLLM CPU?
+
+ - First of all, please make sure the thread-binding and KV cache space are properly set and take effect. You can check the thread-binding by running a vLLM benchmark and observing CPU core usage via `htop`.
+
+ - Inference batch size is an important parameter for performance. A larger batch usually provides higher throughput, while a smaller batch provides lower latency. Tuning the max batch size, starting from the default value, to balance throughput and latency is an effective way to improve vLLM CPU performance on specific platforms. There are two related parameters in vLLM:
+   - `--max-num-batched-tokens` defines the limit on the number of tokens in a single batch and mainly affects first-token performance. The default value is set as:
+     - Offline Inference: `4096 * world_size`
+     - Online Serving: `2048 * world_size`
+   - `--max-num-seqs` defines the limit on the number of sequences in a single batch and mainly affects output-token performance. The default value is set as:
+     - Offline Inference: `256 * world_size`
+     - Online Serving: `128 * world_size`
+
+ - vLLM CPU supports tensor parallelism (TP) and pipeline parallelism (PP) to leverage multiple CPU sockets and memory nodes. For more details on tuning TP and PP, please refer to [Optimization and Tuning](../../configuration/optimization.md). For vLLM CPU, it is recommended to use TP and PP together if there are enough CPU sockets and memory nodes.
+
 ### Which quantization configs does vLLM CPU support?
- vLLM CPU supports quantizations: diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py index 94effa0b2ca8..bda567f8489c 100644 --- a/vllm/distributed/device_communicators/cpu_communicator.py +++ b/vllm/distributed/device_communicators/cpu_communicator.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os -from typing import Optional +from typing import Any, Optional, Union import torch from torch.distributed import ProcessGroup +from vllm.distributed.utils import pickle from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum @@ -26,7 +27,8 @@ def __init__(self, if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) and hasattr( torch.ops._C, - "init_shm_manager") and unique_name.startswith("tp"): + "init_shm_manager") and (unique_name.startswith("tp") + or unique_name.startswith("pp")): self.dist_module = _CPUSHMDistributed(self) def all_reduce(self, input_): @@ -94,6 +96,19 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: input_size[dim + 1:]) return output_tensor + def send_tensor_dict( + self, + tensor_dict: dict[str, Union[torch.Tensor, Any]], + dst: int, + ) -> None: + return self.dist_module.send_tensor_dict(tensor_dict, dst) + + def recv_tensor_dict( + self, + src: int, + ) -> dict[str, Union[torch.Tensor, Any]]: + return self.dist_module.recv_tensor_dict(src) + class _CPUSHMDistributed: @@ -143,3 +158,44 @@ def all_gather_into_tensor(self, input: torch.Tensor, group: Optional[ProcessGroup] = None) -> None: torch.ops._C.shm_all_gather(self.handle, input, output) + + def send_tensor_dict( + self, + tensor_dict: dict[str, Union[torch.Tensor, Any]], + dst: int, + ) -> None: + key_list = list(tensor_dict.keys()) + value_list = list(tensor_dict.values()) + size_list = [] + for v in value_list: + if not isinstance(v, torch.Tensor): + raise RuntimeError( + "CpuCommunicator only supports sending tensors.") + size_list.append(v.size()) + key_size_tensor = torch.frombuffer(pickle.dumps([key_list, size_list]), + dtype=torch.uint8) + value_list.append(key_size_tensor) + + torch.ops._C.shm_send_tensor_list(self.handle, value_list, dst) + + return None + + def recv_tensor_dict( + self, + src: int, + ) -> dict[str, Union[torch.Tensor, Any]]: + tensor_list = torch.ops._C.shm_recv_tensor_list(self.handle, src) + + value_list: list[torch.Tensor] = tensor_list[:-1] + key_size_tensor = tensor_list[-1] + + key_size = pickle.loads(key_size_tensor.numpy().tobytes()) + key_list = key_size[0] + size_list = key_size[1] + assert len(key_list) == len(size_list) + assert len(key_list) == len(value_list) + + tensor_dict: dict[str, torch.Tensor] = {} + for key, size, t in zip(key_list, size_list, value_list): + tensor_dict[key] = t.view(size) + return tensor_dict diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 1bb0ca79cc1d..1f7a14920c41 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -272,6 +272,9 @@ def __init__( self.use_custom_op_call = (current_platform.is_cuda_alike() or current_platform.is_tpu()) + self.use_cpu_custom_send_recv = (current_platform.is_cpu() and hasattr( + torch.ops._C, "init_shm_manager")) + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -663,6 +666,11 @@ def send_tensor_dict( dst = (self.rank_in_group + 1) % self.world_size assert dst < self.world_size, 
f"Invalid dst rank ({dst})" + if self.use_cpu_custom_send_recv: + self.device_communicator.send_tensor_dict( # type: ignore + tensor_dict, dst) + return None + metadata_list: list[tuple[Any, Any]] = [] assert isinstance( tensor_dict, @@ -718,6 +726,10 @@ def recv_tensor_dict( src = (self.rank_in_group - 1) % self.world_size assert src < self.world_size, f"Invalid src rank ({src})" + if self.use_cpu_custom_send_recv: + return self.device_communicator.recv_tensor_dict( # type: ignore + src) + recv_metadata_list = self.recv_object(src=src) tensor_dict: dict[str, Any] = {} for key, value in recv_metadata_list: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 019ff033eda2..28b1c1c363a7 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1639,13 +1639,14 @@ def _set_default_args_v1(self, usage_context: UsageContext, # cpu specific default values. if current_platform.is_cpu(): + world_size = self.pipeline_parallel_size * self.tensor_parallel_size default_max_num_batched_tokens = { - UsageContext.LLM_CLASS: 4096, - UsageContext.OPENAI_API_SERVER: 2048, + UsageContext.LLM_CLASS: 4096 * world_size, + UsageContext.OPENAI_API_SERVER: 2048 * world_size, } default_max_num_seqs = { - UsageContext.LLM_CLASS: 128, - UsageContext.OPENAI_API_SERVER: 32, + UsageContext.LLM_CLASS: 256 * world_size, + UsageContext.OPENAI_API_SERVER: 128 * world_size, } use_context_value = usage_context.value if usage_context else None diff --git a/vllm/envs.py b/vllm/envs.py index c5f97de807a7..16f635b3ac41 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -42,7 +42,7 @@ VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False VLLM_PP_LAYER_PARTITION: Optional[str] = None - VLLM_CPU_KVCACHE_SPACE: int = 0 + VLLM_CPU_KVCACHE_SPACE: Optional[int] = 0 VLLM_CPU_OMP_THREADS_BIND: str = "" VLLM_CPU_NUM_OF_RESERVED_CPU: Optional[int] = None VLLM_CPU_MOE_PREPACK: bool = True @@ -430,9 +430,10 @@ def get_vllm_port() -> Optional[int]: lambda: os.getenv("VLLM_PP_LAYER_PARTITION", None), # (CPU backend only) CPU key-value cache space. - # default is 4 GiB + # default is None and will be set as 4 GB "VLLM_CPU_KVCACHE_SPACE": - lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), + lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")) + if "VLLM_CPU_KVCACHE_SPACE" in os.environ else None, # (CPU backend only) CPU core ids bound by OpenMP threads, e.g., "0-31", # "0,1,2", "0-31,33". CPU cores of different ranks are separated by '|'. 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 70c339c9bc98..31a67183ff12 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -104,8 +104,19 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int, @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: - import psutil - return psutil.virtual_memory().total + import vllm.envs as envs + from vllm.utils import GiB_bytes + + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE + if kv_cache_space is None: + kv_cache_space = 4 * GiB_bytes # type: ignore + logger.warning_once( + "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) " + "for CPU backend is not set, using 4 by default.") + else: + kv_cache_space *= GiB_bytes + + return kv_cache_space @classmethod def set_device(cls, device: torch.device) -> None: @@ -124,8 +135,6 @@ def inference_mode(cls): @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: - import vllm.envs as envs - from vllm.utils import GiB_bytes model_config = vllm_config.model_config if model_config is not None: @@ -162,20 +171,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: " support fp16 for now, cast to bf16.") model_config.dtype = torch.bfloat16 - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE - - if kv_cache_space >= 0: - if kv_cache_space == 0: - cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning( - "Environment variable VLLM_CPU_KVCACHE_SPACE (GiB) " - "for CPU backend is not set, using 4 by default.") - else: - cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa - else: - raise RuntimeError( - "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") + cache_config.cpu_kvcache_space_bytes = \ + CpuPlatform.get_device_total_memory() parallel_config = vllm_config.parallel_config if (parallel_config.world_size > 1 @@ -216,8 +213,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: False, "nan_asserts": False, - "memory_planning": - True, "epilogue_fusion": True, }) From a0e827e07c3c6a22283b4de2e0072c09f62162fc Mon Sep 17 00:00:00 2001 From: simpx Date: Tue, 22 Jul 2025 00:07:36 +0800 Subject: [PATCH 47/57] [BugFix] make utils.current_stream thread-safety (#21252) (#21253) Signed-off-by: simpx --- tests/test_utils.py | 44 +++++++++++++++++++++++++++++++++++++++--- vllm/utils/__init__.py | 15 +++++++------- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 28acacd25190..53a34642e5ba 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -23,9 +23,9 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, common_broadcastable_dtype, - deprecate_kwargs, get_open_port, get_tcp_uri, - is_lossless_cast, join_host_port, make_zmq_path, - make_zmq_socket, memory_profiling, + current_stream, deprecate_kwargs, get_open_port, + get_tcp_uri, is_lossless_cast, join_host_port, + make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_host_port, split_zmq_path, supports_kw, swap_dict_values) @@ -957,3 +957,41 @@ def test_convert_ids_list_to_tokens(): ] tokens = convert_ids_list_to_tokens(tokenizer, token_ids) assert tokens == ['Hello', ',', ' world', '!'] + + +def test_current_stream_multithread(): + import threading + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + main_default_stream 
= torch.cuda.current_stream() + child_stream = torch.cuda.Stream() + + thread_stream_ready = threading.Event() + thread_can_exit = threading.Event() + + def child_thread_func(): + with torch.cuda.stream(child_stream): + thread_stream_ready.set() + thread_can_exit.wait(timeout=10) + + child_thread = threading.Thread(target=child_thread_func) + child_thread.start() + + try: + assert thread_stream_ready.wait( + timeout=5), "Child thread failed to enter stream context in time" + + main_current_stream = current_stream() + + assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread" + assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream" + + # Notify child thread it can exit + thread_can_exit.set() + + finally: + # Ensure child thread exits properly + child_thread.join(timeout=5) + if child_thread.is_alive(): + pytest.fail("Child thread failed to exit properly") diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index bbcc2a523dcb..e4f495e22e29 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -1383,12 +1383,11 @@ def find_nccl_library() -> str: prev_set_stream = torch.cuda.set_stream -_current_stream = None +_current_stream_tls = threading.local() def _patched_set_stream(stream: torch.cuda.Stream) -> None: - global _current_stream - _current_stream = stream + _current_stream_tls.value = stream prev_set_stream(stream) @@ -1407,16 +1406,16 @@ def current_stream() -> torch.cuda.Stream: from C/C++ code. """ from vllm.platforms import current_platform - global _current_stream - if _current_stream is None: + if not hasattr(_current_stream_tls, + "value") or _current_stream_tls.value is None: # when this function is called before any stream is set, # we return the default stream. # On ROCm using the default 0 stream in combination with RCCL # is hurting performance. Therefore creating a dedicated stream # per process - _current_stream = torch.cuda.Stream() if current_platform.is_rocm( - ) else torch.cuda.current_stream() - return _current_stream + _current_stream_tls.value = torch.cuda.Stream( + ) if current_platform.is_rocm() else torch.cuda.current_stream() + return _current_stream_tls.value def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None: From 6ece16c4fe8c6f8f49b66c95cd3dd06b1c75de35 Mon Sep 17 00:00:00 2001 From: Ming Yang Date: Mon, 21 Jul 2025 09:08:09 -0700 Subject: [PATCH 48/57] [Misc] Add dummy maverick test (#21199) Signed-off-by: Ming Yang Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- .../multimodal/generation/test_maverick.py | 649 ++++++++++++++++++ 1 file changed, 649 insertions(+) create mode 100644 tests/models/multimodal/generation/test_maverick.py diff --git a/tests/models/multimodal/generation/test_maverick.py b/tests/models/multimodal/generation/test_maverick.py new file mode 100644 index 000000000000..083dc66148e2 --- /dev/null +++ b/tests/models/multimodal/generation/test_maverick.py @@ -0,0 +1,649 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Create a reduced-layer version of the Maverick model for testing purposes. + +This script creates a new model with fewer layers by: +1. Loading the original Maverick model configuration +2. Creating a reduced configuration +3. Generating compatible safetensors files with appropriate weights +4. 
Creating the necessary index files for vLLM compatibility +""" + +import json +import shutil +from pathlib import Path +from typing import Any + +import pytest +import torch +from safetensors.torch import save_file +from transformers import (AutoConfig, AutoProcessor, AutoTokenizer, + GenerationConfig) + +from vllm import LLM, SamplingParams + +# Sample prompts for testing +PROMPTS: list[str] = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + + +def run_maverick_serving(model: str): + """Test Llama-4-Maverick model with vLLM LLM class using CLI equivalent + options with reduced layers. + """ + + try: + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + + llm = LLM( + model=model, + max_model_len=2048, + enforce_eager=True, + tensor_parallel_size=8, + enable_expert_parallel=True, + trust_remote_code=True, + gpu_memory_utilization=0.4, + kv_cache_dtype="fp8", + ) + + outputs = llm.generate(PROMPTS, sampling_params) + + # Print the outputs + print("\nGenerated Outputs:\n" + "-" * 60) + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}") + print(f"Output: {generated_text!r}") + print("-" * 60) + + except Exception as e: + print(f"Error initializing or running model: {e}") + raise + + +def create_reduced_maverick_model( + original_model_name: + str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + output_dir: str = "/tmp/reduced_maverick", + text_layers: int = 4, + num_experts: int = 4, + vision_layers: int = 2, + force_recreate: bool = False, +) -> str: + """ + Create a reduced-layer version of the Maverick model. + + Args: + original_model_name: Name of the original Maverick model + output_dir: Directory to save the reduced model + text_layers: Number of text transformer layers + num_experts: Number of experts per layer + vision_layers: Number of vision transformer layers + force_recreate: Whether to recreate if output_dir already exists + + Returns: + Path to the created reduced model directory + """ + + print( + f"Creating reduced Maverick model with {text_layers} text layers and " + f"{vision_layers} vision layers...") + + # Create output directory + output_path = Path(output_dir) + if output_path.exists(): + if force_recreate: + shutil.rmtree(output_path) + else: + print(f"Output directory {output_dir} already exists. 
" + "Use --force-recreate to overwrite.") + return str(output_path) + + output_path.mkdir(parents=True, exist_ok=True) + + try: + print("Loading original model configuration...") + original_config = AutoConfig.from_pretrained(original_model_name, + trust_remote_code=True) + + print("Creating reduced configuration...") + reduced_config = create_reduced_config(original_config, text_layers, + num_experts, vision_layers) + + config_path = output_path / "config.json" + with open(config_path, "w") as f: + json.dump(reduced_config, f, indent=2) + print(f"Saved reduced config to {config_path}") + + print("Copying tokenizer files...") + copy_tokenizer_files(original_model_name, output_path) + + print("Creating reduced safetensors files...") + create_reduced_safetensors(original_config, reduced_config, + output_path) + + print("Creating preprocessor config...") + create_preprocessor_config(original_config, output_path) + + try: + gen_config = GenerationConfig.from_pretrained(original_model_name) + gen_config.save_pretrained(output_path) + print("Copied generation config") + except Exception as e: + print(f"Could not copy generation config: {e}") + + print(f"Successfully created reduced Maverick model at {output_path}") + return str(output_path) + + except Exception as e: + print(f"Error creating reduced model: {e}") + # Clean up on failure + if output_path.exists(): + shutil.rmtree(output_path) + raise + + +def create_reduced_config(original_config: Any, text_layers: int, + num_experts: int, + vision_layers: int) -> dict[str, Any]: + """Create a reduced configuration based on the original.""" + + # Convert config to dictionary + config_dict = original_config.to_dict() + + # Reduce text layers + if "text_config" in config_dict: + original_text_layers = config_dict["text_config"]["num_hidden_layers"] + config_dict["text_config"]["num_hidden_layers"] = text_layers + print( + f"Reduced text layers from {original_text_layers} to {text_layers}" + ) + + original_num_experts = config_dict["text_config"]["num_local_experts"] + config_dict["text_config"]["num_local_experts"] = num_experts + print( + f"Reduced num experts from {original_num_experts} to {num_experts}" + ) + + hidden_dim_divisor = 4 + + original_hidden_size = config_dict["text_config"]["hidden_size"] + new_hidden_size = original_hidden_size // hidden_dim_divisor + config_dict["text_config"]["hidden_size"] = new_hidden_size + print(f"Reduced hidden size from {original_hidden_size} to " + f"{new_hidden_size}") + + original_head_dim = config_dict["text_config"]["head_dim"] + new_head_dim = original_head_dim // hidden_dim_divisor + config_dict["text_config"]["head_dim"] = new_head_dim + print(f"Reduced head dim from {original_head_dim} to {new_head_dim}") + + # Reduce vision layers + if "vision_config" in config_dict: + original_vision_layers = config_dict["vision_config"][ + "num_hidden_layers"] + config_dict["vision_config"]["num_hidden_layers"] = vision_layers + print(f"Reduced vision layers from {original_vision_layers} " + f"to {vision_layers}") + + # Update model name to indicate it's a reduced version + config_dict["_name_or_path"] = ( + f"reduced_maverick_{text_layers}t_{vision_layers}v") + + return config_dict + + +def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None: + """Copy tokenizer files from the original model.""" + + try: + tokenizer = AutoTokenizer.from_pretrained(original_model_name, + trust_remote_code=True) + tokenizer.save_pretrained(output_path) + print("Tokenizer files copied successfully") + except 
Exception as e: + print(f"Warning: Could not copy tokenizer files: {e}") + + +def create_preprocessor_config(original_config: Any, + output_path: Path) -> None: + """Create preprocessor_config.json for multimodal model.""" + + # Try to load the original preprocessor config + try: + processor = AutoProcessor.from_pretrained( + original_config._name_or_path + or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + trust_remote_code=True, + ) + processor.save_pretrained(output_path) + print("Copied original preprocessor config") + return + except Exception as e: + print(f"Could not copy original preprocessor config: {e}") + raise + + +def create_reduced_safetensors(original_config: Any, reduced_config: dict[str, + Any], + output_path: Path) -> None: + """Create safetensors files with weights for the reduced model.""" + + print("Generating synthetic weights for reduced model...") + + text_config = reduced_config["text_config"] + vision_config = reduced_config["vision_config"] + + weights = {} + + print("Creating text model weights...") + weights.update(create_text_model_weights(text_config)) + + print("Creating vision model weights...") + weights.update(create_vision_model_weights(vision_config)) + + print("Creating shared model weights...") + weights.update(create_shared_weights(text_config, vision_config)) + + print("Saving weights to safetensors files...") + save_weights_to_safetensors(weights, output_path) + + +def create_text_model_weights( + text_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create synthetic weights for the text model with MoE structure.""" + + weights = {} + + vocab_size = text_config["vocab_size"] + hidden_size = text_config["hidden_size"] + intermediate_size = text_config["intermediate_size"] + intermediate_size_mlp = text_config["intermediate_size_mlp"] + num_layers = text_config["num_hidden_layers"] + num_attention_heads = text_config["num_attention_heads"] + num_key_value_heads = text_config.get("num_key_value_heads", + num_attention_heads) + + # MoE specific parameters + num_experts = text_config.get("num_local_experts") + assert (num_experts + is not None), "num_local_experts must be specified for MoE" + + head_dim = hidden_size // num_attention_heads + + # Embedding layers + weights["language_model.model.embed_tokens.weight"] = torch.randn( + vocab_size, hidden_size, dtype=torch.float16) + + # Transformer layers + for layer_idx in range(num_layers): + layer_prefix = f"language_model.model.layers.{layer_idx}" + print(f"Creating weights for layer {layer_prefix}...") + + # Self-attention weights (separate q, k, v projections) + weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn( + hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn( + hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn( + num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn( + hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16) + print("Self-attention weights created.") + + # Feed-forward weights - MoE pattern based on interleave_moe_layer_step + # For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers + # 0,2,4,... 
are dense + interleave_step = text_config.get("interleave_moe_layer_step", 1) + is_moe_layer = (interleave_step > 0 + and (layer_idx + 1) % interleave_step == 0) + + if is_moe_layer: + # MoE layer structure + # 1. Router weights + weights[ + f"{layer_prefix}.feed_forward.router.weight"] = torch.randn( + num_experts, hidden_size, dtype=torch.float16) + + # 2. Individual expert weights (not fused) + for expert_idx in range(num_experts): + expert_prefix = ( + f"{layer_prefix}.feed_forward.experts.{expert_idx}") + + weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{expert_prefix}.up_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{expert_prefix}.down_proj.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + + # Expert weight scales (FP8 quantization) + weights[ + f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones( + intermediate_size, 1, dtype=torch.bfloat16) + weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones( + intermediate_size, 1, dtype=torch.bfloat16) + weights[ + f"{expert_prefix}.down_proj.weight_scale"] = torch.ones( + hidden_size, 1, dtype=torch.bfloat16) + + # 3. Shared expert weights + shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert" + weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + print(f"MoE feed-forward weights created for layer {layer_idx}.") + else: + # Dense layer structure + weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = ( + torch.randn(intermediate_size_mlp, + hidden_size, + dtype=torch.bfloat16)) + weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = ( + torch.randn(intermediate_size_mlp, + hidden_size, + dtype=torch.bfloat16)) + weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = ( + torch.randn(hidden_size, + intermediate_size_mlp, + dtype=torch.bfloat16)) + print(f"Dense feed-forward weights created for layer {layer_idx}.") + + # Layer norms + weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[ + f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + print("Layer norms created.") + + # Final layer norm and output projection + weights["language_model.model.norm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights["language_model.lm_head.weight"] = torch.randn( + vocab_size, hidden_size, dtype=torch.bfloat16) + + return weights + + +def create_vision_model_weights( + vision_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create synthetic weights for the vision model.""" + + weights = {} + + hidden_size = vision_config["hidden_size"] + intermediate_size = vision_config["intermediate_size"] + num_layers = vision_config["num_hidden_layers"] + + # Vision transformer layers + for layer_idx in range(num_layers): + layer_prefix = f"vision_model.model.layers.{layer_idx}" + + weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + 
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn( + hidden_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn( + intermediate_size, hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros( + intermediate_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn( + hidden_size, intermediate_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + weights[ + f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones( + hidden_size, dtype=torch.bfloat16) + weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros( + hidden_size, dtype=torch.bfloat16) + + return weights + + +def create_shared_weights( + text_config: dict[str, Any], + vision_config: dict[str, Any]) -> dict[str, torch.Tensor]: + """Create weights for shared components (vision-language connector)""" + + weights = {} + + text_hidden_size = text_config["hidden_size"] + projector_input_dim = vision_config["projector_input_dim"] + + # Vision-language connector (projects vision features to text space) + weights["multi_modal_projector.linear_1.weight"] = torch.randn( + text_hidden_size, projector_input_dim, dtype=torch.bfloat16) + + return weights + + +def save_weights_to_safetensors(weights: dict[str, torch.Tensor], + output_path: Path) -> None: + """Save weights to safetensors files and create index.""" + + # Determine how to shard the weights + max_shard_size = 5 * 1024 * 1024 * 1024 # 5GB per shard + + # Calculate sizes and create shards + shards = [] + current_shard: dict[str, torch.Tensor] = {} + current_size = 0 + + for name, tensor in weights.items(): + tensor_size = tensor.numel() * tensor.element_size() + + if current_size + tensor_size > max_shard_size and current_shard: + shards.append(current_shard) + current_shard = {} + current_size = 0 + + current_shard[name] = tensor + current_size += tensor_size + + if current_shard: + shards.append(current_shard) + + # Save shards and create index + weight_map = {} + + if len(shards) == 1: + # Single file + filename = "model.safetensors" + save_file(shards[0], output_path / filename) + weight_map = {name: filename for name in shards[0]} + print(f"Saved weights to single file: {filename}") + else: + # Multiple shards + for i, shard in enumerate(shards): + filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors" + save_file(shard, output_path / filename) + for name in shard: + weight_map[name] = filename + print(f"Saved shard {i+1}/{len(shards)}: {filename}") + + # Create index file + index_data = { + "metadata": { + "total_size": + sum(tensor.numel() * tensor.element_size() + for tensor in weights.values()) + }, + "weight_map": 
weight_map, + } + + index_path = output_path / "model.safetensors.index.json" + with open(index_path, "w") as f: + json.dump(index_data, f, indent=2) + + print(f"Created index file: {index_path}") + print(f"Total model size: " + f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB") + + +def run_reduced_model(model_path: str, + should_profile: bool = False, + **kwargs) -> None: + """Test the created reduced model with vLLM.""" + + print(f"\nTesting reduced model at {model_path}...") + + llm = LLM( + model=model_path, + trust_remote_code=True, + max_model_len=512, # Small context for testing + gpu_memory_utilization=0.3, # Conservative memory usage + **kwargs, + ) + + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=50) + + if should_profile: + llm.start_profile() + outputs = llm.generate(PROMPTS, sampling_params) + if should_profile: + llm.stop_profile() + + print("Test generation successful!") + for output in outputs: + print(f"Prompt: {output.prompt}") + print(f"Output: " + f"{output.outputs[0].text}") + print("-" * 40) + + +@pytest.mark.parametrize( + "original_model_name,text_layers,num_experts,vision_layers,", + [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)]) +@pytest.mark.parametrize("enforce_eager", [True, False]) +@pytest.mark.parametrize("tp,ep", [(2, True)]) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_dummy_maverick( + original_model_name: str, + text_layers: int, + num_experts: int, + vision_layers: int, + enforce_eager: bool, + tp: int, + ep: bool, + output_dir: str = "/tmp/reduced_maverick", + force_recreate: bool = True, + profile: bool = False, +) -> None: + model_path = create_reduced_maverick_model( + original_model_name=original_model_name, + output_dir=output_dir, + text_layers=text_layers, + num_experts=num_experts, + vision_layers=vision_layers, + force_recreate=force_recreate, + ) + + print(f"\nReduced model created successfully at: {model_path}") + + run_reduced_model(model_path=model_path, + should_profile=profile, + enforce_eager=enforce_eager, + tensor_parallel_size=tp, + enable_expert_parallel=ep) + + +def main(): + """Main function to create and test the reduced model.""" + + import argparse + + parser = argparse.ArgumentParser( + description="Create a reduced-layer Maverick model") + parser.add_argument( + "--output-dir", + default="/tmp/reduced_maverick", + help="Output directory for the reduced model", + ) + parser.add_argument( + "--text-layers", + type=int, + default=4, + help="Number of text transformer layers", + ) + parser.add_argument("--num-experts", + type=int, + default=4, + help="Number of experts") + parser.add_argument( + "--vision-layers", + type=int, + default=2, + help="Number of vision transformer layers", + ) + parser.add_argument( + "--force-recreate", + action="store_true", + help="Force recreation if output directory exists", + ) + parser.add_argument("--test", + action="store_true", + help="Test the created model with vLLM") + parser.add_argument("--profile", + action="store_true", + help="Profile the created model with vLLM") + parser.add_argument( + "--test-original", + action="store_true", + help="Test the original model with vLLM", + ) + parser.add_argument( + "--original-model", + default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + help="Original model name to base the reduction on", + ) + + args = parser.parse_args() + + if args.test: + test_dummy_maverick(original_model_name=args.original_model, + 
output_dir=args.output_dir,
+                            text_layers=args.text_layers,
+                            num_experts=args.num_experts,
+                            vision_layers=args.vision_layers,
+                            force_recreate=args.force_recreate,
+                            tp=2,
+                            ep=True,
+                            enforce_eager=True,
+                            profile=args.profile)
+
+    if args.test_original:
+        run_maverick_serving(args.original_model)
+
+
+if __name__ == "__main__":
+    exit(main())

From 304dce7ec02769ecea137091caa5413e1a4abf60 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Mon, 21 Jul 2025 12:10:30 -0400
Subject: [PATCH 49/57] [Attention] Clean up iRoPE in V1 (#21188)

Signed-off-by: Lucas Wilkinson
Co-authored-by: Michael Goin
---
 vllm/attention/layer.py                     | 7 +++++++
 vllm/v1/attention/backends/cpu_attn.py      | 5 -----
 vllm/v1/attention/backends/flash_attn.py    | 2 --
 vllm/v1/attention/backends/flashinfer.py    | 2 --
 vllm/v1/attention/backends/pallas.py        | 5 -----
 vllm/v1/attention/backends/rocm_aiter_fa.py | 2 --
 vllm/v1/attention/backends/triton_attn.py   | 6 ------
 vllm/v1/worker/gpu_model_runner.py          | 7 +++----
 vllm/v1/worker/tpu_model_runner.py          | 4 ++++
 9 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 5d8ffb8e82d3..1b80fa19d54f 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -137,6 +137,13 @@ def __init__(
         self.num_kv_heads = num_kv_heads
         self.sliding_window = sliding_window
 
+        # For v1 we have backend-agnostic iRoPE (local chunked attention).
+        # We have to store the flag on the layer so the gpu model runner can
+        # set the KVSpec appropriately (and pop it so it doesn't get passed
+        # to the backends).
+        if envs.VLLM_USE_V1:
+            self.use_irope = extra_impl_args.pop("use_irope", False)
+
         quant_method = quant_config.get_quant_method(
             self, prefix=prefix) if quant_config else None
         if quant_method is not None and not isinstance(
diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py
index 2efbe0de2725..3b6d753863d0 100644
--- a/vllm/v1/attention/backends/cpu_attn.py
+++ b/vllm/v1/attention/backends/cpu_attn.py
@@ -446,17 +446,12 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
         kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
             raise NotImplementedError("KV sharing is not supported in V0.")
         if logits_soft_cap is not None:
            logger.warning_once("Torch SPDA does not support logits soft cap.
" "Outputs may be slightly off.") - if use_irope: - logger.warning_once( - "Using irope in Torch SPDA is not supported yet, it will fall" - " back to global attention for long context.") self.paged_attn_impl = _get_paged_attn_impl() self.num_heads = num_heads self.head_size = head_size diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ad414ee0a1fc..5fe274f2c65b 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -352,7 +352,6 @@ def __init__( logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[str] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -381,7 +380,6 @@ def __init__( "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") - self.use_irope = use_irope self.vllm_flash_attn_version = get_flash_attn_version() if is_quantized_kv_cache(self.kv_cache_dtype) \ and not flash_attn_supports_fp8(): diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index e1ffa61a6005..953ef26c8143 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -493,7 +493,6 @@ def __init__( logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -509,7 +508,6 @@ def __init__( self.kv_cache_dtype = kv_cache_dtype self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - self.use_irope = use_irope self.num_queries_per_kv = self.num_heads // self.num_kv_heads diff --git a/vllm/v1/attention/backends/pallas.py b/vllm/v1/attention/backends/pallas.py index 9307cd937d5d..9b122136afb7 100644 --- a/vllm/v1/attention/backends/pallas.py +++ b/vllm/v1/attention/backends/pallas.py @@ -148,12 +148,7 @@ def __init__( logits_soft_cap: Optional[float] = None, attn_type: str = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: - if use_irope: - logger.warning_once( - "Using irope in Pallas is not supported yet, it will fall back " - "to global attention for long context.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) diff --git a/vllm/v1/attention/backends/rocm_aiter_fa.py b/vllm/v1/attention/backends/rocm_aiter_fa.py index 8f7567639449..0739d2596676 100644 --- a/vllm/v1/attention/backends/rocm_aiter_fa.py +++ b/vllm/v1/attention/backends/rocm_aiter_fa.py @@ -337,7 +337,6 @@ def __init__( logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -367,7 +366,6 @@ def __init__( "encoder/decoder cross-attention " "are not implemented for " "FlashAttentionImpl") - self.use_irope = use_irope if is_quantized_kv_cache(self.kv_cache_dtype): raise NotImplementedError( "AiterFlashAttention does not support fp8 kv-cache on this " diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index d65ff5ff74ec..83471ca51b73 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -72,9 +72,6 @@ def __init__(self, 
kv_cache_spec: AttentionSpec, vllm_config: VllmConfig, vllm_config.parallel_config) self.headdim = model_config.get_head_size() - self.attention_chunk_size = getattr(vllm_config.scheduler_config, - 'attention_chunk_size', None) - def build_for_cudagraph_capture( self, common_attn_metadata: CommonAttentionMetadata ) -> TritonAttentionMetadata: @@ -208,7 +205,6 @@ def __init__( logits_soft_cap: Optional[float] = None, attn_type: AttentionType = AttentionType.DECODER, kv_sharing_target_layer_name: Optional[int] = None, - use_irope: bool = False, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -228,8 +224,6 @@ def __init__( self.logits_soft_cap = logits_soft_cap self.kv_sharing_target_layer_name = kv_sharing_target_layer_name - self.use_irope = use_irope - self.num_queries_per_kv = self.num_heads // self.num_kv_heads TritonAttentionBackend.validate_head_size(head_size) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cd66d8bcd634..4c14ac3be3c0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2702,8 +2702,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: # TODO: Support other attention modules, e.g., cross-attention if attn_module.attn_type == AttentionType.DECODER: use_local_attention = (self.attention_chunk_size is not None - and getattr(attn_module.impl, - "use_irope", False)) + and attn_module.use_irope) if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -2716,13 +2715,13 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: "attention module can not be with ", "both local attention and sliding window") elif use_local_attention: - kv_cache_spec[layer_name] = (ChunkedLocalAttentionSpec( + kv_cache_spec[layer_name] = ChunkedLocalAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, head_size=attn_module.head_size, dtype=self.kv_cache_dtype, attention_chunk_size=self.attention_chunk_size, - use_mla=use_mla)) + use_mla=use_mla) else: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index aad45b6abd12..31e9cff91247 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -519,6 +519,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: continue if attn_module.attn_type == AttentionType.DECODER: + if attn_module.use_irope: + logger.warning_once( + "Using irope in Pallas is not supported yet, it " + "will fall back to global attention for long context.") if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, From 29d1ffc5b4c763ef76aff9e3f617fa60dd292418 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Mon, 21 Jul 2025 12:11:35 -0400 Subject: [PATCH 50/57] [DP] Fix Prometheus Logging (#21257) Signed-off-by: Robert Shaw Co-authored-by: Robert Shaw --- tests/v1/engine/test_async_llm.py | 7 +- tests/v1/test_async_llm_dp.py | 6 +- vllm/v1/engine/async_llm.py | 69 ++-- vllm/v1/engine/core_client.py | 9 +- vllm/v1/metrics/loggers.py | 541 +++++++++++++++++++----------- vllm/v1/metrics/ray_wrappers.py | 4 - 6 files changed, 378 insertions(+), 258 deletions(-) diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py index e137452f2625..412df3acff12 100644 --- a/tests/v1/engine/test_async_llm.py +++ 
b/tests/v1/engine/test_async_llm.py @@ -336,9 +336,10 @@ async def test_customize_loggers(monkeypatch): await engine.do_log_stats() - assert len(engine.stat_loggers) == 1 - assert len(engine.stat_loggers[0]) == 1 - engine.stat_loggers[0][0].log.assert_called_once() + stat_loggers = engine.logger_manager.per_engine_logger_dict + assert len(stat_loggers) == 1 + assert len(stat_loggers[0]) == 1 + stat_loggers[0][0].log.assert_called_once() @pytest.mark.asyncio(scope="module") diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/test_async_llm_dp.py index 64a41bec3791..6716d27f571f 100644 --- a/tests/v1/test_async_llm_dp.py +++ b/tests/v1/test_async_llm_dp.py @@ -90,8 +90,10 @@ class SimpleStatsLogger(StatLoggerBase): def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): stats_loggers[engine_index] = self - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): if iteration_stats: self.finished_req_count += len( iteration_stats.finished_requests) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6395d2c1875b..b8ba36f3502f 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -36,10 +36,9 @@ from vllm.v1.engine.parallel_sampling import ParentRequest from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.metrics.loggers import (StatLoggerBase, StatLoggerFactory, - setup_default_loggers) +from vllm.v1.metrics.loggers import StatLoggerFactory, StatLoggerManager from vllm.v1.metrics.prometheus import shutdown_prometheus -from vllm.v1.metrics.stats import IterationStats, SchedulerStats +from vllm.v1.metrics.stats import IterationStats logger = init_logger(__name__) @@ -95,14 +94,6 @@ def __init__( self.log_requests = log_requests self.log_stats = log_stats - # Set up stat loggers; independent set for each DP rank. - self.stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( - vllm_config=vllm_config, - log_stats=self.log_stats, - engine_num=vllm_config.parallel_config.data_parallel_size, - custom_stat_loggers=stat_loggers, - ) - # Tokenizer (+ ensure liveness if running in another process). self.tokenizer = init_tokenizer_from_configs( model_config=vllm_config.model_config, @@ -121,7 +112,6 @@ def __init__( log_stats=self.log_stats) # EngineCore (starts the engine in background process). - self.engine_core = EngineCoreClient.make_async_mp_client( vllm_config=vllm_config, executor_class=executor_class, @@ -129,9 +119,17 @@ def __init__( client_addresses=client_addresses, client_index=client_index, ) - if self.stat_loggers: - for stat_logger in self.stat_loggers[0]: - stat_logger.log_engine_initialized() + + # Loggers. + self.logger_manager: Optional[StatLoggerManager] = None + if self.log_stats: + self.logger_manager = StatLoggerManager( + vllm_config=vllm_config, + engine_idxs=self.engine_core.engine_ranks, + custom_stat_loggers=stat_loggers, + ) + self.logger_manager.log_engine_initialized() + self.output_handler: Optional[asyncio.Task] = None try: # Start output handler eagerly if we are in the asyncio eventloop. 
@@ -370,7 +368,7 @@ def _run_output_handler(self): engine_core = self.engine_core output_processor = self.output_processor log_stats = self.log_stats - stat_loggers = self.stat_loggers if log_stats else None + logger_manager = self.logger_manager async def output_handler(): try: @@ -410,9 +408,9 @@ async def output_handler(): # 4) Logging. # TODO(rob): make into a coroutine and launch it in # background thread once Prometheus overhead is non-trivial. - if stat_loggers: - AsyncLLM._record_stats( - stat_loggers[outputs.engine_index], + if logger_manager: + logger_manager.record( + engine_idx=outputs.engine_index, scheduler_stats=outputs.scheduler_stats, iteration_stats=iteration_stats, ) @@ -431,18 +429,6 @@ async def abort(self, request_id: str) -> None: if self.log_requests: logger.info("Aborted request %s.", request_id) - @staticmethod - def _record_stats( - stat_loggers: list[StatLoggerBase], - scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats], - ): - """static so that it can be used from the output_handler task - without a circular ref to AsyncLLM.""" - for stat_logger in stat_loggers: - stat_logger.record(scheduler_stats=scheduler_stats, - iteration_stats=iteration_stats) - async def encode( self, prompt: PromptType, @@ -547,9 +533,8 @@ async def do_log_stats( scheduler_outputs=None, model_output=None, ) -> None: - for loggers in self.stat_loggers: - for stat_logger in loggers: - stat_logger.log() + if self.logger_manager: + self.logger_manager.log() async def check_health(self) -> None: logger.debug("Called check_health.") @@ -653,18 +638,16 @@ async def scale_elastic_ep(self, new_data_parallel_size # recreate stat loggers - if new_data_parallel_size > old_data_parallel_size: - stat_loggers: list[list[StatLoggerBase]] = setup_default_loggers( + if new_data_parallel_size > old_data_parallel_size and self.log_stats: + # TODO(rob): fix this after talking with Ray team. + # This resets all the prometheus metrics since we + # unregister during initialization. Need to understand + # the intended behavior here better. + self.logger_manager = StatLoggerManager( vllm_config=self.vllm_config, - log_stats=self.log_stats, - engine_num=new_data_parallel_size, + engine_idxs=list(range(new_data_parallel_size)), custom_stat_loggers=None, ) - num_new_engines = len(stat_loggers) - len(self.stat_loggers) - self.stat_loggers.extend(stat_loggers[-num_new_engines:]) - else: - for _ in range(old_data_parallel_size - new_data_parallel_size): - self.stat_loggers.pop() @property def is_running(self) -> bool: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 82fc1fa9937c..2ebb76a97ebe 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -432,14 +432,15 @@ def __init__( external_dp_lb = parallel_config.data_parallel_external_lb offline_mode = parallel_config.data_parallel_rank_local is not None - engine_ranks = [dp_rank] if (offline_mode - or external_dp_lb) else range(dp_size) + self.engine_ranks = ([dp_rank] if + (offline_mode or external_dp_lb) else list( + range(dp_size))) assert parallel_config.data_parallel_size_local <= len( - engine_ranks) + self.engine_ranks) # ZMQ identity of each engine that this client will talk to. self.core_engines: list[EngineIdentity] = [ - index.to_bytes(2, "little") for index in engine_ranks + index.to_bytes(2, "little") for index in self.engine_ranks ] # Wait for ready messages from each engine on the input socket. 
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index c720ca13e51b..7f2556bab5a4 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -4,7 +4,7 @@ import logging import time from abc import ABC, abstractmethod -from typing import Callable, Optional +from typing import Callable, Optional, Union import numpy as np import prometheus_client @@ -35,8 +35,10 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ... @abstractmethod - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): ... @abstractmethod @@ -78,8 +80,10 @@ def _get_throughput(self, tracked_stats: list[int], now: float) -> float: # Compute summary metrics for tracked stats return float(np.sum(tracked_stats) / (now - self.last_log_time)) - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log Stats to standard output.""" if iteration_stats: @@ -146,233 +150,290 @@ class PrometheusStatLogger(StatLoggerBase): _histogram_cls = prometheus_client.Histogram _spec_decoding_cls = SpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): + def __init__(self, + vllm_config: VllmConfig, + engine_indexes: Optional[list[int]] = None): + if engine_indexes is None: + engine_indexes = [0] + self.engine_indexes = engine_indexes unregister_vllm_metrics() self.vllm_config = vllm_config - self.engine_index = engine_index # Use this flag to hide metrics that were deprecated in # a previous release and which will be removed future self.show_hidden_metrics = \ vllm_config.observability_config.show_hidden_metrics labelnames = ["model_name", "engine"] - labelvalues = [ - vllm_config.model_config.served_model_name, - str(engine_index) - ] - + model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len + if (len(self.engine_indexes) > 1 + and vllm_config.speculative_config is not None): + raise NotImplementedError("Prometheus metrics with Spec Decoding " + "with >1 EngineCore per AsyncLLM is not " + "supported yet.") + spec_decode_labelvalues = [ + vllm_config.model_config.served_model_name, + str(self.engine_indexes[0]) + ] self.spec_decoding_prom = self._spec_decoding_cls( - vllm_config.speculative_config, labelnames, labelvalues) + vllm_config.speculative_config, labelnames, + spec_decode_labelvalues) # # Scheduler state # - self.gauge_scheduler_running = self._gauge_cls( + gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_scheduler_running = make_per_engine(gauge_scheduler_running, + engine_indexes, + model_name) - self.gauge_scheduler_waiting = self._gauge_cls( + gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_scheduler_waiting = make_per_engine(gauge_scheduler_waiting, + engine_indexes, + model_name) # # GPU cache # # Deprecated in 0.9 - 
Renamed as vllm:kv_cache_usage_perc # TODO: in 0.10, only enable if show_hidden_metrics=True - self.gauge_gpu_cache_usage = self._gauge_cls( + gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation=( "GPU KV-cache usage. 1 means 100 percent usage." "DEPRECATED: Use vllm:kv_cache_usage_perc instead."), multiprocess_mode="mostrecent", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_gpu_cache_usage = make_per_engine(gauge_gpu_cache_usage, + engine_indexes, + model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_queries # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_queries = self._counter_cls( + counter_gpu_prefix_cache_queries = self._counter_cls( name="vllm:gpu_prefix_cache_queries", - documentation= - ("GPU prefix cache queries, in terms of number of queried tokens." - "DEPRECATED: Use vllm:prefix_cache_queries instead."), - labelnames=labelnames).labels(*labelvalues) + documentation=( + "GPU prefix cache queries, in terms of number of queried" + "tokens. DEPRECATED: Use vllm:prefix_cache_queries instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_queries = make_per_engine( + counter_gpu_prefix_cache_queries, engine_indexes, model_name) # Deprecated in 0.9 - Renamed as vllm:prefix_cache_hits # TODO: in 0.10, only enable if show_hidden_metrics=True - self.counter_gpu_prefix_cache_hits = self._counter_cls( + counter_gpu_prefix_cache_hits = self._counter_cls( name="vllm:gpu_prefix_cache_hits", documentation=( - "GPU prefix cache hits, in terms of number of cached tokens." - "DEPRECATED: Use vllm:prefix_cache_hits instead."), - labelnames=labelnames).labels(*labelvalues) + "GPU prefix cache hits, in terms of number of cached " + "tokens. DEPRECATED: Use vllm:prefix_cache_hits instead."), + labelnames=labelnames) + self.counter_gpu_prefix_cache_hits = make_per_engine( + counter_gpu_prefix_cache_hits, engine_indexes, model_name) - self.gauge_kv_cache_usage = self._gauge_cls( + gauge_kv_cache_usage = self._gauge_cls( name="vllm:kv_cache_usage_perc", documentation="KV-cache usage. 
1 means 100 percent usage.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.gauge_kv_cache_usage = make_per_engine(gauge_kv_cache_usage, + engine_indexes, model_name) - self.counter_prefix_cache_queries = self._counter_cls( + counter_prefix_cache_queries = self._counter_cls( name="vllm:prefix_cache_queries", documentation=( "Prefix cache queries, in terms of number of queried tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prefix_cache_queries = make_per_engine( + counter_prefix_cache_queries, engine_indexes, model_name) - self.counter_prefix_cache_hits = self._counter_cls( + counter_prefix_cache_hits = self._counter_cls( name="vllm:prefix_cache_hits", documentation=( "Prefix cache hits, in terms of number of cached tokens."), - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prefix_cache_hits = make_per_engine( + counter_prefix_cache_hits, engine_indexes, model_name) # # Counters # - self.counter_num_preempted_reqs = self._counter_cls( + counter_num_preempted_reqs = self._counter_cls( name="vllm:num_preemptions", documentation="Cumulative number of preemption from the engine.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_num_preempted_reqs = make_per_engine( + counter_num_preempted_reqs, engine_indexes, model_name) - self.counter_prompt_tokens = self._counter_cls( + counter_prompt_tokens = self._counter_cls( name="vllm:prompt_tokens", documentation="Number of prefill tokens processed.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_prompt_tokens = make_per_engine(counter_prompt_tokens, + engine_indexes, + model_name) - self.counter_generation_tokens = self._counter_cls( + counter_generation_tokens = self._counter_cls( name="vllm:generation_tokens", documentation="Number of generation tokens processed.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames) + self.counter_generation_tokens = make_per_engine( + counter_generation_tokens, engine_indexes, model_name) - self.counter_request_success: dict[FinishReason, - prometheus_client.Counter] = {} + self.counter_request_success: dict[FinishReason, dict[ + int, prometheus_client.Counter]] = {} counter_request_success_base = self._counter_cls( name="vllm:request_success", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) for reason in FinishReason: - self.counter_request_success[ - reason] = counter_request_success_base.labels(*(labelvalues + - [str(reason)])) + self.counter_request_success[reason] = { + idx: + counter_request_success_base.labels(model_name, str(idx), + str(reason)) + for idx in engine_indexes + } # # Histograms of counts # - self.histogram_num_prompt_tokens_request = \ - self._histogram_cls( - name="vllm:request_prompt_tokens", - documentation="Number of prefill tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) - - self.histogram_num_generation_tokens_request = \ - self._histogram_cls( - name="vllm:request_generation_tokens", - documentation="Number of generation tokens processed.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_num_prompt_tokens_request = self._histogram_cls( + name="vllm:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + 
self.histogram_num_prompt_tokens_request = make_per_engine( + histogram_num_prompt_tokens_request, engine_indexes, model_name) + + histogram_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_generation_tokens", + documentation="Number of generation tokens processed.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_num_generation_tokens_request = make_per_engine( + histogram_num_generation_tokens_request, engine_indexes, + model_name) # TODO: This metric might be incorrect in case of using multiple # api_server counts which uses prometheus mp. # See: https://github.com/vllm-project/vllm/pull/18053 - self.histogram_iteration_tokens = \ - self._histogram_cls( - name="vllm:iteration_tokens_total", - documentation="Histogram of number of tokens per engine_step.", - buckets=[ - 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, - 16384 - ], - labelnames=labelnames).labels(*labelvalues) - - self.histogram_max_num_generation_tokens_request = \ - self._histogram_cls( - name="vllm:request_max_num_generation_tokens", - documentation= - "Histogram of maximum number of requested generation tokens.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) - - self.histogram_n_request = \ - self._histogram_cls( - name="vllm:request_params_n", - documentation="Histogram of the n request parameter.", - buckets=[1, 2, 5, 10, 20], - labelnames=labelnames).labels(*labelvalues) - - self.histogram_max_tokens_request = \ - self._histogram_cls( - name="vllm:request_params_max_tokens", - documentation="Histogram of the max_tokens request parameter.", - buckets=build_1_2_5_buckets(max_model_len), - labelnames=labelnames).labels(*labelvalues) + histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + buckets=[ + 1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384 + ], + labelnames=labelnames) + self.histogram_iteration_tokens = make_per_engine( + histogram_iteration_tokens, engine_indexes, model_name) + + histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_num_generation_tokens_request = make_per_engine( + histogram_max_num_generation_tokens_request, engine_indexes, + model_name) + + histogram_n_request = self._histogram_cls( + name="vllm:request_params_n", + documentation="Histogram of the n request parameter.", + buckets=[1, 2, 5, 10, 20], + labelnames=labelnames) + self.histogram_n_request = make_per_engine(histogram_n_request, + engine_indexes, model_name) + + histogram_max_tokens_request = self._histogram_cls( + name="vllm:request_params_max_tokens", + documentation="Histogram of the max_tokens request parameter.", + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames) + self.histogram_max_tokens_request = make_per_engine( + histogram_max_tokens_request, engine_indexes, model_name) # # Histogram of timing intervals # - self.histogram_time_to_first_token = \ - self._histogram_cls( - name="vllm:time_to_first_token_seconds", - documentation="Histogram of time to first token in seconds.", - buckets=[ - 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, - 640.0, 2560.0 - ], - 
labelnames=labelnames).labels(*labelvalues) - - self.histogram_time_per_output_token = \ - self._histogram_cls( - name="vllm:time_per_output_token_seconds", - documentation="Histogram of time per output token in seconds.", - buckets=[ - 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, - 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 - ], - labelnames=labelnames).labels(*labelvalues) + histogram_time_to_first_token = self._histogram_cls( + name="vllm:time_to_first_token_seconds", + documentation="Histogram of time to first token in seconds.", + buckets=[ + 0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, + 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0, + 2560.0 + ], + labelnames=labelnames) + self.histogram_time_to_first_token = make_per_engine( + histogram_time_to_first_token, engine_indexes, model_name) + + histogram_time_per_output_token = self._histogram_cls( + name="vllm:time_per_output_token_seconds", + documentation="Histogram of time per output token in seconds.", + buckets=[ + 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, + 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0 + ], + labelnames=labelnames) + self.histogram_time_per_output_token = make_per_engine( + histogram_time_per_output_token, engine_indexes, model_name) request_latency_buckets = [ 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] - self.histogram_e2e_time_request = \ - self._histogram_cls( - name="vllm:e2e_request_latency_seconds", - documentation="Histogram of e2e request latency in seconds.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_queue_time_request = \ - self._histogram_cls( - name="vllm:request_queue_time_seconds", - documentation= - "Histogram of time spent in WAITING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_inference_time_request = \ - self._histogram_cls( - name="vllm:request_inference_time_seconds", - documentation= - "Histogram of time spent in RUNNING phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_prefill_time_request = \ - self._histogram_cls( - name="vllm:request_prefill_time_seconds", - documentation= - "Histogram of time spent in PREFILL phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) - self.histogram_decode_time_request = \ - self._histogram_cls( - name="vllm:request_decode_time_seconds", - documentation= - "Histogram of time spent in DECODE phase for request.", - buckets=request_latency_buckets, - labelnames=labelnames).labels(*labelvalues) + histogram_e2e_time_request = self._histogram_cls( + name="vllm:e2e_request_latency_seconds", + documentation="Histogram of e2e request latency in seconds.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_e2e_time_request = make_per_engine( + histogram_e2e_time_request, engine_indexes, model_name) + + histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_queue_time_request = make_per_engine( + histogram_queue_time_request, engine_indexes, model_name) + + histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + 
documentation= + "Histogram of time spent in RUNNING phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_inference_time_request = make_per_engine( + histogram_inference_time_request, engine_indexes, model_name) + + histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_prefill_time_request = make_per_engine( + histogram_prefill_time_request, engine_indexes, model_name) + + histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + buckets=request_latency_buckets, + labelnames=labelnames) + self.histogram_decode_time_request = make_per_engine( + histogram_decode_time_request, engine_indexes, model_name) # # LoRA metrics @@ -382,6 +443,9 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # api_server counts which uses prometheus mp. self.gauge_lora_info: Optional[prometheus_client.Gauge] = None if vllm_config.lora_config is not None: + if len(self.engine_indexes) > 1: + raise NotImplementedError( + "LoRA in DP mode is not supported yet.") self.labelname_max_lora = "max_lora" self.labelname_waiting_lora_adapters = "waiting_lora_adapters" self.labelname_running_lora_adapters = "running_lora_adapters" @@ -399,9 +463,8 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): ) def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): - metrics_info = config_obj.metrics_info() - metrics_info["engine"] = self.engine_index + metrics_info["engine"] = "" name, documentation = None, None if type == "cache_config": @@ -417,27 +480,36 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): documentation=documentation, multiprocess_mode="mostrecent", labelnames=metrics_info.keys(), - ).labels(**metrics_info) - info_gauge.set(1) - - def record(self, scheduler_stats: Optional[SchedulerStats], - iteration_stats: Optional[IterationStats]): + ) + for engine_index in self.engine_indexes: + metrics_info = config_obj.metrics_info() + metrics_info["engine"] = str(engine_index) + info_gauge.labels(**metrics_info).set(1) + + def record(self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: int = 0): """Log to prometheus.""" if scheduler_stats is not None: - self.gauge_scheduler_running.set(scheduler_stats.num_running_reqs) - self.gauge_scheduler_waiting.set(scheduler_stats.num_waiting_reqs) + self.gauge_scheduler_running[engine_idx].set( + scheduler_stats.num_running_reqs) + self.gauge_scheduler_waiting[engine_idx].set( + scheduler_stats.num_waiting_reqs) - self.gauge_gpu_cache_usage.set(scheduler_stats.kv_cache_usage) - self.gauge_kv_cache_usage.set(scheduler_stats.kv_cache_usage) + self.gauge_gpu_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) + self.gauge_kv_cache_usage[engine_idx].set( + scheduler_stats.kv_cache_usage) - self.counter_gpu_prefix_cache_queries.inc( + self.counter_gpu_prefix_cache_queries[engine_idx].inc( scheduler_stats.prefix_cache_stats.queries) - self.counter_gpu_prefix_cache_hits.inc( + self.counter_gpu_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) - self.counter_prefix_cache_queries.inc( + self.counter_prefix_cache_queries[engine_idx].inc( 
scheduler_stats.prefix_cache_stats.queries) - self.counter_prefix_cache_hits.inc( + self.counter_prefix_cache_hits[engine_idx].inc( scheduler_stats.prefix_cache_stats.hits) if scheduler_stats.spec_decoding_stats is not None: @@ -447,42 +519,45 @@ def record(self, scheduler_stats: Optional[SchedulerStats], if iteration_stats is None: return - self.counter_num_preempted_reqs.inc(iteration_stats.num_preempted_reqs) - self.counter_prompt_tokens.inc(iteration_stats.num_prompt_tokens) - self.counter_generation_tokens.inc( + self.counter_num_preempted_reqs[engine_idx].inc( + iteration_stats.num_preempted_reqs) + self.counter_prompt_tokens[engine_idx].inc( + iteration_stats.num_prompt_tokens) + self.counter_generation_tokens[engine_idx].inc( iteration_stats.num_generation_tokens) - self.histogram_iteration_tokens.observe( + self.histogram_iteration_tokens[engine_idx].observe( iteration_stats.num_prompt_tokens + \ iteration_stats.num_generation_tokens) for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter: - self.histogram_max_num_generation_tokens_request.observe( - max_gen_tokens) + self.histogram_max_num_generation_tokens_request[ + engine_idx].observe(max_gen_tokens) for n_param in iteration_stats.n_params_iter: - self.histogram_n_request.observe(n_param) + self.histogram_n_request[engine_idx].observe(n_param) for ttft in iteration_stats.time_to_first_tokens_iter: - self.histogram_time_to_first_token.observe(ttft) + self.histogram_time_to_first_token[engine_idx].observe(ttft) for tpot in iteration_stats.time_per_output_tokens_iter: - self.histogram_time_per_output_token.observe(tpot) + self.histogram_time_per_output_token[engine_idx].observe(tpot) for finished_request in iteration_stats.finished_requests: - self.counter_request_success[finished_request.finish_reason].inc() - self.histogram_e2e_time_request.observe( + self.counter_request_success[ + finished_request.finish_reason][engine_idx].inc() + self.histogram_e2e_time_request[engine_idx].observe( finished_request.e2e_latency) - self.histogram_queue_time_request.observe( + self.histogram_queue_time_request[engine_idx].observe( finished_request.queued_time) - self.histogram_prefill_time_request.observe( + self.histogram_prefill_time_request[engine_idx].observe( finished_request.prefill_time) - self.histogram_inference_time_request.observe( + self.histogram_inference_time_request[engine_idx].observe( finished_request.inference_time) - self.histogram_decode_time_request.observe( + self.histogram_decode_time_request[engine_idx].observe( finished_request.decode_time) - self.histogram_num_prompt_tokens_request.observe( + self.histogram_num_prompt_tokens_request[engine_idx].observe( finished_request.num_prompt_tokens) - self.histogram_num_generation_tokens_request.observe( + self.histogram_num_generation_tokens_request[engine_idx].observe( finished_request.num_generation_tokens) if finished_request.max_tokens_param: - self.histogram_max_tokens_request.observe( + self.histogram_max_tokens_request[engine_idx].observe( finished_request.max_tokens_param) if self.gauge_lora_info is not None: @@ -502,6 +577,18 @@ def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) +PromMetric = Union[ + prometheus_client.Gauge, + prometheus_client.Counter, + prometheus_client.Histogram, +] + + +def make_per_engine(metric: PromMetric, engine_idxs: list[int], + model_name: str) -> dict[int, PromMetric]: + return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs} + + def 
build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by @@ -529,29 +616,79 @@ def build_1_2_5_buckets(max_value: int) -> list[int]: return build_buckets([1, 2, 5], max_value) -def setup_default_loggers( - vllm_config: VllmConfig, - log_stats: bool, - engine_num: int, - custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, -) -> list[list[StatLoggerBase]]: - """Setup logging and prometheus metrics.""" - if not log_stats: - return [] - - factories: list[StatLoggerFactory] - if custom_stat_loggers is not None: - factories = custom_stat_loggers - else: - factories = [PrometheusStatLogger] - if logger.isEnabledFor(logging.INFO): - factories.append(LoggingStatLogger) - - stat_loggers: list[list[StatLoggerBase]] = [] - for i in range(engine_num): - per_engine_stat_loggers: list[StatLoggerBase] = [] - for logger_factory in factories: - per_engine_stat_loggers.append(logger_factory(vllm_config, i)) - stat_loggers.append(per_engine_stat_loggers) - - return stat_loggers +class StatLoggerManager: + """ + StatLoggerManager: + Logging happens at the level of the EngineCore (per scheduler). + * DP: >1 EngineCore per AsyncLLM - loggers for each EngineCore. + * With Local Logger, just make N copies for N EngineCores. + * With Prometheus, we need a single logger with N "labels" + + This class abstracts away this implementation detail from + the AsyncLLM, allowing the AsyncLLM to just call .record() + and .log() to a simple interface. + """ + + def __init__( + self, + vllm_config: VllmConfig, + engine_idxs: Optional[list[int]] = None, + custom_stat_loggers: Optional[list[StatLoggerFactory]] = None, + ): + self.engine_idxs = engine_idxs if engine_idxs else [0] + + factories: list[StatLoggerFactory] + if custom_stat_loggers is not None: + factories = custom_stat_loggers + else: + factories = [] + if logger.isEnabledFor(logging.INFO): + factories.append(LoggingStatLogger) + + # engine_idx: StatLogger + self.per_engine_logger_dict: dict[int, list[StatLoggerBase]] = {} + prometheus_factory = PrometheusStatLogger + for engine_idx in self.engine_idxs: + loggers: list[StatLoggerBase] = [] + for logger_factory in factories: + # If we get a custom prometheus logger, use that + # instead. This is typically used for the ray case. + if (isinstance(logger_factory, type) + and issubclass(logger_factory, PrometheusStatLogger)): + prometheus_factory = logger_factory + continue + loggers.append(logger_factory(vllm_config, + engine_idx)) # type: ignore + self.per_engine_logger_dict[engine_idx] = loggers + + # For Prometheus, need to share the metrics between EngineCores. + # Each EngineCore's metrics are expressed as a unique label. 
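+        # (For example, with two EngineCores the same gauge family is
+        #  exported twice, once with engine="0" and once with engine="1".)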
+ self.prometheus_logger = prometheus_factory(vllm_config, engine_idxs) + + def record( + self, + scheduler_stats: Optional[SchedulerStats], + iteration_stats: Optional[IterationStats], + engine_idx: Optional[int] = None, + ): + if engine_idx is None: + engine_idx = 0 + + per_engine_loggers = self.per_engine_logger_dict[engine_idx] + for logger in per_engine_loggers: + logger.record(scheduler_stats, iteration_stats, engine_idx) + + self.prometheus_logger.record(scheduler_stats, iteration_stats, + engine_idx) + + def log(self): + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log() + + def log_engine_initialized(self): + self.prometheus_logger.log_engine_initialized() + + for per_engine_loggers in self.per_engine_logger_dict.values(): + for logger in per_engine_loggers: + logger.log_engine_initialized() diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index 8384310062dd..ae8f9447e9c8 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -3,7 +3,6 @@ import time from typing import Optional, Union -from vllm.config import VllmConfig from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.spec_decode.metrics import SpecDecodingProm @@ -128,9 +127,6 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm - def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): - super().__init__(vllm_config, engine_index) - @staticmethod def _unregister_vllm_metrics(): # No-op on purpose From 8b296c3b37af9e913edfeac382e22c0157f969d8 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Thu, 10 Jul 2025 17:22:15 -0400 Subject: [PATCH 51/57] docs: Update docs article with usage patterns Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 124 ++++++++++++++++++++++++++- 1 file changed, 120 insertions(+), 4 deletions(-) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 6ea61b080cda..d4f62d7e32e3 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -1,13 +1,129 @@ -# Loading models with CoreWeave's Tensorizer +--- +title: Loading models with CoreWeave's Tensorizer +--- +[](){ #tensorizer } vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized at runtime extremely quickly directly to the GPU, resulting in significantly shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. -For more information on CoreWeave's Tensorizer, please refer to -[CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see -the [vLLM example script](../../examples/others/tensorize_vllm_model.md). +vLLM fully integrates Tensorizer in to its model loading machinery. The +following will give a brief overview on how to get started with using +Tensorizer on vLLM. + +## The basics +To load a model using Tensorizer, it first needs to be serialized by Tensorizer. +The example script in [examples/others/tensorize_vllm_model.py] takes care of +this process. 
+(https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html) + +Let's walk through a basic example by serializing `facebook/opt-125m` using the +script, and then loading it for inference. + +## Saving a vLLM model with Tensorizer +To save a model with Tensorizer, call the example script with the necessary +CLI arguments. The docstring for the script itself explains the CLI args +and how to use it properly in great detail, and we'll use one of the +examples from the docstring directly, assuming we want to save our model at +our S3 bucket example `s3://my-bucket`: + +```bash +python examples/others/tensorize_vllm_model.py \ + --model facebook/opt-125m \ + serialize \ + --serialized-directory s3://my-bucket \ + --suffix v1 +``` + +This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If +you intend on applying a LoRA adapter to your tensorized model, you can pass +the HF id of the LoRA adapter in the above command, and the artifacts will be +saved there too: + +```bash +python examples/others/tensorize_vllm_model.py \ + --model facebook/opt-125m \ + --lora-path \ + serialize \ + --serialized-directory s3://my-bucket \ + --suffix v1 +``` + +## Serving the model using Tensorizer +Once the model is serialized where you want it, you can load the model using +`vllm serve` or the `LLM` entrypoint. The directory where the +model artifacts were saved can be passed to the `model` argument for +`LLM()` and `vllm serve`. For example, to serve the tensorized model +saved previously with the LoRA adapter, you'd do: + +```bash +vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ + --load-format tensorizer \ + --enable-lora +``` + +Or, with `LLM()`: + +```python +from vllm import LLM +llm = LLM( + "s3://my-bucket/vllm/facebook/opt-125m/v1", + load_format="tensorizer", + enable_lora=True +) +``` + +`tensorizer`'s core objects that serialize and deserialize models are +`TensorSerializer` and `TensorDeserializer` respectively. In order to +pass arbitrary kwargs to these, which will configure the serialization +and deserialization processes, you can provide them as keys to +`model_loader_extra_config` with `serialization_kwargs` and +`deserialization_kwargs` respectively. Full docstrings detailing all +parameters for the aforementioned objects can be found in `tensorizer`'s +[serialization.py](https://github. +com/coreweave/tensorizer/blob/main/tensorizer/serialization.py) file. + +As an example, CPU concurrency can be limited when serializing with +`tensorizer` via the `limit_cpu_concurrency` parameter in the +initializer for `TensorSerializer`. 
To set `limit_cpu_concurrency` to +some arbitrary value, you would do so like this when serializing: + +```bash +python examples/others/tensorize_vllm_model.py \ + --model facebook/opt-125m \ + --lora-path \ + serialize \ + --serialized-directory s3://my-bucket \ + --serialization-kwargs '{"limit_cpu_concurrency": 2}' \ + --suffix v1 +``` + +As an example when customizing the loading process via `TensorDeserializer`, +one could limit the number of concurrency readers during +deserialization with the `num_readers` parameter in the initializer +via `model_loader_extra_config` like so: + +```bash +vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ + --load-format tensorizer \ + --enable-lora \ + --model-loader-extra-config '{"deserialization_kwargs": {"num_readers": 2}}' +``` + +Or with `LLM()`: + +```python +from vllm import LLM +llm = LLM( + "s3://my-bucket/vllm/facebook/opt-125m/v1", + load_format="tensorizer", + enable_lora=True, + model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} +) +``` + + !!! note Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. From 110a6fdbe8b9ef234ef482e3779bfdf610ff223d Mon Sep 17 00:00:00 2001 From: William Goldby Date: Fri, 11 Jul 2025 10:45:49 -0700 Subject: [PATCH 52/57] fix: Rename headings and move content around Signed-off-by: William Goldby Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 66 +++++++++------------------- 1 file changed, 20 insertions(+), 46 deletions(-) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index d4f62d7e32e3..737711c3b926 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -8,25 +8,24 @@ vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or at runtime extremely quickly directly to the GPU, resulting in significantly shorter Pod startup times and CPU memory usage. Tensor encryption is also supported. -vLLM fully integrates Tensorizer in to its model loading machinery. The -following will give a brief overview on how to get started with using -Tensorizer on vLLM. +vLLM fully integrates Tensorizer in to its model loading machinery. The following will give a brief overview on how to get started with using Tensorizer on vLLM. + +## Installing Tensorizer + +To install `tensorizer`, run `pip install vllm[tensorizer]`. ## The basics -To load a model using Tensorizer, it first needs to be serialized by Tensorizer. -The example script in [examples/others/tensorize_vllm_model.py] takes care of -this process. -(https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html) -Let's walk through a basic example by serializing `facebook/opt-125m` using the -script, and then loading it for inference. +To load a model using Tensorizer, the model first needs to be serialized by +Tensorizer. [The example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html) takes care of this process. + +Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference. -## Saving a vLLM model with Tensorizer -To save a model with Tensorizer, call the example script with the necessary +## Serializing a vLLM model with Tensorizer + +To serialize a model with Tensorizer, call the example script with the necessary CLI arguments. 
The docstring for the script itself explains the CLI args -and how to use it properly in great detail, and we'll use one of the -examples from the docstring directly, assuming we want to save our model at -our S3 bucket example `s3://my-bucket`: +and how to use it properly in great detail, and we'll use one of the examples from the docstring directly, assuming we want to serialize and save our model at our S3 bucket example `s3://my-bucket`: ```bash python examples/others/tensorize_vllm_model.py \ @@ -36,10 +35,7 @@ python examples/others/tensorize_vllm_model.py \ --suffix v1 ``` -This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If -you intend on applying a LoRA adapter to your tensorized model, you can pass -the HF id of the LoRA adapter in the above command, and the artifacts will be -saved there too: +This saves the model tensors at `s3://my-bucket/vllm/facebook/opt-125m/v1`. If you intend on applying a LoRA adapter to your tensorized model, you can pass the HF id of the LoRA adapter in the above command, and the artifacts will be saved there too: ```bash python examples/others/tensorize_vllm_model.py \ @@ -51,11 +47,8 @@ python examples/others/tensorize_vllm_model.py \ ``` ## Serving the model using Tensorizer -Once the model is serialized where you want it, you can load the model using -`vllm serve` or the `LLM` entrypoint. The directory where the -model artifacts were saved can be passed to the `model` argument for -`LLM()` and `vllm serve`. For example, to serve the tensorized model -saved previously with the LoRA adapter, you'd do: + +Once the model is serialized where you want it, you can load the model using `vllm serve` or the `LLM` entrypoint. You can pass the directory where you saved the model to the `model` argument for `LLM()` and `vllm serve`. For example, to serve the tensorized model saved previously with the LoRA adapter, you'd do: ```bash vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ @@ -74,20 +67,9 @@ llm = LLM( ) ``` -`tensorizer`'s core objects that serialize and deserialize models are -`TensorSerializer` and `TensorDeserializer` respectively. In order to -pass arbitrary kwargs to these, which will configure the serialization -and deserialization processes, you can provide them as keys to -`model_loader_extra_config` with `serialization_kwargs` and -`deserialization_kwargs` respectively. Full docstrings detailing all -parameters for the aforementioned objects can be found in `tensorizer`'s -[serialization.py](https://github. -com/coreweave/tensorizer/blob/main/tensorizer/serialization.py) file. - -As an example, CPU concurrency can be limited when serializing with -`tensorizer` via the `limit_cpu_concurrency` parameter in the -initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to -some arbitrary value, you would do so like this when serializing: +`tensorizer`'s core objects that serialize and deserialize models are `TensorSerializer` and `TensorDeserializer` respectively. In order to pass arbitrary kwargs to these, which will configure the serialization and deserialization processes, you can provide them as keys to `model_loader_extra_config` with `serialization_kwargs` and `deserialization_kwargs` respectively. Full docstrings detailing all parameters for the aforementioned objects can be found in `tensorizer`'s [serialization.py](https://github.com/coreweave/tensorizer/blob/main/tensorizer/serialization.py) file. 
+ +As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing: ```bash python examples/others/tensorize_vllm_model.py \ @@ -99,10 +81,7 @@ python examples/others/tensorize_vllm_model.py \ --suffix v1 ``` -As an example when customizing the loading process via `TensorDeserializer`, -one could limit the number of concurrency readers during -deserialization with the `num_readers` parameter in the initializer -via `model_loader_extra_config` like so: +As an example when customizing the loading process via `TensorDeserializer`, you could limit the number of concurrency readers during deserialization with the `num_readers` parameter in the initializer via `model_loader_extra_config` like so: ```bash vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ @@ -122,8 +101,3 @@ llm = LLM( model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} ) ``` - - - -!!! note - Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`. From 480e84eb8ebca7c72310f299ca7a4ad36cc7f483 Mon Sep 17 00:00:00 2001 From: William Goldby Date: Tue, 15 Jul 2025 15:16:04 -0700 Subject: [PATCH 53/57] fix: Add title for Tensorizer configuration Signed-off-by: William Goldby Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 737711c3b926..28007037db0e 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -67,6 +67,8 @@ llm = LLM( ) ``` +## Options for configuring Tensorizer + `tensorizer`'s core objects that serialize and deserialize models are `TensorSerializer` and `TensorDeserializer` respectively. In order to pass arbitrary kwargs to these, which will configure the serialization and deserialization processes, you can provide them as keys to `model_loader_extra_config` with `serialization_kwargs` and `deserialization_kwargs` respectively. Full docstrings detailing all parameters for the aforementioned objects can be found in `tensorizer`'s [serialization.py](https://github.com/coreweave/tensorizer/blob/main/tensorizer/serialization.py) file. As an example, CPU concurrency can be limited when serializing with `tensorizer` via the `limit_cpu_concurrency` parameter in the initializer for `TensorSerializer`. 
To set `limit_cpu_concurrency` to some arbitrary value, you would do so like this when serializing: From b2efb9ff86119bbbd62a32fc813b87fb28a4e178 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 18 Jul 2025 10:20:44 -0400 Subject: [PATCH 54/57] docs: Update example file docstring Signed-off-by: Sanger Steel --- examples/others/tensorize_vllm_model.py | 31 ++++++++++++++----------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 64a6c42ae235..8c33114780e3 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -84,18 +84,24 @@ Once a model is serialized, tensorizer can be invoked with the `LLM` class directly to load models: - llm = LLM(model="facebook/opt-125m", - load_format="tensorizer", - model_loader_extra_config=TensorizerConfig( - tensorizer_uri = path_to_tensors, - num_readers=3, - ) - ) +```python +from vllm import LLM +llm = LLM( + "s3://my-bucket/vllm/facebook/opt-125m/v1", + load_format="tensorizer", + enable_lora=True +) +``` + A serialized model can be used during model loading for the vLLM OpenAI -inference server. `model_loader_extra_config` is exposed as the CLI arg -`--model-loader-extra-config`, and accepts a JSON string literal of the -TensorizerConfig arguments desired. +inference server: + +``` +vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ + --load-format tensorizer \ + --enable-lora +``` In order to see all of the available arguments usable to configure loading with tensorizer that are given to `TensorizerConfig`, run: @@ -116,10 +122,9 @@ `--enable-lora`. For instance: ``` -vllm serve \ +vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ --load-format tensorizer \ - --model-loader-extra-config '{"tensorizer_uri": ".tensors"}' \ - --enable-lora + --enable-lora ``` """ From c1ba86c9e664c5264e25db7332caf11edb168640 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Fri, 18 Jul 2025 13:12:21 -0400 Subject: [PATCH 55/57] docs: Revert original markdown title Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 28007037db0e..3e01be2d8b9a 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -1,7 +1,4 @@ ---- -title: Loading models with CoreWeave's Tensorizer ---- -[](){ #tensorizer } +# Loading models with CoreWeave's Tensorizer vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer). vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized From dfe285087eec16ef06452c6152733652f95a1bf9 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Mon, 21 Jul 2025 13:41:35 -0400 Subject: [PATCH 56/57] style: Run linter Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index 3e01be2d8b9a..ffb3a7d55f85 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -13,7 +13,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`. 
## The basics -To load a model using Tensorizer, the model first needs to be serialized by +To load a model using Tensorizer, the model first needs to be serialized by Tensorizer. [The example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html) takes care of this process. Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference. From c67e148c6d916f9087b5cb5adc75f4ff6455e1c3 Mon Sep 17 00:00:00 2001 From: Sanger Steel Date: Wed, 23 Jul 2025 14:25:20 -0400 Subject: [PATCH 57/57] docs: Resolve suggested changes from review Signed-off-by: Sanger Steel --- docs/models/extensions/tensorizer.md | 2 +- examples/others/tensorize_vllm_model.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index ffb3a7d55f85..f70ab0c6f4e5 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -14,7 +14,7 @@ To install `tensorizer`, run `pip install vllm[tensorizer]`. ## The basics To load a model using Tensorizer, the model first needs to be serialized by -Tensorizer. [The example script](https://docs.vllm.ai/en/latest/examples/others/tensorize_vllm_model.html) takes care of this process. +Tensorizer. [The example script](../../examples/others/tensorize_vllm_model.md) takes care of this process. Let's walk through a basic example by serializing `facebook/opt-125m` using the script, and then loading it for inference. diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index 8c33114780e3..559c7c493aca 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -88,8 +88,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", - load_format="tensorizer", - enable_lora=True + load_format="tensorizer" ) ``` @@ -99,8 +98,7 @@ ``` vllm serve s3://my-bucket/vllm/facebook/opt-125m/v1 \ - --load-format tensorizer \ - --enable-lora + --load-format tensorizer ``` In order to see all of the available arguments usable to configure