ModelTC · STwangyingrui · Jul 7, 2025 · Jul 9, 2025 · gemini-code-assist · Jul 8, 2025
diff --git a/lightllm/common/basemodel/basemodel.py b/lightllm/common/basemodel/basemodel.py
@@ -346,6 +346,10 @@ def _decode(
     ) -> ModelOutput:
         if self.graph is not None and self.graph.can_run(model_input.batch_size, model_input.max_len_in_batch):
             find_graph_batch_size = self.graph.find_closest_graph_batch_size(model_input.batch_size)
+            if find_graph_batch_size is None:
+                logger.error("No suitable graph batch size found for batch_size={model_input.batch_size}, return None.")
+                return None
+
             padded_model_input = self._create_padded_decode_model_input(model_input, find_graph_batch_size)
             infer_state = self._create_inferstate(padded_model_input)
             copy_kv_index_to_req(
@@ -356,7 +360,9 @@ def _decode(
             )
             infer_state.init_some_extra_state(self, padded_model_input.input_ids)
 
-            if self.graph.need_capture(find_graph_batch_size):
+            # Check if a graph needs to be captured.
+            # get_graph returns None if a graph for the batch_size doesn't exist.
+            if self.graph.get_graph(find_graph_batch_size) is None:
                 infer_state.is_cuda_graph = True
                 model_output: ModelOutput = self.graph.capture_decode(
                     self._token_forward, padded_model_input.input_ids, infer_state
@@ -497,6 +503,10 @@ def microbatch_overlap_decode(self, model_input0: ModelInput, model_input1: Mode
 
         if self.graph is not None and self.graph.can_run(origin_batch_size, max_len_in_batch):
             find_graph_batch_size = self.graph.find_closest_graph_batch_size(origin_batch_size)
+            if find_graph_batch_size is None:
+                logger.error("No suitable graph batch size found for batch_size={origin_batch_size}, return None.")
+                return None
+
             padded_model_input0 = self._create_padded_decode_model_input(model_input0, find_graph_batch_size)
             padded_model_input1 = self._create_padded_decode_model_input(model_input1, find_graph_batch_size)
             infer_state0 = self._create_inferstate(padded_model_input0, 0)
@@ -516,7 +526,9 @@ def microbatch_overlap_decode(self, model_input0: ModelInput, model_input1: Mode
             )
             infer_state1.init_some_extra_state(self, padded_model_input1.input_ids)
 
-            if self.graph.need_capture(find_graph_batch_size):
+            # Check if a graph needs to be captured.
+            # get_graph returns None if a graph for the batch_size doesn't exist.
+            if self.graph.get_graph(find_graph_batch_size) is None:
                 infer_state0.is_cuda_graph = True
                 infer_state1.is_cuda_graph = True
 

diff --git a/lightllm/common/basemodel/cuda_graph.py b/lightllm/common/basemodel/cuda_graph.py
@@ -2,27 +2,29 @@
 import torch
 import copy
 import bisect
+from collections import OrderedDict
 from typing import Optional
 from lightllm.utils.log_utils import init_logger
 from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.distributed import dist_group_manager, lightllm_capture_graph, CustomProcessGroup
 from lightllm.common.basemodel.batch_objs import ModelInput, ModelOutput
 from .infer_struct import InferStateInfo
 
-
 logger = init_logger(__name__)
 
 
 class CudaGraph:
     # CudaGraph forward pass for the decoding stage.
 
     def __init__(self, max_batch_size=8, max_len_in_batch=8192):
-        self.graph = {}
+        self.graph = OrderedDict()  # for LRU
+
         self.mempool = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
         self.max_batch_size = max_batch_size
         self.graph_max_len_in_batch = max_len_in_batch
         self.args = get_env_start_args()
         self.enable_decode_microbatch_overlap = self.args.enable_decode_microbatch_overlap
+        self.max_graph_pool_size = self.args.max_graph_pool_size
 
         # gen cuda graph batch_sizes
         # cuda graph gen for batch size = [1, 2, 3, ..., graph_split_batch_size]
@@ -47,12 +49,21 @@ def __init__(self, max_batch_size=8, max_len_in_batch=8192):
     def can_run(self, batch_size, max_len_in_batch):
         return batch_size <= self.max_batch_size and max_len_in_batch <= self.graph_max_len_in_batch
 
-    def need_capture(self, batch_size):
-        find_batch_size = self.find_closest_graph_batch_size(batch_size)
-        if find_batch_size is not None:
-            return find_batch_size not in self.graph
+    def get_graph(self, batch_size):
+        # We assume batch_size has already been adjusted to the closest supported graph batch size
+        # If the graph already exists, get it and move it to the most recently used position.
+        if batch_size in self.graph:
+            find_graph = self.graph.pop(batch_size)  # Dequeue the graph
+            self.graph[batch_size] = find_graph  # Enqueue the graph for LRU
+            return find_graph
         else:
-            assert False, "dead code"
+            return None
+
+    def evict_oldest_graph(self):
+        if self.graph:
+            oldest_batch_size, oldest_graph = self.graph.popitem(last=False)
+            del oldest_graph
+            logger.info(f"Evicted CUDA graph for batch size: {oldest_batch_size}")
 
     def find_closest_graph_batch_size(self, batch_size):
         index = bisect.bisect_left(self.cuda_graph_batch_sizes, batch_size)
@@ -64,6 +75,9 @@ def find_closest_graph_batch_size(self, batch_size):
 
     def _capture_decode(self, decode_func, input_ids: torch.Tensor, infer_state: InferStateInfo):
         dist_group: CustomProcessGroup = infer_state.dist_group
+        if len(self.graph) >= self.max_graph_pool_size:
+            self.evict_oldest_graph()
+
         graph_obj = torch.cuda.CUDAGraph()
         batch_size = input_ids.shape[0]
         infer_state.max_len_in_batch = self.graph_max_len_in_batch
@@ -84,6 +98,7 @@ def _capture_decode(self, decode_func, input_ids: torch.Tensor, infer_state: Inf
         with lightllm_capture_graph(dist_group):
             with torch.cuda.graph(graph_obj, pool=self.mempool):
                 model_output = decode_func(input_ids, infer_state)
+        # We assume batch_size has already been adjusted to the closest supported graph batch size
         self.graph[batch_size] = (graph_obj, input_ids, infer_state, model_output)
         graph_obj.replay()
         return model_output
@@ -97,6 +112,9 @@ def _capture_decode_overlap(
         infer_state1: InferStateInfo,
     ):
         dist_group: CustomProcessGroup = infer_state.dist_group
+        if len(self.graph) >= self.max_graph_pool_size:
+            self.evict_oldest_graph()
+
         dist_group1 = infer_state1.dist_group
         graph_obj = torch.cuda.CUDAGraph()
         batch_size = input_ids.shape[0]
@@ -113,6 +131,7 @@ def _capture_decode_overlap(
             with lightllm_capture_graph(dist_group):
                 with torch.cuda.graph(graph_obj, pool=self.mempool):
                     model_output, model_output1 = decode_func(input_ids, infer_state, input_ids1, infer_state1)
+        # We assume batch_size has already been adjusted to the closest supported graph batch size
         self.graph[batch_size] = (
             graph_obj,
             input_ids,

diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
@@ -335,6 +335,13 @@ def make_argument_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--disable_cudagraph", action="store_true", help="Disable the cudagraph of the decoding stage")
 
+    parser.add_argument(
+        "--max_graph_pool_size",
+        type=int,
+        default=16,
+        help="""Maximum cuda graph pool size for decoding stage.""",
+    )
+
     parser.add_argument(
         "--graph_max_batch_size",
         type=int,

diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
@@ -77,6 +77,7 @@ class StartArgs:
     visual_nccl_ports: List[int] = field(default_factory=lambda: [29500])
     enable_monitor_auth: bool = field(default=False)
     disable_cudagraph: bool = field(default=False)
+    max_graph_pool_size: int = field(default=16)
     graph_max_batch_size: int = field(default=256)
     graph_split_batch_size: int = field(default=32)
     graph_grow_step_size: int = field(default=16)