|
1 | 1 | import re
|
2 | 2 | import os
|
3 |
| -import json |
4 | 3 | import torch
|
5 | 4 | import torch.distributed as dist
|
6 | 5 | from typing import List, Union
|
|
13 | 12 | from lightllm.utils.envs_utils import get_unique_server_name, get_env_start_args
|
14 | 13 | from lightllm.distributed.pynccl import PyNcclCommunicator
|
15 | 14 | from lightllm.utils.dist_utils import get_current_device_id
|
16 |
| -from lightllm.utils.envs_utils import get_kv_quant_calibration_inference_count |
17 |
| -from lightllm.utils.envs_utils import get_kv_quant_calibration_warmup_count |
18 |
| -from lightllm.utils.dist_utils import get_global_rank |
19 |
| -from lightllm.utils.config_utils import get_model_architectures |
20 | 15 |
|
21 | 16 | logger = init_logger(__name__)
|
22 | 17 |
|
23 | 18 |
|
24 |
class OfflineFP8QuantManager:
    """Manages offline FP8 (float8_e4m3fn) quantization scales for the KV cache.

    Operates in one of two modes, selected by the server start args:

    * export (``export_kv_quant_calibration``): accumulate per-layer abs-max
      statistics of the KV buffer during inference and dump the resulting
      scales to ``./kv_cache_calib.json`` (global rank 0 only);
    * load (``kv_quant_calibration_config_path``): read a previously exported
      calibration config, validate it against the current model, and
      materialize the per-layer (and, for the fa3 backend, per-head) scale
      tensor on the GPU.
    """

    def __init__(self, layer_num, head_num):
        # Hoisted: the parsed start args are consulted many times below.
        args = get_env_start_args()
        self.qmin = torch.finfo(torch.float8_e4m3fn).min
        self.qmax = torch.finfo(torch.float8_e4m3fn).max
        self.model_arch = get_model_architectures(args.model_dir)
        self.layer_num = layer_num
        self.head_num = head_num
        # Head count across all TP ranks; exported configs store the global count.
        self.total_head_num = head_num * dist.get_world_size() if dist.is_initialized() else head_num
        # fa3 quantizes per head (K and V halves stacked on dim -1);
        # the flashinfer path keeps a single scale each for K and V per layer.
        self.scales_shape = [layer_num, 2 * head_num] if args.enable_fa3 else [layer_num, 2]
        self.scales = None
        self.scales_list = []
        self.abs_max = None
        self.warmup_counts = get_kv_quant_calibration_warmup_count()
        self.inference_counts = get_kv_quant_calibration_inference_count()
        self.count = 0
        self.enable_calib = False
        if args.export_kv_quant_calibration:
            self.abs_max = torch.zeros(self.scales_shape, dtype=torch.float32, device="cuda")
        elif args.kv_quant_calibration_config_path is not None:
            logger.info(
                f"kv_quant_calibration_config_path {args.kv_quant_calibration_config_path} is set, "
                "will load kv quant calibration config"
            )
            if os.path.exists(args.kv_quant_calibration_config_path):
                with open(args.kv_quant_calibration_config_path, "r") as f:
                    cfg = json.load(f)

                # Reject configs produced for a different model, topology or backend.
                if cfg["architectures"] != self.model_arch:
                    raise ValueError(
                        f"architectures {cfg['architectures']} in config "
                        f"not match current model_arch {self.model_arch}"
                    )
                if cfg["num_layers"] != layer_num:
                    raise ValueError(
                        f"num_layers {cfg['num_layers']} in config " f"not match current layer_num {layer_num}"
                    )
                if cfg["num_head"] != self.total_head_num:
                    raise ValueError(
                        f"num_head {cfg['num_head']} in config "
                        f"not match current model head num {self.total_head_num}"
                    )
                if args.enable_fa3:
                    if cfg["quant_type"] != "per_head":
                        # BUGFIX: message previously interpolated cfg['num_head'],
                        # printing the head count instead of the offending quant type.
                        raise ValueError(f"quant type {cfg['quant_type']} in config not match fa3 backend")
                else:
                    if cfg["quant_type"] != "per_tensor":
                        raise ValueError(f"quant type {cfg['quant_type']} in config not match flashinfer backend")

                # Loaded configs override the float8 defaults set above.
                self.qmin = cfg["qmin"]
                self.qmax = cfg["qmax"]
                self.scales_shape = cfg["scales_shape"]

                full_scales_list = cfg["scales"]
                self.scales_list = full_scales_list
                self.scales = torch.tensor(self.scales_list, dtype=torch.float32, device="cuda").view(self.scales_shape)
                if not args.enable_fa3:
                    # Expand the per-tensor K/V scales to one value per head.
                    self.scales = torch.repeat_interleave(self.scales, self.head_num, dim=-1)
                if args.enable_fa3 and dist.is_initialized() and dist.get_world_size() > 1:
                    # Slice this rank's K and V head scales out of the global table.
                    half_head = self.total_head_num // 2
                    start_head = dist.get_rank() * head_num
                    end_head = start_head + head_num
                    k_scales = self.scales[:, start_head:end_head].contiguous()
                    v_scales = self.scales[:, start_head + half_head : end_head + half_head].contiguous()
                    current_scales = torch.cat((k_scales, v_scales), dim=-1)

                    self.scales_list = current_scales.tolist()
                    self.scales = current_scales
            else:
                raise FileNotFoundError(
                    f"kv_quant_calibration_config {args.kv_quant_calibration_config_path} not found"
                )
        elif "calibration_fp8kv" in args.mode:
            logger.warning("scales is None, no kv_quant_calibration_config_path be set")

    def enable_calibration(self):
        """Turn on collection of KV-cache calibration statistics.

        Requires cudagraph to be disabled, since calibration hooks into the
        per-layer forward path.
        """
        assert get_env_start_args().disable_cudagraph, "Calibration is not supported in cudagraph mode"
        logger.info("Enable kv cache calibration, will collect kv cache data for quantization calibration")
        self.enable_calib = True

    def update_calibration_data(self, kv_buffer: torch.Tensor, layer_index: int):
        """Fold one layer's KV buffer into the running abs-max statistics.

        The first ``warmup_counts`` full model passes are ignored; the next
        ``inference_counts`` passes update ``abs_max``. After the final layer
        of the final counted pass, scales are derived (all ranks) and rank 0
        exports them to disk.
        """
        if not self.enable_calib or self.count >= self.warmup_counts + self.inference_counts:
            return

        if self.abs_max is not None and self.count >= self.warmup_counts:
            if get_env_start_args().enable_fa3:
                # Per-head abs-max over tokens (dim 0) and head_dim (dim 2).
                kv_max = kv_buffer.abs().amax(dim=(0, 2)).to(torch.float32)
            else:
                # NOTE(review): relies on amax(dim=()) performing a full
                # reduction to a scalar — behavior of an empty dim tuple has
                # varied across torch versions; confirm on the pinned version.
                k_max = kv_buffer[:, : self.head_num, :].abs().amax(dim=()).to(torch.float32)
                v_max = kv_buffer[:, self.head_num :, :].abs().amax(dim=()).to(torch.float32)
                kv_max = torch.tensor([k_max, v_max], device="cuda", dtype=torch.float32)
            self.abs_max[layer_index] = torch.maximum(self.abs_max[layer_index], kv_max)
            if self.count == self.warmup_counts + self.inference_counts - 1 and layer_index == self.layer_num - 1:
                final_abs_max = self.abs_max
                if dist.is_initialized() and dist.get_world_size() > 1:
                    if get_env_start_args().enable_fa3:
                        # Gather every rank's per-head K/V maxima into the
                        # global [K heads | V heads] layout used by the config.
                        k_max, v_max = torch.chunk(self.abs_max, 2, dim=-1)
                        k_max = k_max.contiguous()
                        v_max = v_max.contiguous()
                        gathered_k_max = [torch.zeros_like(k_max) for _ in range(dist.get_world_size())]
                        gathered_v_max = [torch.zeros_like(v_max) for _ in range(dist.get_world_size())]
                        dist.all_gather(gathered_k_max, k_max, group=None, async_op=False)
                        dist.all_gather(gathered_v_max, v_max, group=None, async_op=False)
                        k_max = torch.cat(gathered_k_max, dim=-1)
                        v_max = torch.cat(gathered_v_max, dim=-1)
                        final_abs_max = torch.cat((k_max, v_max), dim=-1)
                    else:
                        # Per-tensor stats are identical in shape across ranks:
                        # an in-place MAX reduction suffices.
                        dist.all_reduce(self.abs_max, op=dist.ReduceOp.MAX, group=None, async_op=False)

                self.scales = final_abs_max / self.qmax
                # Guard against zero scales (all-zero channels) to avoid div-by-zero at dequant.
                self.scales = torch.where(self.scales > 0, self.scales, torch.ones_like(self.scales))

                if get_global_rank() == 0:
                    self.abs_max = final_abs_max
                    self._export_calibration_data()

        # A full model pass completes when the last layer reports in.
        if layer_index == self.layer_num - 1:
            self.count += 1

    def _export_calibration_data(self):
        """Write the collected scales and model metadata to ./kv_cache_calib.json."""
        cfg = {
            "version": "1.0",
            "architectures": self.model_arch,
            "quant_type": "per_head" if get_env_start_args().enable_fa3 else "per_tensor",
            "qmin": self.qmin,
            "qmax": self.qmax,
            "num_layers": self.layer_num,
            "num_head": self.total_head_num,
            "scales_shape": list(self.abs_max.shape),
            "scales": self.scales.cpu().numpy().tolist(),
        }
        with open("./kv_cache_calib.json", "w") as f:
            json.dump(cfg, f, indent=4)
        logger.info(
            f"Export kv cache calibration data to kv_cache_calib.json, "
            f"architectures: {self.model_arch}, "
            f"qmin: {self.qmin}, qmax: {self.qmax}, "
            f"total heads: {self.total_head_num}, "
            f"scales_shape: {list(self.abs_max.shape)}, "
        )
| - |
165 | 19 | class MemoryManager:
|
166 | 20 | def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False, mem_fraction=0.9):
|
167 | 21 | self.size = size
|
@@ -198,7 +52,6 @@ def __init__(self, size, dtype, head_num, head_dim, layer_num, always_copy=False
|
198 | 52 | layer_num,
|
199 | 53 | )
|
200 | 54 | self.HOLD_TOKEN_MEMINDEX = self.size
|
201 |
| - self.offline_fp8_quant_manager = OfflineFP8QuantManager(layer_num, head_num) |
202 | 55 |
|
def get_cell_size(self):
    """Return the KV-cache storage cost of one token slot, in bytes.

    Counts two tensors (the factor of 2) of head_num x head_dim elements
    for every layer, at the element width of the configured dtype.
    """
    bytes_per_elem = torch._utils._element_size(self.dtype)
    elems_per_token = 2 * self.head_num * self.head_dim * self.layer_num
    return elems_per_token * bytes_per_elem
|
|
0 commit comments