diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..d16e9335b --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third-party/cutlass"] + path = third-party/cutlass + url = https://github.com/NVIDIA/cutlass.git diff --git a/docs/CN/source/getting_started/multimodal_model_quickstart.rst b/docs/CN/source/getting_started/multimodal_model_quickstart.rst new file mode 100644 index 000000000..cc3eaf724 --- /dev/null +++ b/docs/CN/source/getting_started/multimodal_model_quickstart.rst @@ -0,0 +1,11 @@ +..multimodal_model_quickstart.rst +------------------------- + +下载多模态模型(如llava系列、internvl系列、qwen_vl系列等)的模型以后,在终端使用下面的代码部署API服务: + +.. code-block:: console + + $ python -m lightllm.server.api_server --model_dir ~/models/llava-7b-chat --use_dynamic_prompt_cache --enable_multimodal + +.. note:: + 上面代码中的 ``--model_dir`` 参数需要修改为你本机实际的模型路径。 diff --git a/lightllm-kernel/CMakeLists.txt b/lightllm-kernel/CMakeLists.txt new file mode 100644 index 000000000..25a9855b6 --- /dev/null +++ b/lightllm-kernel/CMakeLists.txt @@ -0,0 +1,65 @@ +cmake_minimum_required(VERSION 3.22) +project(lightllm_kernel LANGUAGES CXX CUDA) + +# GPU 架构:缺省支持 A100(80)、Ampere(86)、Ada/L40s/4090(89)、Hopper(90), +if(NOT CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 80;86;89;90) +endif() + +# 找 PyTorch & Python +find_package(Torch REQUIRED) +find_package(Python REQUIRED COMPONENTS Development) +find_package(CUDAToolkit REQUIRED) + +# 收集 csrc 下的 .cpp/.cu +file(GLOB_RECURSE SRC_CPP CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cpp") +file(GLOB_RECURSE SRC_CUDA CONFIGURE_DEPENDS "${PROJECT_SOURCE_DIR}/csrc/*.cu") + +# 编译生成 Python 扩展, _C.so +if (NOT TARGET _C) + add_library(_C SHARED ${SRC_CPP} ${SRC_CUDA}) + + # C++17 更方便调度宏 + target_compile_features(_C PRIVATE cxx_std_17) + target_include_directories(_C PRIVATE + ${TORCH_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS} + ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/csrc + ${PROJECT_SOURCE_DIR}/../third-party/cutlass/include + ) + target_link_libraries(_C + PRIVATE + ${TORCH_LIBRARIES} + Python::Python + CUDA::cudart + CUDA::cuda_driver) + + + # 输出文件名 _C.so,无前缀 + set_target_properties(_C PROPERTIES + PREFIX "" + OUTPUT_NAME "_C" + BUILD_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib" + INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../torch/lib" + ) +endif() +# 安装:把 _C.so、Python 包和 csrc 一起拷到 site-packages +include(GNUInstallDirs) + +# 1) 计算 Python site-packages 路径 + +message(STATUS "Installing to ARCH = ${Python_SITEARCH}") +message(STATUS "Installing to PURE = ${Python_SITELIB}") + +# 2) 安装编译好的 _C.so 到 lightllm_kernel 目录 +install(TARGETS _C + LIBRARY DESTINATION ${Python_SITEARCH}/lightllm_kernel) + +# 3) 安装 Python 源码包 +install(DIRECTORY ${PROJECT_SOURCE_DIR}/lightllm_kernel + DESTINATION ${Python_SITELIB}) + +# 4) 安装 csrc 源码以供 JIT fallback +install(DIRECTORY ${PROJECT_SOURCE_DIR}/csrc + DESTINATION ${Python_SITELIB}/lightllm_kernel) diff --git a/lightllm-kernel/LICENSE b/lightllm-kernel/LICENSE new file mode 100644 index 000000000..7a4a3ea24 --- /dev/null +++ b/lightllm-kernel/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/lightllm-kernel/Makefile b/lightllm-kernel/Makefile new file mode 100644 index 000000000..5b7100bb6 --- /dev/null +++ b/lightllm-kernel/Makefile @@ -0,0 +1,14 @@ +.PHONY: build clean submodule + +SUBMODULE_DIR = third-party/cutlass + +submodule: + git submodule update --init --recursive + +build: submodule + # 8.0-> A100, 8.6-> A10, 8.9-> L40s/4090, 9.0+PTX-> Hopper + TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0+PTX" \ + python -m pip install -v . + +clean: + rm -rf build dist *.egg-info \ No newline at end of file diff --git a/lightllm-kernel/README-CH.md b/lightllm-kernel/README-CH.md new file mode 100644 index 000000000..647a594b8 --- /dev/null +++ b/lightllm-kernel/README-CH.md @@ -0,0 +1,42 @@ +# LightLLM-Kernel + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +lightllm-kernel 是大模型推理系统 LightLLM 的 CUDA 算子库。它提供了在大型模型推理过程中所需的一系列自定义 GPU 运算算子,以加速关键步骤的计算。 + +## 功能列表 + +| Module | Description | +|--------------|-------------------------------------------------------------------------------------------------| +| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax | +| **MoE** | Expert routing and computation kernels for Mixture-of-Experts architectures | +| **Quant** | Low-precision quantization support (INT8/INT4) for weights and activations | +| **Extensions**| Continuous expansion of optimized operations for emerging model architectures | + +## 安装方法 + +lightllm_kernel 提供了静态编译以及JIT(Just-In-Time)动态编译的安装方式。推荐使用静态编译安装以获得最佳性能,同时也支持开发者使用可编辑安装进行开发调试。 + +### System Requirements +- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+) +- CUDA 11.8 or higher +- Python 3.8+ + +### Installation Methods + +#### Static Compilation (Recommended) +```bash +git clone https://github.com/YourUsername/lightllm_kernel.git +cd lightllm_kernel +make build +# Alternative using pip +pip install . +``` + +## 贡献指南 +欢迎社区开发者为 lightllm_kernel 做出贡献!如果您计划新增自定义算子或改进现有功能,请参考以下指南: +- 新增算子实现:在 csrc/ 目录下添加您的 CUDA/C++ 源码文件,添加时建议参考现有算子的代码风格和结构。 +- 注册Python接口:在 csrc/ops_bindings.cpp中,将新增的算子通过 PyBind11 或 TORCH_LIBRARY 等机制注册到 Python 接口。 +- 导出算子到Python模块:在lightllm_kernel/ops/__init__.py只添加相应的导出代码,使新算子包含在 lightllm_kernel.ops 模块中。 +- 本地测试:开发完成后,请在本地对您的更改进行测试。您可以编译安装新的版本并编写简单的脚本调用新算子,检查其功能和性能是否符合预期。如果项目附带了测试用例,也请运行所有测试确保不引入回归。 +- \ No newline at end of file diff --git a/lightllm-kernel/README.md b/lightllm-kernel/README.md new file mode 100644 index 000000000..9ce4bce41 --- /dev/null +++ b/lightllm-kernel/README.md @@ -0,0 +1,39 @@ +# LightLLM-Kernel + +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) + +LightLLM-Kernel is a high-performance CUDA kernel library powering the LightLLM inference system. It provides optimized GPU implementations for critical operations in large language model (LLM) inference, delivering significant performance improvements through carefully crafted CUDA kernels. 
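+
+A minimal usage sketch is shown below; it simply mirrors how `rmsnorm_bf16` is exercised in `benchmark/bench_rms_norm.py` and assumes the package has already been installed as described under Installation:
+
+```python
+import torch
+from lightllm_kernel.ops import rmsnorm_bf16
+
+# bf16 activations and weight on GPU, using the same shapes as the benchmark script
+x = torch.randn(64, 1024, device="cuda", dtype=torch.bfloat16)
+w = torch.randn(1024, device="cuda", dtype=torch.bfloat16)
+
+y = rmsnorm_bf16(x, w, 1e-6)  # RMSNorm over the last dimension, matching the torch reference in the benchmark
+```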
+ +## Project Overview + +LightLLM-Kernel serves as the computational backbone for LightLLM framework, offering: +- **Custom CUDA Kernels**: Highly optimized implementations for transformer-based model operations +- **Memory Efficiency**: Reduced memory footprint through advanced quantization techniques +- **Scalability**: Support for large model architectures including MoE (Mixture-of-Experts) models + +## Key Features + +### Core Modules +| Module | Description | +|--------------|-------------------------------------------------------------------------------------------------| +| **Attention** | Optimized Multi-Head Attention kernels with fused QKV operations and efficient softmax | +| **MoE** | Expert routing and computation kernels for Mixture-of-Experts architectures | +| **Quant** | Low-precision quantization support (INT8/INT4) for weights and activations | +| **Extensions**| Continuous expansion of optimized operations for emerging model architectures | + +## Installation + +### System Requirements +- NVIDIA GPU with Compute Capability ≥ 7.0 (Volta+) +- CUDA 11.8 or higher +- Python 3.8+ + +### Installation Methods + +#### Static Compilation (Recommended) +```bash +git clone https://github.com/YourUsername/lightllm_kernel.git +cd lightllm_kernel +make build +# Alternative using pip +pip install . \ No newline at end of file diff --git a/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py b/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py new file mode 100644 index 000000000..cd2eb291f --- /dev/null +++ b/lightllm-kernel/benchmark/bench_quant_per_token_bf16_fp8.py @@ -0,0 +1,71 @@ +import time +import torch +import itertools +from typing import Optional, Tuple +from vllm import _custom_ops as ops +from sgl_kernel import sgl_per_token_quant_fp8 + +try: + from lightllm_kernel.ops import per_token_quant_bf16_fp8 +except ImportError: + raise ImportError("lightllm-kernel op per_token_quant_bf16_fp8 not found.") + +fp8_type_ = torch.float8_e4m3fn + + +def vllm_per_token_quant_fp8( + input: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + return ops.scaled_fp8_quant(input, use_per_token_if_dynamic=True) + + +def sglang_per_token_quant_fp8( + input: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + scale = torch.zeros(input.size(0), device=input.device, dtype=torch.float32) + output = torch.empty_like(input, device=input.device, dtype=fp8_type_) + sgl_per_token_quant_fp8(input, output, scale) + + return output, scale + + +def lightllm_per_token_quant_fp8( + input: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + return per_token_quant_bf16_fp8(input) + + +def dequantize(q: torch.Tensor, scale: torch.Tensor): + return q.to(torch.bfloat16) * scale.view(-1, *((1,) * (q.dim() - 1))) + + +def benchmark(fn, name, inp, iterations=200): + for _ in range(20): + q, s = fn(inp) + torch.cuda.synchronize() + + starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) + starter.record() + for _ in range(iterations): + q, s = fn(inp) + ender.record() + torch.cuda.synchronize() + avg_ms = starter.elapsed_time(ender) / iterations + + q, s = fn(inp) + recon = dequantize(q, s) + err = recon - inp.to(torch.bfloat16) + mse = err.pow(2).mean().item() + max_err = err.abs().max().item() + + print(f"{name:20s} | latency: {avg_ms:7.3f} ms | MSE: {mse:.3e} | MaxErr: {max_err:.3e}") + + +if __name__ == "__main__": + batch, seq_len = 64, 4096 + device = "cuda" + inp = torch.randn(batch, seq_len, device=device, dtype=torch.bfloat16) + + 
benchmark(vllm_per_token_quant_fp8, "vllm_ops", inp) + benchmark(sglang_per_token_quant_fp8, "sgl_kernel", inp) + benchmark(lightllm_per_token_quant_fp8, "lightllm_kernel", inp) diff --git a/lightllm-kernel/benchmark/bench_rms_norm.py b/lightllm-kernel/benchmark/bench_rms_norm.py new file mode 100644 index 000000000..c591c53cb --- /dev/null +++ b/lightllm-kernel/benchmark/bench_rms_norm.py @@ -0,0 +1,78 @@ +import time +import torch +from typing import Optional, Tuple, Union + +from vllm import _custom_ops as vllm_ops +from lightllm_kernel.ops import rmsnorm_bf16 as lightllm_rms_norm +from lightllm.models.vit.triton_kernel.rms_norm_vit import rms_norm as triton_rms_norm + + +def vllm_rmsnorm( + x: torch.Tensor, + weight: torch.Tensor, + eps: float = 1e-6, + residual: Optional[torch.Tensor] = None, +): + orig_shape = x.shape + x = x.view(-1, x.shape[-1]) + if residual is not None: + residual = residual.view(-1, residual.shape[-1]) + + if residual is not None: + vllm_ops.fused_add_rms_norm(x, residual, weight, eps) + output = (x, residual) + else: + out = torch.empty_like(x) + vllm_ops.rms_norm(out, x, weight, eps) + output = out + + if isinstance(output, tuple): + output = (output[0].view(orig_shape), output[1].view(orig_shape)) + else: + output = output.view(orig_shape) + return output + + +def torch_rmsnorm(x: torch.Tensor, w: torch.Tensor, eps: float): + mean_sq = x.pow(2).mean(dim=-1, keepdim=True) + inv_std = torch.rsqrt(mean_sq + eps) + out = x * inv_std * w + return out + + +def benchmark(fn, name, x, w, eps, iterations=200): + for _ in range(10): + _ = fn(x, w, eps) + torch.cuda.synchronize() + + starter = torch.cuda.Event(enable_timing=True) + ender = torch.cuda.Event(enable_timing=True) + starter.record() + for _ in range(iterations): + _ = fn(x, w, eps) + ender.record() + torch.cuda.synchronize() + latency_ms = starter.elapsed_time(ender) / iterations + + y_ref = torch_rmsnorm(x, w, eps) + y_out = fn(x, w, eps) + err = y_out - y_ref + mse = err.pow(2).mean().item() + max_err = err.abs().max().item() + + print(f"{name:20s} | latency: {latency_ms:7.3f} ms | MSE: {mse:.3e} | MaxErr: {max_err:.3e}") + + +if __name__ == "__main__": + + batch, dim = 64, 1024 + eps = 1e-6 + device = "cuda" + + x = torch.randn(batch, dim, device=device, dtype=torch.bfloat16) + w = torch.randn(dim, device=device, dtype=torch.bfloat16) + + benchmark(torch_rmsnorm, "torch_rmsnorm", x, w, eps) + benchmark(lightllm_rms_norm, "lightllm_rms_norm", x, w, eps) + benchmark(triton_rms_norm, "triton_rms_norm", x, w, eps) + benchmark(vllm_rmsnorm, "vllm_rmsnorm", x, w, eps) diff --git a/lightllm-kernel/benchmark/bench_tp_norm.py b/lightllm-kernel/benchmark/bench_tp_norm.py new file mode 100644 index 000000000..53599ebb3 --- /dev/null +++ b/lightllm-kernel/benchmark/bench_tp_norm.py @@ -0,0 +1,86 @@ +# bench_tp_norm_tp4.py +import os +import torch +import torch.distributed as dist +from types import SimpleNamespace + +from lightllm_kernel.ops import ( + rmsnorm_bf16, + pre_tp_norm_bf16, + post_tp_norm_bf16, +) + + +def init_dist(): + dist.init_process_group("nccl", init_method="env://") + rank = int(os.environ["LOCAL_RANK"]) + torch.cuda.set_device(rank) + return rank, dist.get_world_size() + + +def tp_norm_cuda(x, w, cfg): + if cfg.tp_world == 1: + return rmsnorm_bf16(x, w, cfg.eps) + + var_local = pre_tp_norm_bf16(x) + dist.all_reduce(var_local, op=dist.ReduceOp.SUM) + return post_tp_norm_bf16(x, w, var_local, cfg.global_embed, cfg.eps) + + +def tp_norm_ref(x, w, cfg): + x32 = x.to(torch.float32) + var = 
x32.pow(2).sum(-1, keepdim=True) + if cfg.tp_world > 1: + dist.all_reduce(var, op=dist.ReduceOp.SUM) + x32 = x32 * torch.rsqrt(var / cfg.global_embed + cfg.eps) + return (w.to(torch.float32) * x32).to(x.dtype) + + +def bench(fn, tag, x, w, cfg, iters=200): + for _ in range(20): + fn(x, w, cfg) + torch.cuda.synchronize() + t0 = torch.cuda.Event(True) + t1 = torch.cuda.Event(True) + t0.record() + for _ in range(iters): + fn(x, w, cfg) + t1.record() + torch.cuda.synchronize() + ms = t0.elapsed_time(t1) / iters + + ref = tp_norm_ref(x, w, cfg).to(torch.float32) + out = fn(x, w, cfg).to(torch.float32) + mse = (out - ref).pow(2).mean().item() + err = (out - ref).abs().max().item() + + if dist.get_rank() == 0: + print(f"{tag:18s}| {ms:6.3f} ms | MSE {mse:.3e} | MaxErr {err:.3e}") + + +if __name__ == "__main__": + rank, world = init_dist() + + tp_world = 4 + pad_heads, dim_h = 32, 1024 + local_embed = pad_heads * dim_h + global_embed = local_embed * tp_world + tokens = 2048 + eps = 1e-6 + + x = torch.randn(tokens, local_embed, device=f"cuda:{rank}", dtype=torch.bfloat16) + w = torch.randn(local_embed, device=f"cuda:{rank}", dtype=torch.bfloat16) + + cfg = SimpleNamespace(tp_world=tp_world, global_embed=global_embed, eps=eps) + + if rank == 0: + print( + f"tp={tp_world}, tokens={tokens}, local_embed={local_embed}, " f"global_embed={global_embed}, dtype=bf16\n" + ) + dist.barrier() + + bench(tp_norm_ref, "torch_ref", x, w, cfg) + bench(tp_norm_cuda, "cuda_kernel", x, w, cfg) + + dist.destroy_process_group() +# python -m torch.distributed.run --nproc_per_node=4 bench_tp_norm.py diff --git a/lightllm-kernel/csrc/allgather/all_gather.cu b/lightllm-kernel/csrc/allgather/all_gather.cu new file mode 100644 index 000000000..56e4a863d --- /dev/null +++ b/lightllm-kernel/csrc/allgather/all_gather.cu @@ -0,0 +1,150 @@ +#include +#include +#include +#include + +#include "ops_common.h" +#include "all_gather.cuh" + +namespace lightllm { +namespace ops { +// Fake pointer type, must match fptr_t type in ops.h. +// We use this type alias to indicate when pointers are passed in as int64_t. +using fptr_t = int64_t; +static_assert(sizeof(void*) == sizeof(fptr_t)); + +fptr_t init_custom_gather_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, + bool full_nvlink) { + int world_size = fake_ipc_ptrs.size(); + if (world_size > 8) + throw std::invalid_argument("world size > 8 is not supported"); + if (world_size % 2 != 0) + throw std::invalid_argument("Odd num gpus is not supported for now"); + if (rank < 0 || rank >= world_size) + throw std::invalid_argument("invalid rank passed in"); + + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < world_size; i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + return (fptr_t) new vllm::CustomAllgather(ipc_ptrs, rank_data.data_ptr(), + rank_data.numel(), rank, world_size, + full_nvlink); +} + +/** + * Make sure tensor t's data lies completely within ((char)t.data_ptr()) + + * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous() + * because it allows transpose of contiguous slice (i.e. slicing the first + * dimension). Currently, we require this because stride information is not + * passed into the kernels and we treat input tensors as flat. + * + * Examples + * A = torch.zeros(3, 3, 3) + * 1. A: OK + * 2. A[1:]: OK + * 3. A.permute(2, 0, 1): OK + * 4. A[1:].permute(2, 0, 1): OK + * 5. A[None].expand(2, -1, -1, -1): Not OK + * 6. 
A[:, 1:, 1:]: Not OK + */ +bool _is_weak_contiguous_gather(torch::Tensor& t) { + return t.is_contiguous() || + (t.storage().nbytes() - t.storage_offset() * t.element_size() == + t.numel() * t.element_size()); +} + +/** + * Performs an out-of-place allgather and stores result in out. + * + * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. + * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first + * copied into _reg_buffer. + */ +void all_gather(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + + fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { + auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK(_is_weak_contiguous_gather(out)); + TORCH_CHECK(_is_weak_contiguous_gather(inp)); + auto input_size = inp.numel() * inp.element_size(); + auto reg_buffer = reinterpret_cast(_reg_buffer); + if (reg_buffer) { + TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, + cudaMemcpyDeviceToDevice, stream)); + } else { + reg_buffer = inp.data_ptr(); + } + switch (out.scalar_type()) { + case at::ScalarType::Float: { + fa->allgather(stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), + inp.numel()); + break; + } + case at::ScalarType::Half: { + fa->allgather(stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), inp.numel()); + break; + } +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + case at::ScalarType::BFloat16: { + fa->allgather( + stream, reinterpret_cast(reg_buffer), + reinterpret_cast(out.data_ptr()), inp.numel()); + break; + } +#endif + default: + throw std::runtime_error( + "custom allgather only supports float32, float16 and bfloat16"); + } +} + +void allgather_dispose(fptr_t _fa) { + delete reinterpret_cast(_fa); +} + +int64_t meta_size() { return sizeof(vllm::Signal); } + +void allgather_register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { + auto fa = reinterpret_cast(_fa); + TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_); + void* ipc_ptrs[8]; + for (int i = 0; i < fake_ipc_ptrs.size(); i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + fa->register_buffer(ipc_ptrs); +} + +// Use vector to represent byte data for python binding compatibility. +std::tuple, std::vector> +allgather_get_graph_buffer_ipc_meta(fptr_t _fa) { + auto fa = reinterpret_cast(_fa); + auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); + std::vector bytes(handle.begin(), handle.end()); + return std::make_tuple(bytes, offsets); +} + +// Use vector to represent byte data for python binding compatibility. 
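+// Each inner vector carries one rank's concatenated cudaIpcMemHandle_t bytes; they are re-packed
+// into std::string objects before being forwarded to CustomAllgather::register_graph_buffers.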
+void allgather_register_graph_buffers(fptr_t _fa, + const std::vector>& handles, + const std::vector>& offsets) { + auto fa = reinterpret_cast(_fa); + std::vector bytes; + bytes.reserve(handles.size()); + for (int i = 0; i < handles.size(); i++) { + bytes.emplace_back(handles[i].begin(), handles[i].end()); + } + bytes.reserve(handles.size()); + fa->register_graph_buffers(bytes, offsets); +} + + } // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/allgather/all_gather.cuh b/lightllm-kernel/csrc/allgather/all_gather.cuh new file mode 100644 index 000000000..99cb579be --- /dev/null +++ b/lightllm-kernel/csrc/allgather/all_gather.cuh @@ -0,0 +1,287 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include "all_reduce.cuh" + +// #define CUDACHECK(cmd) \ +// do { \ +// cudaError_t e = cmd; \ +// if (e != cudaSuccess) { \ +// printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ +// cudaGetErrorString(e)); \ +// exit(EXIT_FAILURE); \ +// } \ +// } while (0) + +namespace vllm { + +// use packed type to maximize memory efficiency +// goal: generate ld.128 and st.128 instructions +template +struct gather_packed_t { + // the (P)acked type for load/store + using P = array_t; +}; + +template +__global__ void __launch_bounds__(512, 1) + custom_all_gather_kernel(RankData* _dp, RankSignals sg, Signal* self_sg, + T* __restrict__ result, int rank, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename gather_packed_t::P; + multi_gpu_barrier(sg, self_sg, rank); + for (int idx = tid; idx < size; idx += stride) { + #pragma unroll + for (int step = 0; step < ngpus; step ++) { + int src_rank = (rank - step + ngpus) % ngpus; // 当前步骤中数据来源的进程 + P* ptr = (P*)_dp->ptrs[src_rank]; + int dst_offset = src_rank * size; // 数据在 recv_buf 中的存储位置 + // 从 src_rank 的 handle 中读取数据,并存储到 recv_buf + int dst_idx = dst_offset + idx; + ((P*)result)[dst_idx] = ptr[idx]; + } + } + multi_gpu_barrier(sg, self_sg, rank); + +} + +using IPC_KEY = std::array; +static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t)); +static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t)); + +class CustomAllgather { + public: + int rank_; + int world_size_; + bool full_nvlink_; + + RankSignals sg_; + // Stores an map from a pointer to its peer pointters from all ranks. + std::unordered_map buffers_; + Signal* self_sg_; + + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph capture + // time. Therefore, during capture, we increment the rank data pointer and use + // that as the argument to the kernel. The kernel arguments are stored in + // graph_unreg_buffers_. The actual peer pointers will be filled in at the + // memory pointed to by the pointers in graph_unreg_buffers_ when + // the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each addresses used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. 
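+  // d_rank_data_base_ advances through the rank_data arena as buffers are registered;
+  // d_rank_data_end_ marks its capacity (checked in check_rank_data_capacity).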
+ RankData *d_rank_data_base_, *d_rank_data_end_; + std::vector graph_unreg_buffers_; + // a map from IPC handles to opened IPC pointers + std::map ipc_handles_; + + /** + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffer, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allgather synchronization, and the second section + * is for storing the intermediate results required by some allgather algos. + * + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor. + */ + CustomAllgather(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool full_nvlink = true) + : rank_(rank), + world_size_(world_size), + full_nvlink_(full_nvlink), + self_sg_(signals[rank]), + d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { + for (int i = 0; i < world_size_; i++) { + sg_.signals[i] = signals[i]; + } + } + + char* open_ipc_handle(const void* ipc_handle) { + auto [it, new_handle] = + ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr}); + if (new_handle) { + char* ipc_ptr; + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr, + *((const cudaIpcMemHandle_t*)ipc_handle), + cudaIpcMemLazyEnablePeerAccess)); + it->second = ipc_ptr; + } + return it->second; + } + + std::pair> get_graph_buffer_ipc_meta() { + auto num_buffers = graph_unreg_buffers_.size(); + auto handle_sz = sizeof(cudaIpcMemHandle_t); + std::string handles(handle_sz * num_buffers, static_cast(0)); + std::vector offsets(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto ptr = graph_unreg_buffers_[i]; + void* base_ptr; + // note: must share the base address of each allocation, or we get wrong + // address + if (cuPointerGetAttribute(&base_ptr, + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + (CUdeviceptr)ptr) != CUDA_SUCCESS) + throw std::runtime_error("failed to get pointer attr"); + CUDACHECK(cudaIpcGetMemHandle( + (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr)); + offsets[i] = ((char*)ptr) - ((char*)base_ptr); + } + return std::make_pair(handles, offsets); + } + + void check_rank_data_capacity(size_t num = 1) { + if (d_rank_data_base_ + num > d_rank_data_end_) + throw std::runtime_error( + "Rank data buffer is overflowed by " + + std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); + } + + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { + check_rank_data_capacity(); + RankData data; + for (int i = 0; i < world_size_; i++) { + data.ptrs[i] = ptrs[i]; + } + auto d_data = d_rank_data_base_++; + CUDACHECK( + cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); + buffers_[ptrs[rank_]] = d_data; + } + + // Note: when registering graph buffers, we intentionally choose to not + // deduplicate the addresses. That means if the allocator reuses some + // addresses, they will be registered again. This is to account for the remote + // possibility of different allocation patterns between ranks. For example, + // rank 1 may get the same input address for the second allgather, but rank 2 + // got a different address. IPC handles have internal reference counting + // mechanism so overhead should be small. 
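+  // Opens each peer rank's IPC handle, builds one RankData entry per captured buffer, and copies
+  // the entries into the device-side rank data array before clearing graph_unreg_buffers_.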
+ void register_graph_buffers( + const std::vector& handles, + const std::vector>& offsets) { + auto num_buffers = graph_unreg_buffers_.size(); + check_rank_data_capacity(num_buffers); + std::vector rank_data(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto self_ptr = graph_unreg_buffers_[i]; + auto& rd = rank_data[i]; + for (int j = 0; j < world_size_; j++) { + if (j != rank_) { + char* handle = + open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); + handle += offsets[j][i]; + rd.ptrs[j] = handle; + } else { + rd.ptrs[j] = self_ptr; + } + } + } + CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), + sizeof(RankData) * num_buffers, + cudaMemcpyHostToDevice)); + d_rank_data_base_ += num_buffers; + graph_unreg_buffers_.clear(); + } + + /** + * Performs allgather, assuming input has already been registered. + * + * Block and grid default configs are results after careful grid search. Using + * 36 blocks give the best or close to the best runtime on the devices I + * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only + * take a small amount of SMs. Not quite sure the underlying reason, but my + * guess is that too many SMs will cause contention on NVLink bus. + */ + template + void allgather(cudaStream_t stream, T* input, T* output, int size, + int threads = 512, int block_limit = 36) { + auto d = gather_packed_t::P::size; + if (size % d != 0) + throw std::runtime_error( + "custom allgather currently requires input length to be multiple " + "of " + + std::to_string(d)); + if (block_limit > kMaxBlocks) + throw std::runtime_error("max supported block limit is " + + std::to_string(kMaxBlocks) + ". Got " + + std::to_string(block_limit)); + + RankData* ptrs; + cudaStreamCaptureStatus status; + CUDACHECK(cudaStreamIsCapturing(stream, &status)); + if (status == cudaStreamCaptureStatusActive) { + ptrs = d_rank_data_base_ + graph_unreg_buffers_.size(); + graph_unreg_buffers_.push_back(input); + } else { + auto it = buffers_.find(input); + if (it == buffers_.end()) + throw std::runtime_error( + "buffer address " + + std::to_string(reinterpret_cast(input)) + + " is not registered!"); + ptrs = it->second; + } + size /= d; + // auto bytes = size * sizeof(typename packed_t::P); + int blocks = std::min(block_limit, (size + threads - 1) / threads); +#define KL(ngpus, name) \ + name<<>>(ptrs, sg_, self_sg_, output, \ + rank_, size); + // TODO(hanzhi713): Threshold is different for A100 and H100. + // Add per device threshold. +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + KL(ngpus, custom_all_gather_kernel); \ + break; \ + } + + switch (world_size_) { + REDUCE_CASE(2) + REDUCE_CASE(4) + REDUCE_CASE(6) + REDUCE_CASE(8) + default: + throw std::runtime_error( + "custom allgather only supports num gpus in (2,4,6,8). 
Actual num " + "gpus = " + + std::to_string(world_size_)); + } +#undef REDUCE_CASE +#undef KL + } + + ~CustomAllgather() { + for (auto [_, ptr] : ipc_handles_) { + CUDACHECK(cudaIpcCloseMemHandle(ptr)); + } + } +}; +/** + * To inspect PTX/SASS, copy paste this header file to compiler explorer and add + a template instantiation: + * template void vllm::CustomAllgather::allgather(cudaStream_t, half *, + half *, int, int, int); +*/ +} // namespace vllm diff --git a/lightllm-kernel/csrc/allgather/all_reduce.cuh b/lightllm-kernel/csrc/allgather/all_reduce.cuh new file mode 100644 index 000000000..6be4d4f2b --- /dev/null +++ b/lightllm-kernel/csrc/allgather/all_reduce.cuh @@ -0,0 +1,516 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t e = cmd; \ + if (e != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +namespace vllm { + +constexpr int kMaxBlocks = 36; +// Counter may overflow, but it's fine since unsigned int overflow is +// well-defined behavior. +using FlagType = uint32_t; +struct Signal { + alignas(128) FlagType self_counter[kMaxBlocks][8]; + // Two sets of peer counters are needed for two syncs. The reason is that + // it's possible for peer GPU block to arrive at the second sync point while + // the current GPU block haven't passed the first sync point. Thus, peer GPU + // may write counter+1 while current GPU is busy waiting for counter. We use + // alternating counter array to avoid this possibility. + alignas(128) FlagType peer_counter[2][kMaxBlocks][8]; +}; + +struct __align__(16) RankData { const void* __restrict__ ptrs[8]; }; + +struct __align__(16) RankSignals { Signal* signals[8]; }; + +// like std::array, but aligned +template +struct __align__(alignof(T) * sz) array_t { + T data[sz]; + using type = T; + static constexpr int size = sz; +}; + +// use packed type to maximize memory efficiency +// goal: generate ld.128 and st.128 instructions +template +struct packed_t { + // the (P)acked type for load/store + using P = array_t; + // the (A)ccumulator type for reduction + using A = array_t; +}; + +#define DINLINE __device__ __forceinline__ + +// scalar cast functions +DINLINE float upcast_s(half val) { return __half2float(val); } + +template +DINLINE T downcast_s(float val); +template <> +DINLINE half downcast_s(float val) { + return __float2half(val); +} + +// scalar add functions +// for some reason when compiling with Pytorch, the + operator for half and +// bfloat is disabled so we call the intrinsics directly +DINLINE half& assign_add(half& a, half b) { + a = __hadd(a, b); + return a; +} +DINLINE float& assign_add(float& a, float b) { return a += b; } + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); } +template <> +DINLINE nv_bfloat16 downcast_s(float val) { + return __float2bfloat16(val); +} +DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) { + a = __hadd(a, b); + return a; +} +#endif + +template +DINLINE array_t& packed_assign_add(array_t& a, array_t b) { +#pragma unroll + for (int i = 0; i < N; i++) { + assign_add(a.data[i], b.data[i]); + } + return a; +} + +template +DINLINE array_t upcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + array_t out; +#pragma unroll + for (int i = 0; i < N; i++) { + out.data[i] = 
upcast_s(val.data[i]); + } + return out; + } +} + +template +DINLINE O downcast(array_t val) { + if constexpr (std::is_same::value) { + return val; + } else { + O out; +#pragma unroll + for (int i = 0; i < O::size; i++) { + out.data[i] = downcast_s(val.data[i]); + } + return out; + } +} + +static DINLINE void st_flag_release(FlagType* flag_addr, FlagType flag) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("st.release.sys.global.u32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); +#else + asm volatile("membar.sys; st.volatile.global.u32 [%1], %0;" ::"r"(flag), + "l"(flag_addr)); +#endif +} + +static DINLINE FlagType ld_flag_acquire(FlagType* flag_addr) { + FlagType flag; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile("ld.acquire.sys.global.u32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); +#else + asm volatile("ld.volatile.global.u32 %0, [%1]; membar.gl;" + : "=r"(flag) + : "l"(flag_addr)); +#endif + return flag; +} + +static DINLINE void st_flag_volatile(FlagType* flag_addr, FlagType flag) { + asm volatile("st.volatile.global.u32 [%1], %0;" ::"r"(flag), "l"(flag_addr)); +} + +static DINLINE FlagType ld_flag_volatile(FlagType* flag_addr) { + FlagType flag; + asm volatile("ld.volatile.global.u32 %0, [%1];" + : "=r"(flag) + : "l"(flag_addr)); + return flag; +} + +// is_start: whether this is the very first synchronization barrier. +// need_fence: whether a memory fence is needed. If true, a release-acquire +// semantic is used to enforce memory access order before and after this +// barrier. +template +DINLINE void multi_gpu_barrier(const RankSignals& sg, Signal* self_sg, + int rank) { + if constexpr (!is_start) __syncthreads(); + static_assert( + !(is_start && need_fence)); // Start barrier shouldn't need fence. + if (threadIdx.x < ngpus) { + // Increment the counter. Technically we only need one counter, but we use + // multiple per block to eliminate the need to share the counter via smem. + auto val = self_sg->self_counter[blockIdx.x][threadIdx.x] += 1; + // Write the expected counter value to peer and wait for correct value from + // peer. + auto peer_counter_ptr = + &sg.signals[threadIdx.x]->peer_counter[val % 2][blockIdx.x][rank]; + auto self_counter_ptr = + &self_sg->peer_counter[val % 2][blockIdx.x][threadIdx.x]; + if constexpr (need_fence) { + st_flag_release(peer_counter_ptr, val); + while (ld_flag_acquire(self_counter_ptr) != val); + } else { + st_flag_volatile(peer_counter_ptr, val); + while (ld_flag_volatile(self_counter_ptr) != val); + } + } + if constexpr (is_start || need_fence) __syncthreads(); +} + +template +DINLINE P packed_reduce(const P* ptrs[], int idx) { + A tmp = upcast(ptrs[0][idx]); +#pragma unroll + for (int i = 1; i < ngpus; i++) { + packed_assign_add(tmp, upcast(ptrs[i][idx])); + } + return downcast
<P>
(tmp); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_1stage(RankData* _dp, RankSignals sg, Signal* self_sg, + T* __restrict__ result, int rank, int size) { + using P = typename packed_t::P; + using A = typename packed_t::A; + // note: we don't reorder the address so the accumulation order is the same + // for all ranks, ensuring bitwise identical results + auto dp = *_dp; + multi_gpu_barrier(sg, self_sg, rank); + // do the actual reduction + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + ((P*)result)[idx] = packed_reduce((const P**)&dp.ptrs[0], idx); + } + multi_gpu_barrier(sg, self_sg, rank); +} + +template +DINLINE P* get_tmp_buf(Signal* sg) { + return (P*)(((Signal*)sg) + 1); +} + +template +__global__ void __launch_bounds__(512, 1) + cross_device_reduce_2stage(RankData* _dp, RankSignals sg, Signal* self_sg, + T* __restrict__ result, int rank, int size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + using P = typename packed_t::P; + using A = typename packed_t::A; + int part = size / ngpus; + int start = rank * part; + int end = rank == ngpus - 1 ? size : start + part; + int largest_part = part + size % ngpus; + const P* ptrs[ngpus]; + P* tmps[ngpus]; +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int target = (rank + i) % ngpus; + ptrs[i] = (const P*)_dp->ptrs[target]; + tmps[i] = get_tmp_buf
<P>
(sg.signals[target]); + } + auto tmp_out = tmps[0]; + multi_gpu_barrier(sg, self_sg, rank); + // stage 1: reduce scatter + for (int idx = start + tid; idx < end; idx += stride) { + tmp_out[idx - start] = packed_reduce(ptrs, idx); + } + multi_gpu_barrier(sg, self_sg, rank); + + // stage 2: allgather. Note: it's important to match the tid between + // the two stages, because visibility across devices is only guaranteed + // between threads that have the same tid. If thread i computes the sum of + // start + i in the first stage, then thread i also gathers start + i from all + // ranks. + for (int idx = tid; idx < largest_part; idx += stride) { +#pragma unroll + for (int i = 0; i < ngpus; i++) { + int gather_from_rank = ((rank + i) % ngpus); + if (gather_from_rank == ngpus - 1 || idx < part) { + int dst_idx = gather_from_rank * part + idx; + ((P*)result)[dst_idx] = tmps[i][idx]; + } + } + } +} + +using IPC_KEY = std::array; +static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t)); +static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t)); + +class CustomAllreduce { + public: + int rank_; + int world_size_; + bool full_nvlink_; + + RankSignals sg_; + // Stores an map from a pointer to its peer pointters from all ranks. + std::unordered_map buffers_; + Signal* self_sg_; + + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph capture + // time. Therefore, during capture, we increment the rank data pointer and use + // that as the argument to the kernel. The kernel arguments are stored in + // graph_unreg_buffers_. The actual peer pointers will be filled in at the + // memory pointed to by the pointers in graph_unreg_buffers_ when + // the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each addresses used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. + RankData *d_rank_data_base_, *d_rank_data_end_; + std::vector graph_unreg_buffers_; + // a map from IPC handles to opened IPC pointers + std::map ipc_handles_; + + /** + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffer, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allreduce synchronization, and the second section + * is for storing the intermediate results required by some allreduce algos. + * + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor. 
+ */ + CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool full_nvlink = true) + : rank_(rank), + world_size_(world_size), + full_nvlink_(full_nvlink), + self_sg_(signals[rank]), + d_rank_data_base_(reinterpret_cast(rank_data)), + d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { + for (int i = 0; i < world_size_; i++) { + sg_.signals[i] = signals[i]; + } + } + + char* open_ipc_handle(const void* ipc_handle) { + auto [it, new_handle] = + ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr}); + if (new_handle) { + char* ipc_ptr; + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr, + *((const cudaIpcMemHandle_t*)ipc_handle), + cudaIpcMemLazyEnablePeerAccess)); + it->second = ipc_ptr; + } + return it->second; + } + + std::pair> get_graph_buffer_ipc_meta() { + auto num_buffers = graph_unreg_buffers_.size(); + auto handle_sz = sizeof(cudaIpcMemHandle_t); + std::string handles(handle_sz * num_buffers, static_cast(0)); + std::vector offsets(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto ptr = graph_unreg_buffers_[i]; + void* base_ptr; + // note: must share the base address of each allocation, or we get wrong + // address + if (cuPointerGetAttribute(&base_ptr, + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, + (CUdeviceptr)ptr) != CUDA_SUCCESS) + throw std::runtime_error("failed to get pointer attr"); + CUDACHECK(cudaIpcGetMemHandle( + (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr)); + offsets[i] = ((char*)ptr) - ((char*)base_ptr); + } + return std::make_pair(handles, offsets); + } + + void check_rank_data_capacity(size_t num = 1) { + if (d_rank_data_base_ + num > d_rank_data_end_) + throw std::runtime_error( + "Rank data buffer is overflowed by " + + std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); + } + + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { + check_rank_data_capacity(); + RankData data; + for (int i = 0; i < world_size_; i++) { + data.ptrs[i] = ptrs[i]; + } + auto d_data = d_rank_data_base_++; + CUDACHECK( + cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); + buffers_[ptrs[rank_]] = d_data; + } + + // Note: when registering graph buffers, we intentionally choose to not + // deduplicate the addresses. That means if the allocator reuses some + // addresses, they will be registered again. This is to account for the remote + // possibility of different allocation patterns between ranks. For example, + // rank 1 may get the same input address for the second allreduce, but rank 2 + // got a different address. IPC handles have internal reference counting + // mechanism so overhead should be small. + void register_graph_buffers( + const std::vector& handles, + const std::vector>& offsets) { + auto num_buffers = graph_unreg_buffers_.size(); + check_rank_data_capacity(num_buffers); + std::vector rank_data(num_buffers); + for (int i = 0; i < num_buffers; i++) { + auto self_ptr = graph_unreg_buffers_[i]; + auto& rd = rank_data[i]; + for (int j = 0; j < world_size_; j++) { + if (j != rank_) { + char* handle = + open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]); + handle += offsets[j][i]; + rd.ptrs[j] = handle; + } else { + rd.ptrs[j] = self_ptr; + } + } + } + CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(), + sizeof(RankData) * num_buffers, + cudaMemcpyHostToDevice)); + d_rank_data_base_ += num_buffers; + graph_unreg_buffers_.clear(); + } + + /** + * Performs allreduce, assuming input has already been registered. 
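+   * (Unregistered inputs are only tolerated while a CUDA graph capture is active; their peer
+   * pointers are filled in later through register_graph_buffers.)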
+ * + * Block and grid default configs are results after careful grid search. Using + * 36 blocks give the best or close to the best runtime on the devices I + * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only + * take a small amount of SMs. Not quite sure the underlying reason, but my + * guess is that too many SMs will cause contention on NVLink bus. + */ + template + void allreduce(cudaStream_t stream, T* input, T* output, int size, + int threads = 512, int block_limit = 36) { + auto d = packed_t::P::size; + if (size % d != 0) + throw std::runtime_error( + "custom allreduce currently requires input length to be multiple " + "of " + + std::to_string(d)); + if (block_limit > kMaxBlocks) + throw std::runtime_error("max supported block limit is " + + std::to_string(kMaxBlocks) + ". Got " + + std::to_string(block_limit)); + + RankData* ptrs; + cudaStreamCaptureStatus status; + CUDACHECK(cudaStreamIsCapturing(stream, &status)); + if (status == cudaStreamCaptureStatusActive) { + ptrs = d_rank_data_base_ + graph_unreg_buffers_.size(); + graph_unreg_buffers_.push_back(input); + } else { + auto it = buffers_.find(input); + if (it == buffers_.end()) + throw std::runtime_error( + "buffer address " + + std::to_string(reinterpret_cast(input)) + + " is not registered!"); + ptrs = it->second; + } + + size /= d; + auto bytes = size * sizeof(typename packed_t::P); + int blocks = std::min(block_limit, (size + threads - 1) / threads); +#define KL(ngpus, name) \ + name<<>>(ptrs, sg_, self_sg_, output, \ + rank_, size); + // TODO(hanzhi713): Threshold is different for A100 and H100. + // Add per device threshold. +#define REDUCE_CASE(ngpus) \ + case ngpus: { \ + if (world_size_ == 2) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else if (full_nvlink_) { \ + if ((world_size_ <= 4 && bytes < 512 * 1024) || \ + (world_size_ <= 8 && bytes < 256 * 1024)) { \ + KL(ngpus, cross_device_reduce_1stage); \ + } else { \ + KL(ngpus, cross_device_reduce_2stage); \ + } \ + } \ + break; \ + } + + switch (world_size_) { + REDUCE_CASE(2) + REDUCE_CASE(4) + REDUCE_CASE(6) + REDUCE_CASE(8) + default: + throw std::runtime_error( + "custom allreduce only supports num gpus in (2,4,6,8). Actual num " + "gpus = " + + std::to_string(world_size_)); + } +#undef REDUCE_CASE +#undef KL + } + + ~CustomAllreduce() { + for (auto [_, ptr] : ipc_handles_) { + CUDACHECK(cudaIpcCloseMemHandle(ptr)); + } + } +}; +/** + * To inspect PTX/SASS, copy paste this header file to compiler explorer and add + a template instantiation: + * template void vllm::CustomAllreduce::allreduce(cudaStream_t, half *, + half *, int, int, int); +*/ +} // namespace vllm diff --git a/lightllm-kernel/csrc/attention/decode_attention_kernel.cu b/lightllm-kernel/csrc/attention/decode_attention_kernel.cu new file mode 100644 index 000000000..3fd4ce336 --- /dev/null +++ b/lightllm-kernel/csrc/attention/decode_attention_kernel.cu @@ -0,0 +1,569 @@ +#include +#include // need for FLT_MAX +#include +#include +#include +#include "ops_common.h" +#include +#include +#include +#include + +namespace lightllm { +namespace ops { + +# include +#define LIGHT_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define LIGHT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, LIGHT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +template +__device__ inline float tofloat(T value) { + return static_cast(value); +} + +// Specialization for __half +template <> +__device__ inline float tofloat<__half>(__half value) { + return __half2float(value); +} + +// Specialization for __nv_bfloat16 +template <> +__device__ inline float tofloat<__nv_bfloat16>(__nv_bfloat16 value) { + return __bfloat162float(value); +} + +template +struct BytesToType; + +template <> +struct BytesToType<2> +{ + using type = uint16_t; +}; +template <> +struct BytesToType<4> +{ + using type = uint32_t; +}; +template <> +struct BytesToType<8> +{ + using type = uint64_t; +}; +template <> +struct BytesToType<16> +{ + using type = float4; +}; + +template +__device__ inline void copy(const void* local, void* data) +{ + using T = typename BytesToType::type; + + const T* in = static_cast(local); + T* out = static_cast(data); + *out = *in; +} + +template +__device__ inline +float attn_thread_group_dot(T* local_q, T* local_k) +{ + // Helper function for QK Dot. + // [TODO] It should be optimized by type fp32x4. + + float qk = 0.0f; +# pragma unroll + for(int32_t i = 0; i < ELEMENT_NUM; i++) { + qk += tofloat(local_q[i]) * tofloat(local_k[i]); + } +#pragma unroll + for (int32_t mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +template +__device__ inline +float attn_block_reduce_max(float reducing, float* shared_mem) +{ + // Helper function for reduce softmax qkmax. + constexpr int32_t WARP_SIZE = 32; + const int32_t lane_id = threadIdx.x % WARP_SIZE; + const int32_t warp_id = threadIdx.x / WARP_SIZE; + +# pragma unroll + for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask)); + } + + if (lane_id == 0) { + shared_mem[warp_id] = reducing; + } + __syncthreads(); + + if (lane_id < WPT) reducing = shared_mem[lane_id]; + else reducing = -FLT_MAX; + +# pragma unroll + for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) { + reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask)); + } + + reducing = __shfl_sync(uint32_t(-1), reducing, 0); + return reducing; +} + +template +__device__ inline +float attn_block_reduce_sum(float reducing, float *shared_mem) +{ + // Helper function for reduce softmax exp sum. 
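+    // Two-stage block reduction (same pattern as attn_block_reduce_max above):
+    // each warp butterfly-reduces its lanes with __shfl_xor_sync and lane 0
+    // writes the partial to shared_mem[warp_id]; every warp then re-reduces the
+    // WPT partials and broadcasts lane 0, so all threads hold the block-wide sum.
+    // With the TPB = 256 launches below, WPT = 256 / 32 = 8.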
+ constexpr int32_t WARP_SIZE = 32; + const int32_t lane_id = threadIdx.x % WARP_SIZE; + const int32_t warp_id = threadIdx.x / WARP_SIZE; + +# pragma unroll + for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask); + } + + if (lane_id == 0) shared_mem[warp_id] = reducing; + __syncthreads(); + + if (lane_id < WPT) reducing = shared_mem[lane_id]; + +# pragma unroll + for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) { + reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask); + } + reducing = __shfl_sync(uint32_t(-1), reducing, 0); + return reducing; +} + +template< + int32_t HEAD_SIZE, + int32_t THREAD_GROUP_SIZE, // how many threads inside a group + int32_t TPB, + int32_t QUANT_GROUP, + typename T> +__global__ +void dynamic_batching_decoding_cache_attention_fp16_kernel( + T* __restrict__ output, // [context_lens, num_heads..., head_size] + + const T* __restrict__ query, // [seq_lens, num_heads..., head_size] + const int8_t* k_cache, // [max_token, num_kv_heads, head_size] + const T* k_scale, // [max_token, num_kv_heads, head_size / quant_group(8)] + const int8_t* v_cache, // [max_token, num_kv_heads, head_size] + const T* v_scale, // [max_token, num_kv_heads, head_size / quant_group(8)] + + const float attn_scale, + + const int64_t output_stride_s, + const int64_t output_stride_h, + + const int64_t query_stride_s, + const int64_t query_stride_h, + + const int64_t kcache_stride_s, + const int64_t kcache_stride_h, + + const int64_t vcache_stride_s, + const int64_t vcache_stride_h, + + const int32_t * __restrict__ b_seq_len, + const int32_t * __restrict__ b_req_idx, + const int32_t * __restrict__ req_to_tokens, + const int64_t req_to_tokens_stride, + const int64_t max_len_in_batch, + const int64_t gqa_group_size) { + + /* --- Decoding Attention Kernel Implementation --- */ + constexpr int64_t WARP_SIZE = 32; // warp size + constexpr int64_t WPT = TPB / WARP_SIZE; // warp per thread block, TPB for Thread per block 4, block_size + constexpr int64_t GPW = WARP_SIZE / THREAD_GROUP_SIZE; // thread group per warp 4 + constexpr int64_t GPT = WARP_SIZE / THREAD_GROUP_SIZE * WPT; // thread group per thread block 16 + + // const int64_t num_heads = gridDim.x; + const int64_t head_idx = blockIdx.x; + const int64_t batch_idx = blockIdx.y; + + const int64_t seq_len = b_seq_len[batch_idx]; + const int64_t cur_req_idx = b_req_idx[batch_idx]; + const int32_t * b_start_loc = req_to_tokens + cur_req_idx * req_to_tokens_stride; + + constexpr int64_t VEC_SIZE = 16 / sizeof(T); // 128 bits, 这个是 cuda 能操作的最大的一个单位的数吧,8 + + // ------------------------------------------------ // + // Step 1. Load Q into Thread Reg. + constexpr int64_t VEC_LEN = (HEAD_SIZE / VEC_SIZE) / THREAD_GROUP_SIZE; // 128 / 8 / 8 = 2 + + static_assert((HEAD_SIZE / THREAD_GROUP_SIZE) % VEC_SIZE == 0); + static_assert(HEAD_SIZE % THREAD_GROUP_SIZE == 0); + static_assert(QUANT_GROUP == 8); + + constexpr int64_t QUANT_GROUP_SHIFT = 3; + + // The elements in Q, K, and V will be evenly distributed across each thread group. 
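+    // Worked example for the <HEAD_SIZE=128, THREAD_GROUP_SIZE=8, TPB=256> launch
+    // used below with a 16-bit T: VEC_SIZE = 16 / 2 = 8 (128-bit accesses are the
+    // widest single load/store a CUDA thread issues), VEC_LEN = (128 / 8) / 8 = 2,
+    // so each thread of a group holds 16 of the head's 128 Q elements via two
+    // strided 128-bit loads; WPT = 256 / 32 = 8 warps and GPW = 32 / 8 = 4 groups
+    // per warp give GPT = 32 thread groups, i.e. 32 context tokens per iteration.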
+ T local_q[VEC_SIZE * VEC_LEN]; // 2 * 8 + + const int64_t warp_id = threadIdx.x / WARP_SIZE; + const int64_t warp_lane_id = threadIdx.x % WARP_SIZE; + const int64_t group_id = warp_lane_id / THREAD_GROUP_SIZE; + const int64_t group_lane_id = warp_lane_id % THREAD_GROUP_SIZE; + const int64_t kv_head_idx = head_idx / gqa_group_size; + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from Q to Local Q + + // 这个地方是错开间隔读取的,不知道如果设置成为连续位置读取会不会一样呢? + copy( + &query[ + batch_idx * query_stride_s + + head_idx * query_stride_h + + (group_lane_id + i * THREAD_GROUP_SIZE) * VEC_SIZE + ], + &local_q[i * VEC_SIZE]); + } + // ------------------------------------------------ // + // Step 2. Solve QK Dot + + const int64_t context_len = seq_len; + extern __shared__ float logits[]; + float qk_max = -FLT_MAX; + + for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) { + int8_t local_k_quant[VEC_SIZE * VEC_LEN]; + T local_k[VEC_SIZE * VEC_LEN]; + T local_k_scale[VEC_LEN]; + const int64_t context_id = base_id + group_id; + const int64_t mem_context_id = *(b_start_loc + context_id); + + // all thread groups within a warp must be launched together. + if (context_id >= context_len){ + memset(local_k, 0, sizeof(local_k)); + } else { + const int64_t key_offset + = (mem_context_id) * kcache_stride_s + + kv_head_idx * kcache_stride_h + + group_lane_id * VEC_SIZE; + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from K to Local K + const int64_t key_idx = key_offset + i * THREAD_GROUP_SIZE * VEC_SIZE; + copy(&k_cache[key_idx], &local_k_quant[i * VEC_SIZE]); + + const int64_t key_scale_idx = key_idx >> QUANT_GROUP_SHIFT; + local_k_scale[i] = k_scale[key_scale_idx]; + } + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int64_t j = 0; j < VEC_SIZE; j++) { + local_k[i * VEC_SIZE + j] + = local_k_scale[i] * (T)local_k_quant[i * VEC_SIZE + j]; + } + } + } + + // Ready for QK Dot + const float qk_dot + = attn_scale + * attn_thread_group_dot(local_q, local_k); + + if (group_lane_id == 0 && context_id < context_len) { + logits[context_id] = qk_dot; + qk_max = fmaxf(qk_dot, qk_max); + } + } + + // ------------------------------------------------ // + // Step 3. Softmax + + __shared__ float red_smem[WPT]; + + qk_max = attn_block_reduce_max(qk_max, red_smem); + + float exp_sum = 0.0f; + for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB){ + logits[context_id] -= qk_max; + logits[context_id] = exp(logits[context_id]); + exp_sum += logits[context_id]; + } + + static_assert(WPT == 2 || WPT == 4 || WPT == 8 || WPT == 16 || WPT == 32 || WPT == 64); + exp_sum = attn_block_reduce_sum(exp_sum, red_smem); + + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB) { + logits[context_id] *= inv_sum; + } + __syncthreads(); // Must have this. + + // ------------------------------------------------ // + // Step 4. Solve logits * V + + int8_t local_v_quant[VEC_SIZE * VEC_LEN]; + float local_v[VEC_SIZE * VEC_LEN]; + T local_v_scale[VEC_LEN]; + + #pragma unroll + for(int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) { + local_v[i] = 0; + } + + for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) { + const int64_t context_id = base_id + group_id; + const int64_t mem_context_id = *(b_start_loc + context_id); + // all thread groups within a warp must be launched together. 
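+        // Dequantization: with QUANT_GROUP = 8, eight consecutive int8 cache
+        // entries share one scale, so the scale index is the element index >> 3
+        // (QUANT_GROUP_SHIFT). Each dequantized V element, v_scale * (float)v_int8,
+        // is weighted by its softmax probability logits[context_id] and
+        // accumulated in fp32 registers.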
+ if (context_id < context_len){ + const int64_t value_offset + = (mem_context_id) * vcache_stride_s + + kv_head_idx * vcache_stride_h + + group_lane_id * VEC_SIZE; + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from V to Local V + const int64_t value_idx = value_offset + i * THREAD_GROUP_SIZE * VEC_SIZE; + copy(&v_cache[value_idx], &local_v_quant[i * VEC_SIZE]); + + const int64_t value_scale_idx = value_idx >> QUANT_GROUP_SHIFT; + local_v_scale[i] = v_scale[value_scale_idx]; + } + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int64_t j = 0; j < VEC_SIZE; j++) { + local_v[i * VEC_SIZE + j] += (tofloat(local_v_scale[i]) + * (float)local_v_quant[i * VEC_SIZE + j] + * logits[context_id]); + } + } + } + } + + #pragma unroll + for (int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) { + #pragma unroll + for (int32_t mask = THREAD_GROUP_SIZE; mask <= WARP_SIZE >> 1; mask = mask << 1) { + local_v[i] += __shfl_xor_sync(uint32_t(-1), local_v[i], mask); + } + } + + __syncthreads(); + + // do some reuse + for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){ + logits[i] = 0; + } + + __syncthreads(); + + if (warp_lane_id < THREAD_GROUP_SIZE) { + #pragma unroll + for (int32_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int32_t j = 0; j < VEC_SIZE; j++) { + atomicAdd( + logits + i * THREAD_GROUP_SIZE * VEC_SIZE + warp_lane_id * VEC_SIZE + j, + local_v[i * VEC_SIZE + j] + ); + } + } + } + + __syncthreads(); + + for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){ + output[batch_idx * output_stride_s + head_idx * output_stride_h + i] = logits[i]; + } +} + + +template +void run_group_int8kv_decode_attention_kernel( + T* __restrict__ output, + const T* __restrict__ query, + const int8_t* k_cache, + const T* k_scale, + const int8_t* v_cache, + const T* v_scale, + const float attn_scale, + const int64_t output_stride_s, + const int64_t output_stride_h, + const int64_t query_stride_s, + const int64_t query_stride_h, + const int64_t kcache_stride_s, + const int64_t kcache_stride_h, + const int64_t vcache_stride_s, + const int64_t vcache_stride_h, + const int32_t * __restrict__ b_seq_len, + const int32_t * __restrict__ b_req_idx, + const int32_t * __restrict__ req_to_tokens, + const int64_t req_to_tokens_stride, + const int64_t max_len_in_batch, + + const int64_t batch_size, + const int64_t q_head_num, + const int64_t head_dim, + const int64_t gqa_group_size) { + + constexpr int64_t WARP_SIZE = 32; + constexpr int64_t TPB = 256; + constexpr int64_t MAX_SHM_SIZE = 48 * 1024; + + constexpr int64_t reduce_shm_size = TPB / WARP_SIZE * sizeof(float); + const int64_t logits_size = max(max_len_in_batch * sizeof(float), head_dim * sizeof(float)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (reduce_shm_size + logits_size <= MAX_SHM_SIZE) { + const dim3 grid_size = {(unsigned int)q_head_num, (unsigned int)batch_size, 1}; + switch (head_dim){ + case 64: + dynamic_batching_decoding_cache_attention_fp16_kernel<64, 4, 256, 8> + <<>> + ( + output, query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_stride_s, output_stride_h, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 96: + dynamic_batching_decoding_cache_attention_fp16_kernel<96, 4, 256, 8> + <<>> + ( + output, query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + 
output_stride_s, output_stride_h, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 128: + dynamic_batching_decoding_cache_attention_fp16_kernel<128, 8, 256, 8> + <<>> + ( + output, query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_stride_s, output_stride_h, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 256: + dynamic_batching_decoding_cache_attention_fp16_kernel<256, 16, 256, 8> + <<>> + ( + output, query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_stride_s, output_stride_h, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + default: + assert(false); + } + } else { + assert(false); + } +} + +void group_int8kv_decode_attention(at::Tensor o, at::Tensor q, at::Tensor k, at::Tensor k_s, at::Tensor v, at::Tensor v_s, at::Tensor req_to_tokens, at::Tensor b_req_idx, at::Tensor b_seq_len, int max_len_in_batch) { + int64_t batch_size = b_seq_len.sizes()[0]; + int64_t head_num = q.sizes()[1]; + int64_t head_dim = q.sizes()[2]; // q shape [batchsize, head_num, head_dim] + float att_scale = 1.0 / std::sqrt(head_dim); + int64_t kv_head_num = k.sizes()[1]; + assert(head_num % kv_head_num == 0); + int64_t gqa_group_size = head_num / kv_head_num; + LIGHT_DISPATCH_FLOATING_TYPES(q.scalar_type(), "group_int8kv_decode_attention", ([&]{ + run_group_int8kv_decode_attention_kernel( + o.data_ptr(), q.data_ptr(), + k.data_ptr(), k_s.data_ptr(), + v.data_ptr(), v_s.data_ptr(), + att_scale, + o.stride(0), + o.stride(1), + q.stride(0), + q.stride(1), + k.stride(0), + k.stride(1), + v.stride(0), + v.stride(1), + b_seq_len.data_ptr(), + b_req_idx.data_ptr(), + req_to_tokens.data_ptr(), + req_to_tokens.stride(0), + max_len_in_batch, + batch_size, + head_num, + head_dim, + gqa_group_size + ); + } + )); +} + +void group_int8kv_decode_attention( + torch::Tensor o, + torch::Tensor q, + torch::Tensor k, + torch::Tensor k_s, + torch::Tensor v, + torch::Tensor v_s, + torch::Tensor req_to_tokens, + torch::Tensor b_req_idx, + torch::Tensor b_seq_len, + int64_t max_len_in_batch) +{ + group_int8kv_decode_attention( + o, + q, + k, + k_s, + v, + v_s, + req_to_tokens, + b_req_idx, + b_seq_len, + static_cast(max_len_in_batch) + ); +} + + +} +} \ No newline at end of file diff --git a/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu b/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu new file mode 100644 index 000000000..c55eaaf6f --- /dev/null +++ b/lightllm-kernel/csrc/attention/decode_attention_kernel_in8kv_flashdecoding.cu @@ -0,0 +1,650 @@ + +#include +#include +#include // need for FLT_MAX +#include +#include +#include +#include +#include +#include "ops_common.h" +# include + +#include +#include + +namespace lightllm { +namespace ops { + +template +__device__ inline float tofloat(T value) { + return static_cast(value); +} + +// Specialization for __half +template <> +__device__ inline float tofloat<__half>(__half value) { + return __half2float(value); +} + +// Specialization for __nv_bfloat16 +template <> +__device__ 
inline float tofloat<__nv_bfloat16>(__nv_bfloat16 value) { + return __bfloat162float(value); +} + +#define LIGHT_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define LIGHT_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, LIGHT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +template +struct BytesToType; + +template <> +struct BytesToType<2> +{ + using type = uint16_t; +}; +template <> +struct BytesToType<4> +{ + using type = uint32_t; +}; +template <> +struct BytesToType<8> +{ + using type = uint64_t; +}; +template <> +struct BytesToType<16> +{ + using type = float4; +}; + +template +__device__ inline void copy(const void* local, void* data) +{ + using T = typename BytesToType::type; + + const T* in = static_cast(local); + T* out = static_cast(data); + *out = *in; +} + +template +__device__ inline +float attn_thread_group_dot(T* local_q, T* local_k) +{ + // Helper function for QK Dot. + // [TODO] It should be optimized by type fp32x4. + + float qk = 0.0f; +# pragma unroll + for(int32_t i = 0; i < ELEMENT_NUM; i++) { + qk += tofloat(local_q[i]) * tofloat(local_k[i]); + } +#pragma unroll + for (int32_t mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +template +__device__ inline +float attn_block_reduce_max(float reducing, float* shared_mem) +{ + // Helper function for reduce softmax qkmax. + constexpr int32_t WARP_SIZE = 32; + const int32_t lane_id = threadIdx.x % WARP_SIZE; + const int32_t warp_id = threadIdx.x / WARP_SIZE; + +# pragma unroll + for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask)); + } + + if (lane_id == 0) { + shared_mem[warp_id] = reducing; + } + __syncthreads(); + + if (lane_id < WPT) reducing = shared_mem[lane_id]; + else reducing = -FLT_MAX; + +# pragma unroll + for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) { + reducing = fmaxf(reducing, __shfl_xor_sync(uint32_t(-1), reducing, mask)); + } + + reducing = __shfl_sync(uint32_t(-1), reducing, 0); + return reducing; +} + +template +__device__ inline +float attn_block_reduce_sum(float reducing, float *shared_mem) +{ + // Helper function for reduce softmax exp sum. 
+ constexpr int32_t WARP_SIZE = 32; + const int32_t lane_id = threadIdx.x % WARP_SIZE; + const int32_t warp_id = threadIdx.x / WARP_SIZE; + +# pragma unroll + for (int32_t mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask); + } + + if (lane_id == 0) shared_mem[warp_id] = reducing; + __syncthreads(); + + if (lane_id < WPT) reducing = shared_mem[lane_id]; + +# pragma unroll + for (int32_t mask = WPT / 2; mask >= 1; mask /= 2) { + reducing += __shfl_xor_sync(uint32_t(-1), reducing, mask); + } + reducing = __shfl_sync(uint32_t(-1), reducing, 0); + return reducing; +} + +template< + int32_t HEAD_SIZE, + int32_t THREAD_GROUP_SIZE, // how many threads inside a group + int32_t TPB, + int32_t QUANT_GROUP, + typename T> +__global__ +void dynamic_batching_flashdecoding_cache_attention_int8kv_kernel( + const int64_t seq_block_size, + + T* __restrict__ output_emb, + T* __restrict__ output_logexpsum, + // T* __restrict__ output, // [context_lens, num_heads..., head_size] + + const T* __restrict__ query, // [seq_lens, num_heads..., head_size] + const int8_t* k_cache, // [max_token, num_kv_heads, head_size] + const T* k_scale, // [max_token, num_kv_heads, head_size / quant_group(8)] + const int8_t* v_cache, // [max_token, num_kv_heads, head_size] + const T* v_scale, // [max_token, num_kv_heads, head_size / quant_group(8)] + + const float attn_scale, + + const int64_t output_emb_stride_b, + const int64_t output_emb_stride_h, + const int64_t output_emb_stride_s, + const int64_t output_emb_stride_d, + + const int64_t output_logexpsum_stride_b, + const int64_t output_logexpsum_stride_h, + const int64_t output_logexpsum_stride_s, + + const int64_t query_stride_s, + const int64_t query_stride_h, + + const int64_t kcache_stride_s, + const int64_t kcache_stride_h, + + const int64_t vcache_stride_s, + const int64_t vcache_stride_h, + + const int32_t * __restrict__ b_seq_len, + const int32_t * __restrict__ b_req_idx, + const int32_t * __restrict__ req_to_tokens, + const int64_t req_to_tokens_stride, + const int64_t max_len_in_batch, + const int64_t gqa_group_size) { + + /* --- Decoding Attention Kernel Implementation --- */ + constexpr int64_t WARP_SIZE = 32; // warp size + constexpr int64_t WPT = TPB / WARP_SIZE; // warp per thread block, TPB for Thread per block 4, block_size + constexpr int64_t GPW = WARP_SIZE / THREAD_GROUP_SIZE; // thread group per warp 4 + constexpr int64_t GPT = WARP_SIZE / THREAD_GROUP_SIZE * WPT; // thread group per thread block 16 + + // const int64_t num_heads = gridDim.x; + const int64_t head_idx = blockIdx.x; + const int64_t batch_idx = blockIdx.y; + const int64_t seq_block_idx = blockIdx.z; + + const int64_t seq_len = b_seq_len[batch_idx]; + const int64_t cur_req_idx = b_req_idx[batch_idx]; + const int32_t * b_start_loc = req_to_tokens + cur_req_idx * req_to_tokens_stride + seq_block_idx * seq_block_size; + + constexpr int64_t VEC_SIZE = 16 / sizeof(T); // 128 bits, 这个是 cuda 能操作的最大的一个单位的数吧,8 + + // ------------------------------------------------ // + // Step 1. Load Q into Thread Reg. + constexpr int64_t VEC_LEN = (HEAD_SIZE / VEC_SIZE) / THREAD_GROUP_SIZE; // 128 / 8 / 8 = 2 + + static_assert((HEAD_SIZE / THREAD_GROUP_SIZE) % VEC_SIZE == 0); + static_assert(HEAD_SIZE % THREAD_GROUP_SIZE == 0); + static_assert(QUANT_GROUP == 8); + + constexpr int64_t QUANT_GROUP_SHIFT = 3; + + // The elements in Q, K, and V will be evenly distributed across each thread group. 
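+    // Flash-decoding split: blockIdx.z selects one chunk of seq_block_size
+    // tokens, so this block attends over at most seq_block_size keys/values and
+    // writes a partial embedding plus its log-sum-exp, logf(exp_sum) + qk_max,
+    // from which a later combine step can merge the per-chunk partials.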
+ T local_q[VEC_SIZE * VEC_LEN]; // 2 * 8 + + const int64_t warp_id = threadIdx.x / WARP_SIZE; + const int64_t warp_lane_id = threadIdx.x % WARP_SIZE; + const int64_t group_id = warp_lane_id / THREAD_GROUP_SIZE; + const int64_t group_lane_id = warp_lane_id % THREAD_GROUP_SIZE; + const int64_t kv_head_idx = head_idx / gqa_group_size; + + if (seq_len <= seq_block_idx * seq_block_size) { + return; + } + const int64_t context_len = min(seq_len - seq_block_idx * seq_block_size, seq_block_size); + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from Q to Local Q + + // 这个地方是错开间隔读取的,不知道如果设置成为连续位置读取会不会一样呢? + copy( + &query[ + batch_idx * query_stride_s + + head_idx * query_stride_h + + (group_lane_id + i * THREAD_GROUP_SIZE) * VEC_SIZE + ], + &local_q[i * VEC_SIZE]); + } + // ------------------------------------------------ // + // Step 2. Solve QK Dot + + extern __shared__ float logits[]; + float qk_max = -FLT_MAX; + + for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) { + int8_t local_k_quant[VEC_SIZE * VEC_LEN]; + T local_k[VEC_SIZE * VEC_LEN]; + T local_k_scale[VEC_LEN]; + const int64_t context_id = base_id + group_id; + const int64_t mem_context_id = *(b_start_loc + context_id); + + // all thread groups within a warp must be launched together. + if (context_id >= context_len){ + memset(local_k, 0, sizeof(local_k)); + } else { + const int64_t key_offset + = (mem_context_id) * kcache_stride_s + + kv_head_idx * kcache_stride_h + + group_lane_id * VEC_SIZE; + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from K to Local K + const int64_t key_idx = key_offset + i * THREAD_GROUP_SIZE * VEC_SIZE; + copy(&k_cache[key_idx], &local_k_quant[i * VEC_SIZE]); + + const int64_t key_scale_idx = key_idx >> QUANT_GROUP_SHIFT; + local_k_scale[i] = k_scale[key_scale_idx]; + } + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int64_t j = 0; j < VEC_SIZE; j++) { + local_k[i * VEC_SIZE + j] + = local_k_scale[i] * (T)local_k_quant[i * VEC_SIZE + j]; + } + } + } + + // Ready for QK Dot + const float qk_dot + = attn_scale + * attn_thread_group_dot(local_q, local_k); + + if (group_lane_id == 0 && context_id < context_len) { + logits[context_id] = qk_dot; + qk_max = fmaxf(qk_dot, qk_max); + } + } + + // ------------------------------------------------ // + // Step 3. Softmax + + __shared__ float red_smem[WPT]; + + qk_max = attn_block_reduce_max(qk_max, red_smem); + + float exp_sum = 0.0f; + for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB){ + logits[context_id] -= qk_max; + logits[context_id] = exp(logits[context_id]); + exp_sum += logits[context_id]; + } + + static_assert(WPT == 2 || WPT == 4 || WPT == 8 || WPT == 16 || WPT == 32 || WPT == 64); + exp_sum = attn_block_reduce_sum(exp_sum, red_smem); + + const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f); + for (int64_t context_id = threadIdx.x; context_id < context_len; context_id += TPB) { + logits[context_id] *= inv_sum; + } + __syncthreads(); // Must have this. + + // ------------------------------------------------ // + // Step 4. 
Solve logits * V + + int8_t local_v_quant[VEC_SIZE * VEC_LEN]; + float local_v[VEC_SIZE * VEC_LEN]; + T local_v_scale[VEC_LEN]; + + #pragma unroll + for(int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) { + local_v[i] = 0; + } + + for (int64_t base_id = warp_id * GPW; base_id < context_len; base_id += GPT) { + const int64_t context_id = base_id + group_id; + const int64_t mem_context_id = *(b_start_loc + context_id); + // all thread groups within a warp must be launched together. + if (context_id < context_len){ + const int64_t value_offset + = (mem_context_id) * vcache_stride_s + + kv_head_idx * vcache_stride_h + + group_lane_id * VEC_SIZE; + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + // copy 128(16 * 8) bits from V to Local V + const int64_t value_idx = value_offset + i * THREAD_GROUP_SIZE * VEC_SIZE; + copy(&v_cache[value_idx], &local_v_quant[i * VEC_SIZE]); + + const int64_t value_scale_idx = value_idx >> QUANT_GROUP_SHIFT; + local_v_scale[i] = v_scale[value_scale_idx]; + } + + #pragma unroll + for (int64_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int64_t j = 0; j < VEC_SIZE; j++) { + local_v[i * VEC_SIZE + j] += (tofloat(local_v_scale[i]) + * (float)local_v_quant[i * VEC_SIZE + j] + * logits[context_id]); + } + } + } + } + + #pragma unroll + for (int32_t i = 0; i < VEC_SIZE * VEC_LEN; i++) { + #pragma unroll + for (int32_t mask = THREAD_GROUP_SIZE; mask <= WARP_SIZE >> 1; mask = mask << 1) { + local_v[i] += __shfl_xor_sync(uint32_t(-1), local_v[i], mask); + } + } + + __syncthreads(); + + // do some reuse + for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB){ + logits[i] = 0; + } + + __syncthreads(); + + if (warp_lane_id < THREAD_GROUP_SIZE) { + #pragma unroll + for (int32_t i = 0; i < VEC_LEN; i++) { + #pragma unroll + for (int32_t j = 0; j < VEC_SIZE; j++) { + atomicAdd( + logits + i * THREAD_GROUP_SIZE * VEC_SIZE + warp_lane_id * VEC_SIZE + j, + local_v[i * VEC_SIZE + j] + ); + } + } + } + + __syncthreads(); + + for (int64_t i = threadIdx.x; i < HEAD_SIZE; i += TPB) { + output_emb[batch_idx * output_emb_stride_b + head_idx * output_emb_stride_h + seq_block_idx * output_emb_stride_s + i] = logits[i]; + } + + output_logexpsum[batch_idx * output_logexpsum_stride_b + head_idx * output_logexpsum_stride_h + seq_block_idx] = logf(exp_sum) + qk_max; +} + + +template +void run_group_int8kv_decode_flashattention_kernel( + const int64_t seq_block_size, + T* __restrict__ output_emb, + T* __restrict__ output_logexpsum, + const T* __restrict__ query, + const int8_t* k_cache, + const T* k_scale, + const int8_t* v_cache, + const T* v_scale, + const float attn_scale, + + const int64_t output_emb_stride_b, + const int64_t output_emb_stride_h, + const int64_t output_emb_stride_s, + const int64_t output_emb_stride_d, + + const int64_t output_logexpsum_stride_b, + const int64_t output_logexpsum_stride_h, + const int64_t output_logexpsum_stride_s, + + const int64_t query_stride_s, + const int64_t query_stride_h, + const int64_t kcache_stride_s, + const int64_t kcache_stride_h, + const int64_t vcache_stride_s, + const int64_t vcache_stride_h, + const int32_t * __restrict__ b_seq_len, + const int32_t * __restrict__ b_req_idx, + const int32_t * __restrict__ req_to_tokens, + const int64_t req_to_tokens_stride, + const int64_t max_len_in_batch, + + const int64_t batch_size, + const int64_t q_head_num, + const int64_t head_dim, + const int64_t gqa_group_size) { + + constexpr int64_t WARP_SIZE = 32; + constexpr int64_t TPB = 256; + constexpr int64_t MAX_SHM_SIZE = 48 * 1024; + + constexpr 
int64_t reduce_shm_size = TPB / WARP_SIZE * sizeof(float); + const int64_t logits_size = max(seq_block_size * sizeof(float), head_dim * sizeof(float)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + if (reduce_shm_size + logits_size <= MAX_SHM_SIZE) { + const dim3 grid_size = {static_cast(q_head_num), static_cast(batch_size), static_cast((max_len_in_batch + seq_block_size - 1) / seq_block_size)}; + switch (head_dim){ + case 64: + dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<64, 4, 256, 8> + <<>> + ( + seq_block_size, + output_emb, + output_logexpsum, + query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_emb_stride_b, + output_emb_stride_h, + output_emb_stride_s, + output_emb_stride_d, + output_logexpsum_stride_b, + output_logexpsum_stride_h, + output_logexpsum_stride_s, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 96: + dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<96, 4, 256, 8> + <<>> + ( + seq_block_size, + output_emb, + output_logexpsum, + query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_emb_stride_b, + output_emb_stride_h, + output_emb_stride_s, + output_emb_stride_d, + output_logexpsum_stride_b, + output_logexpsum_stride_h, + output_logexpsum_stride_s, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 128: + dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<128, 8, 256, 8> + <<>> + ( + seq_block_size, + output_emb, + output_logexpsum, + query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_emb_stride_b, + output_emb_stride_h, + output_emb_stride_s, + output_emb_stride_d, + output_logexpsum_stride_b, + output_logexpsum_stride_h, + output_logexpsum_stride_s, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + case 256: + dynamic_batching_flashdecoding_cache_attention_int8kv_kernel<256, 16, 256, 8> + <<>> + ( + seq_block_size, + output_emb, + output_logexpsum, + query, k_cache, k_scale, v_cache, v_scale, + attn_scale, + output_emb_stride_b, + output_emb_stride_h, + output_emb_stride_s, + output_emb_stride_d, + output_logexpsum_stride_b, + output_logexpsum_stride_h, + output_logexpsum_stride_s, + query_stride_s, query_stride_h, + kcache_stride_s, kcache_stride_h, + vcache_stride_s, vcache_stride_h, + b_seq_len, b_req_idx, req_to_tokens, + req_to_tokens_stride, + max_len_in_batch, + gqa_group_size + ); + break; + default: + assert(false); + } + } else { + assert(false); + } +} + +void group_int8kv_flashdecoding_attention(const int seq_block_size, at::Tensor mid_o_emb, at::Tensor mid_o_logexpsum, float att_scale, at::Tensor q, at::Tensor k, at::Tensor k_s, at::Tensor v, at::Tensor v_s, at::Tensor req_to_tokens, at::Tensor b_req_idx, at::Tensor b_seq_len, int max_len_in_batch) { + int64_t batch_size = b_seq_len.sizes()[0]; + int64_t head_num = q.sizes()[1]; + int64_t head_dim = q.sizes()[2]; // q shape [batchsize, head_num, head_dim] + int64_t kv_head_num = k.sizes()[1]; + assert(head_num % kv_head_num == 0); + int64_t gqa_group_size = head_num / kv_head_num; + + 
LIGHT_DISPATCH_FLOATING_TYPES(q.scalar_type(), "group_int8kv_flashdecoding_attention", ([&] { + run_group_int8kv_decode_flashattention_kernel( + seq_block_size, + mid_o_emb.data_ptr(), + mid_o_logexpsum.data_ptr(), + q.data_ptr(), + k.data_ptr(), k_s.data_ptr(), + v.data_ptr(), v_s.data_ptr(), + att_scale, + + mid_o_emb.stride(0), + mid_o_emb.stride(1), + mid_o_emb.stride(2), + mid_o_emb.stride(3), + mid_o_logexpsum.stride(0), + mid_o_logexpsum.stride(1), + mid_o_logexpsum.stride(2), + + q.stride(0), + q.stride(1), + k.stride(0), + k.stride(1), + v.stride(0), + v.stride(1), + b_seq_len.data_ptr(), + b_req_idx.data_ptr(), + req_to_tokens.data_ptr(), + req_to_tokens.stride(0), + max_len_in_batch, + batch_size, + head_num, + head_dim, + gqa_group_size + ); + })); + +} + +void group_int8kv_flashdecoding_attention( + const int64_t seq_block_size, + torch::Tensor mid_o_emb, + torch::Tensor mid_o_logexpsum, + fp32_t att_scale, + torch::Tensor q, + torch::Tensor k, + torch::Tensor k_s, + torch::Tensor v, + torch::Tensor v_s, + torch::Tensor req_to_tokens, + torch::Tensor b_req_idx, + torch::Tensor b_seq_len, + int64_t max_len_in_batch) +{ + group_int8kv_flashdecoding_attention( + static_cast(seq_block_size), + mid_o_emb, + mid_o_logexpsum, + att_scale, + q, + k, + k_s, + v, + v_s, + req_to_tokens, + b_req_idx, + b_seq_len, + static_cast(max_len_in_batch) + ); +} + +} +} \ No newline at end of file diff --git a/lightllm-kernel/csrc/cuda_compat.h b/lightllm-kernel/csrc/cuda_compat.h new file mode 100644 index 000000000..82e55613d --- /dev/null +++ b/lightllm-kernel/csrc/cuda_compat.h @@ -0,0 +1,49 @@ +#pragma once + +#ifdef USE_ROCM + #include +#endif + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#ifndef USE_ROCM + #define VLLM_LDG(arg) __ldg(arg) +#else + #define VLLM_LDG(arg) *(arg) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor_sync(uint32_t(-1), var, lane_mask, width) +#else + #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask) + #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \ + __shfl_xor(var, lane_mask, width) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane) +#else + #define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane) +#endif + +#ifndef USE_ROCM + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \ + __shfl_down_sync(uint32_t(-1), var, lane_delta) +#else + #define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta) +#endif + +#ifndef USE_ROCM + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL) +#else + #define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \ + hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL) +#endif diff --git a/lightllm-kernel/csrc/fusion/add_norm_quant.cu b/lightllm-kernel/csrc/fusion/add_norm_quant.cu new file mode 100755 index 000000000..3684dffc8 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/add_norm_quant.cu @@ -0,0 +1,551 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +template +__global__ void device_add_norm_quant_bf16_general( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in 
BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N, // Number of cols in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + const fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store data. + extern __shared__ bf16_t workspace1[]; + + // Local registers to hold data. + bf16_t local_input; + bf16_t local_residual; + bf16_t local_w; + bf16_t local_output; + fp8_e4m3_t local_f8; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid; i < N; i += TPB) { + local_input = _input[i]; + local_residual = _residual[i]; + + fp32_t x = cvt_bf16_f32(local_input); + fp32_t r = cvt_bf16_f32(local_residual); + local_input = cvt_f32_bf16(x + r); + fp32_t tmp = cvt_bf16_f32(local_input); + local_square_sum += tmp * tmp; + + _input[i] = local_input; + workspace1[i] = local_input; + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. 
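+    // Per-row reference of the fused computation (illustrative pseudocode only):
+    //   y[i]  = x[i] + r[i]                      // residual add, written back to input
+    //   n[i]  = y[i] * rsqrt(mean(y^2) + eps) * w[i]
+    //   scale = max_i |n[i]| / 448.0             // FP8 E4M3 dynamic range
+    //   q[i]  = fp8_e4m3(n[i] / (scale + 1e-7))  // scale is stored per row in `scales`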
+ fp32_t local_max = -FLT_MAX; + for (int32_t i = tid; i < N; i += TPB) { + local_input = workspace1[i]; + local_w = weight[i]; + + fp32_t x = cvt_bf16_f32(local_input); + fp32_t w = cvt_bf16_f32(local_w); + + fp32_t ret = x * inv_norm * w; + local_output = cvt_f32_bf16(ret); + fp32_t tmp = cvt_bf16_f32(local_output); + local_max = fmaxf(local_max, fabsf(tmp)); + + workspace1[i] = local_output; + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid; i < N; i += TPB) { + local_output = workspace1[i]; + + fp32_t tmp = cvt_bf16_f32(local_output); + fp32_t ret = tmp * inv_scale; + local_f8 = fp8_e4m3_t(ret); + + _output[i] = local_f8; + } + + if(tid == 0){ + *_scales = scale; + } +} + + + +template +__global__ void device_add_norm_quant_bf16_vpt( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N, // Number of cols in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + constexpr int32_t VPT = 8; // Number of FP16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store vectorized (half2) data. + // Note: since each bf16x2_t holds 2 half values, the workspace size is N/2. + extern __shared__ bf16x2_t workspace2[]; + + // Local registers to hold vectorized data. + bf16x2_t local_input[VPT / 2]; + bf16x2_t local_residual[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_output[VPT / 2]; + fp8x4_e4m3_t local_f8[VPT / 4]; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT FP16 elements from global memory (_input) into local vector (local_input). + vec_copy(_input + i, local_input); + // Load VPT FP16 elements from global memory (_residual) into local vector (local_residual). + vec_copy(_residual + i, local_residual); + + # pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + // Convert the bf16x2_t to fp32x2_t for computation. + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]); + // Add the residual to the input. + local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y)); + + fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + + // Store the loaded data into shared memory. + // Divide index by 2 because 'workspace' is an array of bf16x2_t. 
+ vec_copy(local_input, _input + i); + vec_copy(local_input, workspace2 + (i >> 1)); + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from shared memory. + vec_copy(workspace2 + (i >> 1), local_input); + // Load the corresponding weight values from global memory. + vec_copy(weight + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_output[j] = _float22bf162_rn(ret); + + + fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + + vec_copy(local_output, workspace2 + (i >> 1)); + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace2 + (i >> 1), local_output); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void device_add_norm_quant_bf16( + bf16_t* __restrict__ input, // Input tensor in BF16 format + const bf16_t* __restrict__ residual, // Residual tensor in BF16 format + const bf16_t* __restrict__ weight, // Weight tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const fp32_t eps // Epsilon value for numerical stability +) { + constexpr int32_t VPT = 8; // Number of FP16 values processed per thread. + constexpr fp32_t r_N = 1 / (fp32_t)N; // Reciprocal of N. + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _input = input + bid * N; + const bf16_t* _residual = residual + bid * N; + fp8_e4m3_t* _output = output + bid * N; + + fp32_t* _scales; + _scales = scales + bid; + + // Shared memory workspace to store vectorized (half2) data. + // Note: since each bf16x2_t holds 2 half values, the workspace size is N/2. 
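+    // Sizing note: the static workspace holds N/2 bf16x2_t = 2*N bytes; the
+    // largest specialization instantiated below (N = 12800) therefore uses
+    // 25,600 bytes, well within the 48 KB static shared-memory limit. Shapes not
+    // covered by the switch fall back to the _vpt/_general kernels, which size
+    // the workspace as dynamic shared memory instead.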
+ __shared__ bf16x2_t workspace[N / 2]; + + // Local registers to hold vectorized data. + bf16x2_t local_input[VPT / 2]; + bf16x2_t local_residual[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_output[VPT / 2]; + fp8x4_e4m3_t local_f8[VPT / 4]; + + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT FP16 elements from global memory (_input) into local vector (local_input). + vec_copy(_input + i, local_input); + // Load VPT FP16 elements from global memory (_residual) into local vector (local_residual). + vec_copy(_residual + i, local_residual); + + # pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + // Convert the bf16x2_t to fp32x2_t for computation. + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t r = bf16x2_to_fp32x2(local_residual[j]); + // Add the residual to the input. + local_input[j] = _float22bf162_rn(make_float2(x.x + r.x, x.y + r.y)); + + fp32x2_t tmp = bf16x2_to_fp32x2(local_input[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + + // Store the loaded data into shared memory. + // Divide index by 2 because 'workspace' is an array of bf16x2_t. + vec_copy(local_input, _input + i); + vec_copy(local_input, workspace + (i >> 1)); + } + + const fp32_t reduced_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + const fp32_t mean_square = reduced_square_sum * r_N; + const fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + fp32_t local_max = -FLT_MAX; + #pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from shared memory. + vec_copy(workspace + (i >> 1), local_input); + // Load the corresponding weight values from global memory. + vec_copy(weight + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_input[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. 
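+            // Note: the per-row max below is taken on the rounded bf16 result
+            // rather than the fp32 intermediate, so the FP8 scale is computed
+            // from exactly the values that will be quantized.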
+ fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_output[j] = _float22bf162_rn(ret); + + + fp32x2_t tmp = bf16x2_to_fp32x2(local_output[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + + vec_copy(local_output, workspace + (i >> 1)); + } + + // Reduce the maximum value across the block + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + #pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_output); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_output[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_output[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + +/** + * @brief Fused add norm quant + */ +std::tuple add_norm_quant_bf16_fp8( + Tensor& X, const Tensor &R, const Tensor &W, + const fp32_t eps +) { + TORCH_CHECK(X.ndimension() == 2, "Input tensor X must be 2D"); + TORCH_CHECK(R.ndimension() == 2, "Input tensor R must be 2D"); + TORCH_CHECK(W.ndimension() == 1, "Input tensor W must be 1D"); + + TORCH_CHECK(X.is_cuda(), "Input tensor X must be a CUDA tensor."); + TORCH_CHECK(R.is_cuda(), "Input tensor R must be a CUDA tensor."); + TORCH_CHECK(W.is_cuda(), "Input tensor W must be a CUDA tensor."); + + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor X must be BF16."); + TORCH_CHECK(R.scalar_type() == c10::ScalarType::BFloat16, "Input tensor R must be BF16."); + TORCH_CHECK(W.scalar_type() == c10::ScalarType::BFloat16, "Input tensor W must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor contiguous_R = R.is_contiguous() ? R : R.contiguous(); + Tensor contiguous_W = W.is_contiguous() ? 
W : W.contiguous(); + + const uint32_t M = contiguous_X.size(0); + const uint32_t N = contiguous_X.size(1); + + Tensor output_q = torch::empty( + {M, N}, + torch::TensorOptions() + .dtype(torch::kFloat8_e4m3fn) + .device(contiguous_X.device()) + ); + Tensor scales = torch::empty( + {M, 1}, + torch::TensorOptions() + .dtype(torch::kFloat32) + .device(contiguous_X.device()) + ); + + const int32_t blocks = M; + + switch (N) { + case 16: + device_add_norm_quant_bf16<128, 16> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 32: + device_add_norm_quant_bf16<128, 32> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 64: + device_add_norm_quant_bf16<128, 64> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 512: + device_add_norm_quant_bf16<128, 512> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 1024: + device_add_norm_quant_bf16<128, 1024> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 3200: + device_add_norm_quant_bf16<128, 3200> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 4096: + device_add_norm_quant_bf16<128, 4096> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + case 12800: + device_add_norm_quant_bf16<256, 12800> + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + eps + ); + break; + default: { + static constexpr int32_t TPB = 128; + const int64_t shared_mem_size = N * sizeof(bf16_t); + if (N % 8 == 0) { + device_add_norm_quant_bf16_vpt + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + N, + eps + ); + } else { + device_add_norm_quant_bf16_general + <<>>( + PTR(contiguous_X), + PTR(contiguous_R), + PTR(contiguous_W), + PTR(output_q), + PTR(scales), + M, + N, + eps + ); + } + } + } + + return {output_q, scales}; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu new file mode 100755 index 000000000..b204e9737 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/gelu_per_token_quant.cu @@ -0,0 +1,367 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +template +__global__ void device_gelu_per_token_quant_bf16_to_fp8( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M // Number of rows in the input tensor +) { + constexpr int32_t VPT = 8; + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + const bf16x2_t one = _float22bf162_rn(make_float2(1.0f, 1.0f)); + const bf16x2_t one_2 = 
_float22bf162_rn(make_float2(0.5f, 0.5f)); + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + // Local arrays for intermediate storage + fp8x4_e4m3_t local_f8[VPT / 4]; + bf16x2_t local_bf16[VPT / 2]; + + __shared__ bf16x2_t workspace[N / 2]; + + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(_input + i, local_bf16); + //gelu + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + tmp.x = erf(tmp.x * 0.7071067811f); + tmp.y = erf(tmp.y * 0.7071067811f); + bf16x2_t tan = _float22bf162_rn(tmp); + tan = __hadd2(tan, one); + tan = __hmul2(tan, local_bf16[j]); + tan = __hmul2(tan, one_2); + local_bf16[j] = tan; + } + + vec_copy(local_bf16, workspace + (i >> 1)); + + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_bf16); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void gelu_per_token_quant_bf16_to_fp8_vpt( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N +) { + constexpr int32_t VPT = 8; + + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f; + constexpr fp32_t coeff = 0.044715f; + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + // Local arrays for intermediate storage + fp8x4_e4m3_t local_f8[VPT / 4]; + bf16x2_t local_bf16[VPT / 2]; + + extern __shared__ bf16x2_t workspace[]; + + fp32_t local_max = -FLT_MAX; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(_input + i, local_bf16); + + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + + fp32_t tanh_arg1 = sqrt_2_over_pi * (tmp.x + coeff * tmp.x * tmp.x * tmp.x); + fp32_t tanh_arg2 = sqrt_2_over_pi * (tmp.y + coeff * tmp.y * tmp.y * tmp.y); + tmp.x = 0.5f * tmp.x * (1.0f + tanhf(tanh_arg1)); + tmp.y = 0.5f * tmp.y * (1.0f + tanhf(tanh_arg2)); + + local_bf16[j] = _float22bf162_rn(tmp); + } + + vec_copy(local_bf16, workspace + (i >> 1)); + 
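+        // GELU here uses the tanh approximation
+        //   gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))),
+        // matching sqrt_2_over_pi and coeff above, whereas the fixed-N kernel
+        // above applies the exact erf form 0.5 * x * (1 + erf(x / sqrt(2))).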
+ // Compute the max for the VPT elements. + #pragma unroll + for(int32_t j = 0; j< VPT/2; j++){ + fp32x2_t tmp = bf16x2_to_fp32x2(local_bf16[j]); + fp32_t max = fmaxf(fabsf(tmp.x), fabsf(tmp.y)); + local_max = fmaxf(local_max, max); + } + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + vec_copy(workspace + (i >> 1), local_bf16); + + #pragma unroll + for (int32_t j = 0; j < VPT/4; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_bf16[2 * j + 0]); + fp32x2_t y = bf16x2_to_fp32x2(local_bf16[2 * j + 1]); + fp32x4_t ret = make_float4( + x.x * inv_scale, + x.y * inv_scale, + y.x * inv_scale, + y.y * inv_scale + ); + local_f8[j] = fp8x4_e4m3_t(ret); + } + + vec_copy(local_f8, _output + i); + } + + if(tid == 0){ + *_scales = scale; + } +} + + +template +__global__ void gelu_per_token_quant_bf16_to_fp8_general( + const bf16_t* __restrict__ input, // Input tensor in BF16 format + fp8_e4m3_t* __restrict__ output, // Output tensor in FP8 format + fp32_t* __restrict__ scales, // Output scales for each group + const int64_t M, // Number of rows in the input tensor + const int32_t N +) { + const int32_t bid = blockIdx.x; + const int32_t tid = threadIdx.x; + constexpr fp32_t FP8_E4M3_MAX = 448.0f; // Maximum value representable in FP8 E4M3 format + constexpr fp32_t sqrt_2_over_pi = 0.7978845608028654f; + constexpr fp32_t coeff = 0.044715f; + + const bf16_t* _input = input + bid * N; // Input pointer for the group + fp8_e4m3_t* _output = output + bid * N; // Output pointer for the group + + fp32_t* _scales; + _scales = scales + bid; + + extern __shared__ bf16_t workspace_[]; + + fp32_t local_max = -FLT_MAX; + + for (int32_t i = tid; i < N; i += TPB) { + fp32_t tmp = cvt_bf16_f32(_input[i]); + fp32_t tanh_arg = sqrt_2_over_pi * (tmp + coeff * tmp * tmp * tmp); + tmp = 0.5f * tmp * (1.0f + tanhf(tanh_arg)); + local_max = fmaxf(local_max, fabsf(tmp)); + workspace_[i] = cvt_f32_bf16(tmp); + } + + // Reduce the maximum value across the thread group + const fp32_t reduced_max = lightllm::reduce::sm70::sync_block_reduce_max_f32(local_max); + + // Compute the scale factor with epsilon to avoid division by zero + constexpr fp32_t epsilon = 1e-7f; + const fp32_t scale = reduced_max / FP8_E4M3_MAX; + const fp32_t inv_scale = 1.0f / (scale + epsilon); + + for (int32_t i = tid; i < N; i += TPB) { + // Load the previously stored vectorized data from shared memory. + fp32_t x = cvt_bf16_f32(workspace_[i]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32_t ret = x * inv_scale; + _output[i] = fp8_e4m3_t(ret); + } + + if(tid == 0){ + *_scales = scale; + } +} + +void gelu_per_token_quant_bf16_fp8 ( + Tensor& output, + const Tensor& input, + Tensor& scales +) { + TORCH_CHECK(input.is_cuda(), "Input must be a CUDA tensor"); + TORCH_CHECK(input.dim() == 2, "Input must be 2-dimensional"); + TORCH_CHECK(input.scalar_type() == c10::kBFloat16, "Input must be BF16 type"); + + Tensor contiguous_input = input.is_contiguous() ? input : input.contiguous(); + Tensor contiguous_scales = scales.is_contiguous() ? 
scales : scales.contiguous(); + + const int64_t M = input.size(0); + const int64_t N = input.size(1); + + const int32_t blocks = M; + + switch (N) { + case 16: + device_gelu_per_token_quant_bf16_to_fp8<64, 16> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 32: + device_gelu_per_token_quant_bf16_to_fp8<64, 32> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 64: + device_gelu_per_token_quant_bf16_to_fp8<64, 64> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 512: + device_gelu_per_token_quant_bf16_to_fp8<64, 512> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + + case 1024: + device_gelu_per_token_quant_bf16_to_fp8<128, 1024> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 2048: + device_gelu_per_token_quant_bf16_to_fp8<128, 2048> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 3200: + device_gelu_per_token_quant_bf16_to_fp8<128, 3200> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 4096: + device_gelu_per_token_quant_bf16_to_fp8<256, 4096> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + case 12800: + device_gelu_per_token_quant_bf16_to_fp8<256, 12800> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M + ); + break; + default: { + static constexpr int32_t TPB = 128; + int32_t sharedmem = N / 2 * sizeof(bf16x2_t); + if (N % 8 == 0) { + gelu_per_token_quant_bf16_to_fp8_vpt<128> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M, N + ); + } + else { + gelu_per_token_quant_bf16_to_fp8_general<128> + <<>>( + PTR(contiguous_input), + PTR(output), + PTR(contiguous_scales), + M, N + ); + } + } + } + return ; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/post_tp_norm.cu b/lightllm-kernel/csrc/fusion/post_tp_norm.cu new file mode 100755 index 000000000..89f711405 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/post_tp_norm.cu @@ -0,0 +1,364 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using namespace lightllm; + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row. + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16_general( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t N, + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. 
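+    // Note: this kernel implements the second half of a tensor-parallel RMSNorm.
+    // V[bid] is assumed to already hold the per-row sum of squares over the full
+    // embed_dim (e.g. computed by pre_tp_norm_bf16 and all-reduced across ranks),
+    // while N is only the width of the local shard, so each element is scaled as
+    //     Y[i] = X[i] * rsqrtf(V[bid] / embed_dim + eps) * W[i].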
+ + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold data. + bf16_t local_x = cvt_f32_bf16(0.0f); + bf16_t local_w = cvt_f32_bf16(0.0f); + bf16_t local_y = cvt_f32_bf16(0.0f); + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + for (int32_t i = tid; i < N; i += TPB) { + local_x = _X[i]; + local_w = W[i]; + + fp32_t x = cvt_bf16_f32(local_x); + fp32_t w = cvt_bf16_f32(local_w); + + fp32_t ret = x * inv_norm * w; + local_y = cvt_f32_bf16(ret); + + _Y[i] = local_y; + } +} + + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. The kernel loads the + * data in a vectorized manner (using half2), computes the mean square, + * calculates the reciprocal square root (i.e. 1/sqrt(mean_square+eps)), + * and then normalizes the input row element‐wise while scaling with a weight. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16_vpt( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t N, + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_y[VPT / 2]; + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from global memory. + vec_copy(_X + i, local_x); + // Load the corresponding weight values from global memory. + vec_copy(W + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_x[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. 
+ fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_y[j] = _float22bf162_rn(ret); + } + // Write the normalized vectorized data back to global memory. + vec_copy(local_y, _Y + i); + } +} + +/** + * @brief CUDA kernel to perform RMS normalization on an FP16 tensor. + * + * Each block processes one row of the input tensor. The kernel loads the + * data in a vectorized manner (using half2), computes the mean square, + * calculates the reciprocal square root (i.e. 1/sqrt(mean_square+eps)), + * and then normalizes the input row element‐wise while scaling with a weight. + * + * @tparam TPB Threads per block. + * @tparam N Number of FP16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param W Pointer to the weight tensor in global memory. [N] + * @param V Pointer to the variance tensor in global memory. [M] + * @param Y Pointer to the output tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + * @param eps Epsilon for numerical stability. + */ +template +__global__ +void device_post_tp_norm_bf16( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + const bf16_t __restrict__ *W, // [N] Weight tensor pointer. + const fp32_t __restrict__ *V, // [M] variance + bf16_t __restrict__ *Y, // [M, N] Output tensor pointer. + const int32_t M, // Number of rows. + const int32_t embed_dim, // if multiGPUs, embed_dim differs from N + const fp32_t eps // Epsilon for numerical stability. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + const fp32_t r_N = 1 / (fp32_t)embed_dim; // Reciprocal of N. + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + bf16_t* _Y = Y + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + bf16x2_t local_w[VPT / 2]; + bf16x2_t local_y[VPT / 2]; + + fp32_t reduced_square_sum = V[bid]; + + // Compute the mean square and then the inverse RMS normalization factor. + // For RMSNorm, the normalization factor is 1/sqrt(mean(x^2)+eps). + fp32_t mean_square = reduced_square_sum * r_N; + fp32_t inv_norm = rsqrtf(mean_square + eps); + + // Normalize each element using the computed normalization factor. + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load the previously stored vectorized data from global memory. + vec_copy(_X + i, local_x); + // Load the corresponding weight values from global memory. + vec_copy(W + i, local_w); + + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t x = bf16x2_to_fp32x2(local_x[j]); + fp32x2_t w = bf16x2_to_fp32x2(local_w[j]); + // Apply normalization: multiply by inv_norm and then scale by the weight. + fp32x2_t ret = make_float2( + x.x * inv_norm * w.x, + x.y * inv_norm * w.y + ); + local_y[j] = _float22bf162_rn(ret); + } + // Write the normalized vectorized data back to global memory. + vec_copy(local_y, _Y + i); + } +} + +/** + * @brief Launch RMSNorm kernel for FP16 tensors with aligned 16-element rows. + * + * This function validates the input tensors, ensures they are contiguous, + * selects the appropriate kernel configuration based on the row width N, + * and launches the CUDA kernel. + * + * @param X Input tensor with shape [M, N] (FP16, CUDA). 
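+ * @param V Per-row sum-of-squares tensor with shape [M] (FP32, CUDA); expected to be the
+ *          output of pre_tp_norm_bf16, all-reduced across tensor-parallel ranks.
+ * @param embed_dim Full embedding dimension used to form the mean square; equals N on a single GPU.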
+ * @param W Weight tensor with shape [N] (FP16, CUDA). + * @param eps Epsilon for numerical stability. + * @return Output tensor with the same shape as X. + */ +Tensor post_tp_norm_bf16(Tensor &X, const Tensor &W, const Tensor &V, const int embed_dim, const fp32_t eps) { + TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D"); + TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor."); + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor contiguous_W = W.is_contiguous() ? W : W.contiguous(); + Tensor contiguous_V = V.is_contiguous() ? V : V.contiguous(); + + Tensor input_tensor; + uint32_t M, N; + Tensor Y; + + if (X.ndimension() == 2) { + M = contiguous_X.size(0); + N = contiguous_X.size(1); + input_tensor = contiguous_X; + Y = torch::empty_like(input_tensor); + } else { + const uint32_t d0 = contiguous_X.size(0); + const uint32_t d1 = contiguous_X.size(1); + const uint32_t d2 = contiguous_X.size(2); + const uint32_t d3 = contiguous_X.size(3); + + M = d0 * d1; + N = d2 * d3; + input_tensor = contiguous_X.view({M, N}); + Y = torch::empty_like(input_tensor); + } + + // Each CUDA block processes one row. + const int32_t blocks = M; + + // Kernel dispatch based on the value of N. + switch (N) { + case 768: + device_post_tp_norm_bf16<128, 768> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 1024: + device_post_tp_norm_bf16<128, 1024> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 1664: + device_post_tp_norm_bf16<128, 1664> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 2048: + device_post_tp_norm_bf16<128, 2048> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 3200: + device_post_tp_norm_bf16<128, 3200> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 4096: + device_post_tp_norm_bf16<256, 4096> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 8192: + device_post_tp_norm_bf16<512, 8192> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + case 10240: + device_post_tp_norm_bf16<512, 10240> + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, embed_dim, eps + ); + break; + default: + static constexpr int32_t TPB = 256; + if (N % 8 == 0) { + device_post_tp_norm_bf16_vpt + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, N, embed_dim, eps + ); + } else { + device_post_tp_norm_bf16_general + <<>>( + PTR(input_tensor), PTR(contiguous_W), + PTR(contiguous_V), PTR(Y), + M, N, embed_dim, eps + ); + } + } + + // need to reshape Y back to 4 dimens + if (X.ndimension() == 4) { + Y = Y.reshape(X.sizes()); + } + + return Y; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/fusion/pre_tp_norm.cu b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu new file mode 100755 index 000000000..966cf5ce7 --- /dev/null +++ b/lightllm-kernel/csrc/fusion/pre_tp_norm.cu @@ -0,0 +1,257 @@ +#include "ops_common.h" +#include "reduce/sm70.cuh" + +namespace lightllm { +namespace ops { + +using 
namespace lightllm; + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row. + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16_general( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M, // Number of rows. + const int32_t N +) { + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + bf16_t local_x = cvt_f32_bf16(0.0f); + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid; i < N; i += TPB) { + local_x = _X[i]; + + fp32_t tmp = cvt_bf16_f32(local_x); + + local_square_sum += tmp * tmp; + } + + fp32_t block_square_sum = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + + if (tid == 0) { + V[bid] = block_square_sum; + } + +} + + + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16_vpt( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M, // Number of rows. + const int32_t N +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT bf16 elements from global memory (_X) into local vector (local_x). + vec_copy(_X + i, local_x); + + // Compute the sum of squares for the VPT elements. + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + } + + // Reduce the partial sums across the block, block reduce sum will invoke __syncthread(); + V[bid] = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + +} + + +/** + * @tparam TPB Threads per block. + * @tparam N Number of bf16 elements in one row (must be a multiple of VPT). + * + * @param X Pointer to the input tensor in global memory. [M, N] + * @param M Number of rows in the tensor. + */ +template +__global__ +void device_pre_tp_norm_bf16( + bf16_t __restrict__ *X, // [M, N] Input tensor pointer. + fp32_t __restrict__ *V, // [M] Variance tensor pointer. + const int32_t M // Number of rows. +) { + constexpr int32_t VPT = 8; // Number of bf16 values processed per thread. + + static_assert(N % 2 == 0, "N must be even."); + static_assert(N % VPT == 0, "N must be a multiple of VPT."); + + const int32_t tid = threadIdx.x; + const int32_t bid = blockIdx.x; + + // Each block processes one row of the input tensor. + bf16_t* _X = X + bid * N; + + // Local registers to hold vectorized data. + bf16x2_t local_x[VPT / 2]; + + // Each thread computes a partial sum of squares. + fp32_t local_square_sum = 0.0f; + # pragma unroll + for (int32_t i = tid * VPT; i < N; i += TPB * VPT) { + // Load VPT bf16 elements from global memory (_X) into local vector (local_x). 
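+        // (With N known at compile time the trip count of this loop is fixed, so the
+        //  unroll pragma above can fully unroll it; the *_vpt and *_general variants
+        //  handle row widths not covered by the dispatch switch in pre_tp_norm_bf16.)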
+ vec_copy(_X + i, local_x); + + // Compute the sum of squares for the VPT elements. + #pragma unroll + for (int32_t j = 0; j < VPT / 2; j++) { + fp32x2_t tmp = bf16x2_to_fp32x2(local_x[j]); + local_square_sum += (tmp.x * tmp.x + tmp.y * tmp.y); + } + } + + // Reduce the partial sums across the block, block reduce sum will invoke __syncthread(); + V[bid] = lightllm::reduce::sm70::sync_block_reduce_sum_f32(local_square_sum); + +} + +/** + * @param X Input tensor with shape [M, N] (bf16, CUDA). + */ +Tensor pre_tp_norm_bf16(Tensor &X) { + TORCH_CHECK(X.ndimension() == 2 || X.ndimension() == 4, "Input tensor must be 2D or 4D"); + TORCH_CHECK(X.is_cuda(), "Input tensor must be a CUDA tensor."); + TORCH_CHECK(X.scalar_type() == c10::ScalarType::BFloat16, "Input tensor must be BF16."); + + Tensor contiguous_X = X.is_contiguous() ? X : X.contiguous(); + Tensor input_tensor; + uint32_t M, N; + Tensor V; + + if (X.ndimension() == 2) { + M = contiguous_X.size(0); + N = contiguous_X.size(1); + input_tensor = contiguous_X; + V = torch::empty( + {M}, + torch::TensorOptions() + .dtype(c10::ScalarType::Float) + .device(contiguous_X.device()) + ); + } else { + const uint32_t d0 = contiguous_X.size(0); + const uint32_t d1 = contiguous_X.size(1); + const uint32_t d2 = contiguous_X.size(2); + const uint32_t d3 = contiguous_X.size(3); + + M = d0 * d1; + N = d2 * d3; + input_tensor = contiguous_X.view({M, N}); + V = torch::empty( + {M}, + torch::TensorOptions() + .dtype(c10::ScalarType::Float) + .device(contiguous_X.device()) + ); + } + + + // Each CUDA block processes one row. + const int32_t blocks = M; + + // Kernel dispatch based on the value of N. + switch (N) { + case 768: + device_pre_tp_norm_bf16<128, 768> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 1024: + device_pre_tp_norm_bf16<128, 1024> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 1664: + device_pre_tp_norm_bf16<128, 1664> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 2048: + device_pre_tp_norm_bf16<128, 2048> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 3200: + device_pre_tp_norm_bf16<128, 3200> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 4096: + device_pre_tp_norm_bf16<256, 4096> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 8192: + device_pre_tp_norm_bf16<512, 8192> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + case 10240: + device_pre_tp_norm_bf16<512, 10240> + <<>>( + PTR(input_tensor), PTR(V), M + ); + break; + default: { + static constexpr int32_t TPB = 256; + if (N % 8 == 0) { + device_pre_tp_norm_bf16_vpt + <<>>( + PTR(input_tensor), PTR(V), M, N + ); + } else { + device_pre_tp_norm_bf16_general + <<>>( + PTR(input_tensor), PTR(V), M, N + ); + } + } + } + return V; +} + +} // namespace ops +} // namespace lightllm \ No newline at end of file diff --git a/lightllm-kernel/csrc/gemm/Epilogues.md b/lightllm-kernel/csrc/gemm/Epilogues.md new file mode 100755 index 000000000..aae04157b --- /dev/null +++ b/lightllm-kernel/csrc/gemm/Epilogues.md @@ -0,0 +1,147 @@ +# CUTLASS Epilogues + +## Introduction +This document describes the various CUTLASS epilogues implemented for fusing de-quantization operations onto GEMMs. + +Currently, we only support symmetric quantization for weights, +and symmetric and asymmetric quantization for activations. +Both can be quantized per-tensor or per-channel (weights) / per-token (activations). + +There are 4 epilogues: +1. ScaledEpilogue: symmetric quantization for activations, no bias. +1. 
ScaledEpilogueBias: symmetric quantization for activations, supports bias.
+1. ScaledEpilogueAzp: asymmetric per-tensor quantization for activations, supports bias.
+1. ScaledEpilogueAzpPerToken: asymmetric per-token quantization for activations, supports bias.
+
+We do not have epilogues for asymmetric quantization of activations without bias, in order to reduce final binary size.
+Instead, if no bias is passed, the epilogue will use 0 as the bias.
+That induces a redundant addition operation (and runtime check), but the performance impact is minor.
+
+## Underlying Linear Algebra
+
+More details are available in the [Activation Quantization RFC](https://github.com/vllm-project/vllm/issues/3975).
+
+If $` \widehat X `$ is the quantized $` X `$, our matrices become the following:
+
+```math
+A = s_a (\widehat A - J_a z_a)
+```
+```math
+B = s_b \widehat B
+```
+```math
+D = A B + C
+```
+```math
+D = s_a s_b \widehat D + C
+```
+
+Here, D is the output of the GEMM and C is the bias.
+A holds the activations and supports asymmetric quantization;
+B holds the weights and supports only symmetric quantization.
+$` s_a `$ and $` s_b `$ are the scales for activations and weights, respectively.
+$` z_a `$ is the zero-point for activations, and $` J_a `$ is the all-ones matrix with the same dimensions as A.
+Additional epilogues would be required to support asymmetric quantization for weights.
+
+Expanding further, we can calculate $` \widehat D `$ as follows:
+
+```math
+A B = s_a ( \widehat A - J_a z_a ) s_b \widehat B
+```
+```math
+A B = s_a s_b \left( \widehat A \widehat B - J_a z_a \widehat B \right)
+```
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+
+Note that $` \widehat A \widehat B `$ is the raw output of the GEMM,
+and $` J_a \widehat B `$ is known ahead of time.
+Each of its rows is equal to $` \mathbf 1 \widehat B `$, which is a row-vector of column sums of $` \widehat B `$.
+
+## Epilogues
+
+### ScaledEpilogue
+This epilogue computes the symmetric quantization for activations without bias, meaning $` C = 0 `$ and $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+```math
+D = s_a s_b \widehat D
+```
+```math
+D = s_a s_b \widehat A \widehat B
+```
+
+Epilogue parameters:
+- `scale_a` is the scale for activations; it can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights; it can be per-tensor (scalar) or per-channel (row-vector).
+
+### ScaledEpilogueBias
+This epilogue computes the symmetric quantization for activations with bias, meaning $` z_a = 0 `$.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B
+```
+```math
+D = s_a s_b \widehat D + C
+```
+```math
+D = s_a s_b \widehat A \widehat B + C
+```
+
+Epilogue parameters:
+- `scale_a` is the scale for activations; it can be per-tensor (scalar) or per-token (column-vector).
+- `scale_b` is the scale for weights; it can be per-tensor (scalar) or per-channel (row-vector).
+- `bias` is the bias and is always per-channel (row-vector).
+
+### ScaledEpilogueAzp
+This epilogue computes the asymmetric per-tensor quantization for activations with bias.
+The output of the GEMM is:
+
+```math
+\widehat D = \widehat A \widehat B - z_a J_a \widehat B
+```
+```math
+D = s_a s_b \widehat D + C
+```
+```math
+D = s_a s_b \left( \widehat A \widehat B - z_a J_a \widehat B \right) + C
+```
+
+Because $` z_a `$ is a scalar, the zero-point term $` z_a J_a \widehat B `$ has every row equal to $` z_a \mathbf 1 \widehat B `$.
+That is precomputed and stored in `azp_with_adj` as a row-vector.
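+
+As a sanity check, the math above can be written out in a few lines of PyTorch. This is only an
+illustrative reference, not part of the kernel API: the function name and the concrete shapes and
+dtypes are assumptions chosen for clarity (the parameter names follow the epilogue parameters
+documented here), and the real CUTLASS epilogue fuses the same arithmetic into the GEMM instead of
+materializing intermediates.
+
+```python
+import torch
+
+def scaled_epilogue_azp_reference(Dq, scale_a, scale_b, azp_with_adj, bias):
+    # Reference sketch only (assumed shapes/dtypes); mirrors the ScaledEpilogueAzp formula above.
+    # Dq:           [M, N] raw GEMM accumulator (A_hat @ B_hat), e.g. int32
+    # scale_a:      per-tensor scalar or [M, 1] per-token scales
+    # scale_b:      per-tensor scalar or [1, N] per-channel scales
+    # azp_with_adj: [1, N] precomputed z_a * (1^T @ B_hat)
+    # bias:         [1, N] per-channel bias
+    acc = Dq.to(torch.float32) - azp_with_adj      # subtract the zero-point term
+    return scale_a * scale_b * acc + bias          # de-quantize and add the bias
+```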
+
+Epilogue parameters:
+- `scale_a` is the scale for activations; it can be per-tensor (scalar) or per-token (column-vector).
+  - Generally this will be per-tensor, as the zero-points are per-tensor.
+- `scale_b` is the scale for weights; it can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$) and is per-channel (row-vector).
+- `bias` is the bias and is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_with_adj` term offline and pass it to the kernel.
+
+### ScaledEpilogueAzpPerToken
+This epilogue computes the asymmetric per-token quantization for activations with bias.
+
+The output of the GEMM is the same as above, but here $` z_a `$ is a column-vector.
+That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product of $` z_a `$ and $` \mathbf 1 \widehat B `$.
+
+Epilogue parameters:
+- `scale_a` is the scale for activations; it can be per-tensor (scalar) or per-token (column-vector).
+  - Generally this will be per-token, as the zero-points are per-token.
+- `scale_b` is the scale for weights; it can be per-tensor (scalar) or per-channel (row-vector).
+- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$) and is per-channel (row-vector).
+- `azp` is the zero-point (`z_a`) and is per-token (column-vector).
+- `bias` is the bias and is always per-channel (row-vector).
+
+To use these kernels efficiently, users must precompute the `azp_adj` term offline and pass it to the kernel.
+
+The epilogue performs the following computation (where `Dq` is the raw quantized output of the GEMM):
+```
+out = scale_a * scale_b * (Dq - azp_adj * azp) + bias
+```
diff --git a/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu
new file mode 100755
index 000000000..55d623755
--- /dev/null
+++ b/lightllm-kernel/csrc/gemm/scaled_mm_c3x.cu
@@ -0,0 +1,73 @@
+#include <cudaTypedefs.h>
+
+#if defined CUDA_VERSION && CUDA_VERSION >= 12000
+
+  #include "scaled_mm_c3x_sm90_fp8_dispatch.cuh"
+  #include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
+
+namespace lightllm {
+namespace ops {
+
+using namespace lightllm;
+/*
+  This file defines quantized GEMM operations using the CUTLASS 3.x API, for
+  NVIDIA GPUs with sm90a (Hopper) or later.
+*/
+
+template