[EP] Add cuda kernel for moe_ep_pre_reorder (#6699) #7
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: PR Test | |
on: | |
push: | |
branches: [ main ] | |
paths: | |
- "python/**" | |
- "scripts/**" | |
- "test/**" | |
- ".github/workflows/pr-test.yml" | |
pull_request: | |
branches: [ main ] | |
paths: | |
- "python/**" | |
- "scripts/**" | |
- "test/**" | |
- ".github/workflows/pr-test.yml" | |
workflow_dispatch: | |
inputs: | |
version: | |
description: "FlashInfer version" | |
required: true | |
type: choice | |
default: 'release' | |
options: | |
- 'release' | |
- 'nightly' | |
concurrency: | |
group: pr-test-${{ github.ref }} | |
cancel-in-progress: true | |
jobs: | |
unit-test-frontend: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 1-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Run test | |
timeout-minutes: 10 | |
run: | | |
cd test/lang | |
python3 run_suite.py --suite per-commit | |
unit-test-backend-1-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 1-gpu-runner | |
strategy: | |
fail-fast: false | |
matrix: | |
part: [0, 1, 2, 3, 4, 5, 6, 7, 8] | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Run test | |
timeout-minutes: 30 | |
run: | | |
cd test/srt | |
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 9 | |
unit-test-backend-2-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 2-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Run test | |
timeout-minutes: 30 | |
run: | | |
cd test/srt | |
python3 run_suite.py --suite per-commit-2-gpu | |
unittest-test-backend-4-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
needs: [unit-test-frontend, unit-test-backend-2-gpu] | |
runs-on: 4-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Run test | |
timeout-minutes: 30 | |
run: | | |
cd test/srt | |
python3 run_suite.py --suite per-commit-4-gpu | |
unittest-test-backend-8-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
needs: [unit-test-frontend, unit-test-backend-2-gpu] | |
runs-on: 8-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Run test | |
timeout-minutes: 20 | |
run: | | |
cd test/srt | |
python3 run_suite.py --suite per-commit-8-gpu | |
performance-test-1-gpu-part-1: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 1-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Benchmark single latency | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small | |
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default | |
- name: Benchmark online latency | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default | |
- name: Benchmark offline throughput | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default | |
- name: Benchmark offline throughput (Non-streaming, small batch size) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size | |
- name: Benchmark online latency (EAGLE) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle | |
performance-test-1-gpu-part-2: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 1-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Benchmark offline throughput (w/o RadixAttention) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache | |
- name: Benchmark offline throughput (w/ Triton) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend | |
- name: Benchmark offline throughput (w/ FP8) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 | |
- name: Benchmark VLM offline throughput | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput | |
- name: Benchmark VLM online latency | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency | |
performance-test-2-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 2-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
- name: Benchmark single latency (TP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 | |
- name: Benchmark single latency + torch.compile (TP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 | |
- name: Benchmark offline throughput (TP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default | |
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache | |
- name: Benchmark offline decode throughput (PP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode | |
- name: Benchmark offline prefill throughput (PP=2) | |
timeout-minutes: 10 | |
run: | | |
cd test/srt | |
python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill | |
accuracy-test-1-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 1-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
git clone https://github.com/merrymercy/human-eval.git | |
cd human-eval | |
pip install -e . | |
- name: Evaluate accuracy | |
timeout-minutes: 20 | |
run: | | |
cd test/srt | |
python3 test_eval_accuracy_large.py | |
accuracy-test-2-gpu: | |
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && | |
github.event.pull_request.draft == false | |
runs-on: 2-gpu-runner | |
steps: | |
- name: Checkout code | |
uses: actions/checkout@v4 | |
- name: Install dependencies | |
run: | | |
bash scripts/ci_install_dependency.sh | |
git clone https://github.com/merrymercy/human-eval.git | |
cd human-eval | |
pip install -e . | |
- name: Evaluate accuracy (TP=2) | |
timeout-minutes: 20 | |
run: | | |
cd test/srt | |
python3 test_moe_eval_accuracy_large.py | |
finish: | |
if: always() | |
needs: [ | |
unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unittest-test-backend-8-gpu, | |
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, | |
accuracy-test-1-gpu, accuracy-test-2-gpu, | |
] | |
runs-on: ubuntu-latest | |
steps: | |
- name: Check all dependent job statuses | |
run: | | |
results=(${{ join(needs.*.result, ' ') }}) | |
for result in "${results[@]}"; do | |
if [ "$result" = "failure" ] || [ "$result" = "cancelled" ]; then | |
echo "Job failed with result: $result" | |
exit 1 | |
fi | |
done | |
echo "All jobs completed successfully" | |
exit 0 |