Commit ec87c92 (1 parent: 757fda4)

Add E2E test
File tree: 3 files changed (+79 lines, -37 lines)

.github/workflows/e2e_test.yml

Lines changed: 23 additions & 0 deletions
@@ -22,6 +22,7 @@ jobs:
       ARTIFACT_DIR: gs://torchprime-e2e-tests/${{ github.job }}/${{ github.run_id }}-${{ github.run_attempt }}
     outputs:
       llama-3-8b-name: ${{ steps.run-llama-3-8b.outputs.name }}
+      llama-3-8b-pure-mlp-name: ${{ steps.run-llama-3-8b-pure-mlp.outputs.name }}
       llama-3_1-8b-sa-name: ${{ steps.run-llama-3_1-8b-SplashAttention.outputs.name }}
       llama-3_1-8b-scan-offload-name: ${{ steps.run-llama-3_1-8b-scan-offload.outputs.name }}
       llama-3-8b-2d-name: ${{ steps.run-llama-3-8b-2d.outputs.name }}
@@ -83,6 +84,27 @@ jobs:
           ici_mesh.fsdp=4 \
           profile_start_step=3
 
+    - name: Run Llama 3.0 8B (@assume_pure)
+      id: run-llama-3-8b-pure-mlp
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        XLA_IR_DEBUG: 1
+        XLA_HLO_DEBUG: 1
+      run: |
+        name=$(e2e_testing/gen_name.py llama-3-8b-pure-mlp)
+        echo "name=$name" >> "$GITHUB_OUTPUT"
+        tp run ${{ steps.docker-url-option.outputs.value }} \
+          --name $name \
+          torchprime/torch_xla_models/train.py \
+          model=llama-3-8b \
+          dataset=wikitext \
+          task=train \
+          task.global_batch_size=8 \
+          task.max_steps=15 \
+          ici_mesh.fsdp=4 \
+          profile_start_step=3 \
+          model.pure_modules=[LlamaMLP,EinsumLinear]
+
     - name: Run Llama 3.1 8B (Splash Attention)
       id: run-llama-3_1-8b-SplashAttention
       env:
@@ -259,6 +281,7 @@ jobs:
         jobset_name: >-
           ${{
             matrix.config.benchmark == 'llama-3-8b' && needs.tp-run.outputs.llama-3-8b-name ||
+            matrix.config.benchmark == 'llama-3-8b-pure-mlp' && needs.tp-run.outputs.llama-3-8b-pure-mlp-name ||
             matrix.config.benchmark == 'llama-3_1-8b-sa' && needs.tp-run.outputs.llama-3_1-8b-sa-name ||
             matrix.config.benchmark == 'llama-3_1-8b-scan-offload' && needs.tp-run.outputs.llama-3_1-8b-scan-offload-name ||
             matrix.config.benchmark == 'llama-3-8b-2d' && needs.tp-run.outputs.llama-3-8b-2d-name ||
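The new "Run Llama 3.0 8B (@assume_pure)" step reuses the existing Llama 3 recipe and only adds model.pure_modules=[LlamaMLP,EinsumLinear], naming the module classes to be traced via the @assume_pure path referenced in the step name. As a rough sketch of how a trainer could resolve that config entry into concrete submodules (find_pure_modules is a hypothetical helper, not torchprime's actual train.py wiring):

import torch.nn as nn

def find_pure_modules(model: nn.Module, pure_names: list[str]) -> list[nn.Module]:
  # Collect submodules whose class name appears in model.pure_modules,
  # e.g. ["LlamaMLP", "EinsumLinear"]. These are the candidates to wrap
  # so XLA can trace them as pure (side-effect-free) functions.
  return [m for m in model.modules() if type(m).__name__ in pure_names]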

e2e_testing/step_time_bounds.yaml

Lines changed: 37 additions & 37 deletions
@@ -1,54 +1,54 @@
 benchmarks:
   llama-3-8b:
     name: Llama 3.0 8B
-    step_time_lower_bound: 2.68109009
-    step_time_upper_bound: 2.789223
-    confidence_interval: 0.05407
-    average: 2.7352
-    sample_size: 427
+    step_time_lower_bound: 0.894678
+    step_time_upper_bound: 4.54563437
+    confidence_interval: 1.82548
+    average: 2.7202
+    sample_size: 114
   llama-3_1-8b-sa:
     name: Llama 3.1 8B (Splash Attention)
-    step_time_lower_bound: 2.34653077
-    step_time_upper_bound: 2.467111
-    confidence_interval: 0.06029
-    average: 2.4068
-    sample_size: 428
+    step_time_lower_bound: 2.35428493
+    step_time_upper_bound: 2.470571
+    confidence_interval: 0.05814
+    average: 2.4124
+    sample_size: 112
   llama-3_1-8b-scan-offload:
     name: Llama 3.1 8B (Scan + Offload)
-    step_time_lower_bound: 2.74099553
-    step_time_upper_bound: 2.860302
-    confidence_interval: 0.05965
-    average: 2.8006
-    sample_size: 428
+    step_time_lower_bound: 2.74872464
+    step_time_upper_bound: 2.871284
+    confidence_interval: 0.06128
+    average: 2.81
+    sample_size: 94
   llama-3-8b-2d:
     name: Llama 3.0 8B (2D sharding)
-    step_time_lower_bound: 3.28827914
-    step_time_upper_bound: 3.38842977
-    confidence_interval: 0.05008
-    average: 3.3384
-    sample_size: 428
+    step_time_lower_bound: 3.31281298
+    step_time_upper_bound: 3.41371084
+    confidence_interval: 0.05045
+    average: 3.3633
+    sample_size: 114
   mixtral-8x7b:
     name: Mixtral 8x7B
-    step_time_lower_bound: 3.09900735
-    step_time_upper_bound: 3.19339336
-    confidence_interval: 0.04719
-    average: 3.1462
-    sample_size: 427
+    step_time_lower_bound: 3.12225098
+    step_time_upper_bound: 3.21734492
+    confidence_interval: 0.04755
+    average: 3.1698
+    sample_size: 114
   llama-3-8b-2-slice:
     name: Llama 3.0 8B (2 Slice)
-    step_time_lower_bound: 3.82985294
-    step_time_upper_bound: 4.087614
-    confidence_interval: 0.12888
-    average: 3.9587
-    sample_size: 416
+    step_time_lower_bound: 3.47510115
+    step_time_upper_bound: 4.505638
+    confidence_interval: 0.51527
+    average: 3.9904
+    sample_size: 110
   llama-3-8b-ddp-fsdp:
     name: Llama 3.0 8B (ddp + fsdp)
-    step_time_lower_bound: 3.22420277
-    step_time_upper_bound: 3.351676
-    confidence_interval: 0.06374
-    average: 3.2879
-    sample_size: 47
+    step_time_lower_bound: 3.2263914
+    step_time_upper_bound: 3.341676
+    confidence_interval: 0.05764
+    average: 3.284
+    sample_size: 110
 metadata:
-  query_start: '2025-05-26T18:37:58.674556-07:00'
-  query_end: '2025-06-13T13:20:09-07:00'
+  query_start: '2025-06-12T22:37:43+00:00'
+  query_end: '2025-06-17T22:37:43+00:00'
   confidence_level: 0.999
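For reference, the regenerated bounds are consistent with step_time_lower_bound and step_time_upper_bound being average -/+ confidence_interval at the stated 0.999 confidence level. A quick sanity check in Python, using the llama-3_1-8b-sa values above:

# Bounds appear to be derived as average +/- confidence_interval.
avg, ci = 2.4124, 0.05814  # llama-3_1-8b-sa: average, confidence_interval
lower, upper = avg - ci, avg + ci
assert abs(lower - 2.35428493) < 1e-3  # step_time_lower_bound
assert abs(upper - 2.470571) < 1e-3    # step_time_upper_bound

The much wider llama-3-8b interval (confidence_interval 1.82548, up from 0.05407) lines up with the smaller sample (114 runs vs. 427) drawn from the narrower five-day query window.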

e2e_testing/update_step_time.py

Lines changed: 19 additions & 0 deletions
@@ -29,6 +29,23 @@ def match_llama3_8b(row):
     and config["dcn_mesh"]["data"] == 1
     and config["dcn_mesh"]["fsdp"] == 1
     and config["ici_mesh"]["tensor"] == 1
+    and (
+      "pure_modules" not in config["model"] or len(config["model"]["pure_modules"]) == 0
+    )
+  )
+
+
+def match_llama3_8b_pure_mlp(row):
+  config = json.loads(row.configs_framework)
+  return (
+    row.run_id.startswith("llama-3-8b-pure-mlp")
+    and config["dcn_mesh"]["data"] == 1
+    and config["dcn_mesh"]["fsdp"] == 1
+    and config["ici_mesh"]["tensor"] == 1
+    and (
+      "pure_modules" in config["model"]
+      and config["model"]["pure_modules"] == ["LlamaMLP", "EinsumLinear"]
+    )
   )
 
 
@@ -86,6 +103,7 @@ def match_llama_3_8b_ddp_fsdp(row):
 
 BENCHMARKS = {
   "Llama 3.0 8B": match_llama3_8b,
+  "Llama 3.0 8B (@assume_pure)": match_llama3_8b_pure_mlp,
   "Llama 3.1 8B (Splash Attention)": match_llama3_1_8b_sa,
   "Llama 3.1 8B (Scan + Offload)": match_llama3_1_8b_scan_offload,
   "Llama 3.0 8B (2D sharding)": match_llama3_8b_2d,
@@ -96,6 +114,7 @@ def match_llama_3_8b_ddp_fsdp(row):
 
 STEP_ID_MAPPING = {
   "Llama 3.0 8B": "llama-3-8b",
+  "Llama 3.0 8B (@assume_pure)": "llama-3-8b-pure-mlp",
  "Llama 3.1 8B (Splash Attention)": "llama-3_1-8b-sa",
  "Llama 3.1 8B (Scan + Offload)": "llama-3_1-8b-scan-offload",
  "Llama 3.0 8B (2D sharding)": "llama-3-8b-2d",
