Commit 02b6b8d

Author: maxtext authors
Merge pull request #1823 from AI-Hypercomputer:sujinesh/mcjax_long_running
PiperOrigin-RevId: 771226904
2 parents 244a071 + 1f2f59d
6 files changed (+163, -15 lines)

benchmarks/maxtext_trillium_model_configs.py

Lines changed: 3 additions & 0 deletions

@@ -33,6 +33,7 @@
     "checkpoint_storage_use_ocdbt": False,
     "checkpoint_storage_use_zarr3": False,
     "enable_pathways_goodput": True,
+    "enable_goodput_recording": True,
     "enable_single_controller": True,
     "metrics_file": "metrics.txt",
     "goodput_upload_interval_seconds": 30,
@@ -44,6 +45,7 @@
     "async_checkpointing": True,
     "checkpoint_period": 100,
     "enable_checkpoint_cloud_logger": True,
+    "enable_goodput_recording": True,
 }

 # The set of tuning params required for short-running pathways jobs.
@@ -52,6 +54,7 @@
     "async_checkpointing": True,
     "checkpoint_period": 20,
     "enable_checkpoint_cloud_logger": True,
+    "enable_goodput_recording": True,
 }

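The goodput flags above are ordinary entries in the tuning-params dicts, so the recipe scripts touched by this commit can still layer per-run overrides on top of them by assigning into model.tuning_params. A minimal sketch of that pattern, assuming the benchmarks/ directory is on sys.path the way the recipe scripts arrange it; the override values are illustrative, not part of this commit:

import maxtext_trillium_model_configs as model_configs

# One of the model configs the recipes in this commit reference.
model = model_configs.llama3_1_70b_8192_iter_synth_data_and_checkpointing

# Per-run overrides layered on top of the defaults added above (illustrative values).
model.tuning_params["enable_goodput_recording"] = True
model.tuning_params["goodput_upload_interval_seconds"] = 60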
benchmarks/maxtext_xpk_runner.py

Lines changed: 17 additions & 8 deletions

@@ -477,12 +477,19 @@ def _get_pathways_proxy_flags(wl_config: WorkloadConfig):
     flags_to_add = wl_config.model.pathways_xla_flag_options[
         xla_flags.ADD_PROXY
     ]
-    proxy_flags.append(flags_to_add)
+    flags_to_add_list = flags_to_add.strip().split()
+    proxy_flags += flags_to_add_list

   # Join the list of flags back into a single string, space-separated
   return ' '.join(proxy_flags)


+def _combine_flag_strings(base_flags: str, flags_to_add: str) -> str:
+  """Combines two flag strings and removes extraneous whitespace."""
+  all_flags = base_flags.split() + flags_to_add.split()
+  return ' '.join(all_flags)
+
+
 def _get_pathways_worker_flags(wl_config: WorkloadConfig):
   """Get the pathways worker flags for the workload and removes any extras."""
   # Add in the xla flags alongside the worker flags from the pathways config.
@@ -499,7 +506,8 @@ def _get_pathways_worker_flags(wl_config: WorkloadConfig):
     flags_to_add = wl_config.model.pathways_xla_flag_options[
         xla_flags.ADD_WORKER
     ]
-    worker_flags += flags_to_add
+
+    worker_flags = _combine_flag_strings(worker_flags, flags_to_add)

   # Join the list of flags back into a single string, space-separated
   return worker_flags
@@ -521,7 +529,7 @@ def _get_pathways_server_flags(wl_config: WorkloadConfig):
     flags_to_add = wl_config.model.pathways_xla_flag_options[
         xla_flags.ADD_SERVER
     ]
-    server_flags += flags_to_add
+    server_flags = _combine_flag_strings(server_flags, flags_to_add)

   # Join the list of flags back into a single string, space-separated
   return server_flags
@@ -581,22 +589,23 @@ def generate_xpk_workload_cmd(
       random.choice(string.ascii_lowercase + string.digits) for _ in range(length_of_random_str)
   )

-  truncate_model_name = 12
-  truncate_prefix = 5
-  common_post_fix = f"-{wl_config.num_slices}-{time.strftime('%m%d%H', time.localtime())}-{temp_post_fix}"
+  truncate_model_name = 10
+  truncate_prefix = 3
+  post_fix = f"-{wl_config.num_slices}-{time.strftime('%m%d%H', time.localtime())}-{temp_post_fix}"
   common_prefix = os.environ['USER']
   pw_prefix = "pw-"

   if workload_name is None:  # Generate name if not provided
     if is_pathways_enabled:
+      post_fix = f"-{wl_config.num_slices}-{temp_post_fix}"
       name = (
           f"{pw_prefix}{wl_config.model.model_name.replace('_', '-')[:truncate_model_name - len(pw_prefix)]}"
       )
     else:
       name = (
           f"{wl_config.model.model_name.replace('_', '-')[:truncate_model_name]}"
       )
-    name = f"{common_prefix[:truncate_prefix]}-{name}{common_post_fix}"
+    name = f"{common_prefix[:truncate_prefix]}-{name}{post_fix}"
   else:
     name = workload_name  # Use provided name

@@ -629,7 +638,7 @@ def generate_xpk_workload_cmd(
         f'--docker-image={pw_config.runner_image}'
     )
   else:
-    docker_image_flag = f'--base-docker-image="{wl_config.base_docker_image}"'
+    docker_image_flag = f'--docker-image="{wl_config.base_docker_image}"'

   upload_metrics_to_bq_cmd = ""
   if wl_config.generate_metrics_and_upload_to_big_query:
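Before this change, the worker and server paths appended the extra XLA flags with plain string concatenation (worker_flags += flags_to_add), which silently relies on the strings carrying their own separating whitespace; without it, the last base flag and the first added flag fuse into one token. The new _combine_flag_strings helper splits both strings on whitespace and rejoins them with single spaces. A standalone sketch of the difference (the helper body is copied from the diff; the flag strings are made-up placeholders):

def _combine_flag_strings(base_flags: str, flags_to_add: str) -> str:
  """Combines two flag strings and removes extraneous whitespace."""
  all_flags = base_flags.split() + flags_to_add.split()
  return ' '.join(all_flags)

# Placeholder flag strings, purely for illustration.
base = "--flag_a=1"
extra = "--flag_b=2  --flag_c=3"

print(base + extra)                        # '--flag_a=1--flag_b=2  --flag_c=3' (first two flags fused)
print(_combine_flag_strings(base, extra))  # '--flag_a=1 --flag_b=2 --flag_c=3'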

benchmarks/recipes/args_helper.py

Lines changed: 3 additions & 3 deletions

@@ -38,14 +38,14 @@ def _handle_delete(
     **kwargs: Optional keyword arguments, such as xpk_path
   """
   xpk_path = kwargs.get("xpk_path", "xpk")  # Default to "xpk" if not provided
-  first_five_chars = user[:5]
+  first_three_chars = user[:3]
   delete_command = (
       f"python3 {xpk_path}/xpk.py workload delete "
       f"--project={cluster_config.project} --cluster={cluster_config.cluster_name}"
-      f" --filter-by-job={first_five_chars} --zone={cluster_config.zone}"
+      f" --filter-by-job={first_three_chars} --zone={cluster_config.zone}"
   )
   print(
-      f"Deleting workloads starting with: {first_five_chars} using command:"
+      f"Deleting workloads starting with: {first_three_chars} using command:"
       f" {delete_command}"
   )
   os.system(delete_command)
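The five-to-three character change keeps the delete filter aligned with generate_xpk_workload_cmd above, which now prefixes generated workload names with only user[:3] (truncate_prefix = 3). A small sketch of the alignment; the user name is an illustrative placeholder:

user = "sujinesh"                      # illustrative placeholder
truncate_prefix = 3                    # new value in maxtext_xpk_runner.py (was 5)

name_prefix = user[:truncate_prefix]   # "suj": start of every generated workload name
filter_by_job = user[:3]               # "suj": what _handle_delete now passes to xpk

# Both sides agree, so `--filter-by-job=suj` matches every workload this user launched.
assert name_prefix == filter_by_job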
benchmarks/recipes/ (new file)

Lines changed: 133 additions & 0 deletions

@@ -0,0 +1,133 @@
+"""
+Copyright 2025 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     https://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import datetime
+import sys
+import os
+
+parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+sys.path.append(parent_dir)
+
+import recipes.args_helper as helper
+import maxtext_trillium_model_configs as model_configs
+import maxtext_xpk_runner as mxr
+from xpk_configs import XpkClusterConfig
+
+# Cluster Params
+CLUSTER = "v6e-256-cluster"
+PROJECT = "tpu-prod-env-cluster"
+ZONE = "us-east5-b"
+REGION = "us-east5"
+COUNTRY = "us"
+DEVICE_TYPE = "v6e-256"
+
+# Other parameters (MUST BE SET BY USER)
+XPK_PATH = os.path.join("~", "xpk")
+USER = os.environ["USER"]
+BASE_OUTPUT_DIRECTORY = (
+    f"gs://{USER}-{PROJECT}-{COUNTRY}/mcjax_long_run/"
+)
+# Generate your own runner image from MaxText repo.
+RUNNER = f"gcr.io/{PROJECT}/{USER}_latest"
+
+MAX_RESTARTS = 10_000
+BENCHMARK_STEPS = 10_000_000
+
+
+def main() -> int:
+  # V6e cluster config
+  cluster_config = XpkClusterConfig(
+      cluster_name=CLUSTER,
+      project=PROJECT,
+      zone=ZONE,
+      device_type=DEVICE_TYPE,
+  )
+
+  # Handle command line arguments using args_helper
+  should_continue = helper.handle_cmd_args(
+      cluster_config, helper.DELETE, xpk_path=XPK_PATH
+  )
+
+  if not should_continue:
+    return 0
+
+  model_list = [
+      # model_configs.llama3_1_70b_8192_pw_lr_real_data,
+      # model_configs.llama3_1_8b_8192,
+      model_configs.llama3_1_70b_8192_iter_synth_data_and_checkpointing,
+      # model_configs.llama3_1_70b_8192_iter_real_data_and_checkpointing_tfds,
+  ]
+  num_slices_list = [
+      2
+  ]
+
+  xpk_workload_cmds = []
+  xpk_workload_names = []
+
+  for model in model_list:
+    # Run workloads on the below clusters
+    for cluster_config in [
+        cluster_config,
+    ]:
+
+      # Make modifications to the model config here to add in any additional
+      # flags or changes to the model config.
+      model.tuning_params["use_vertex_tensorboard"] = True
+      model.tuning_params["vertex_tensorboard_project"] = PROJECT
+      model.tuning_params["vertex_tensorboard_region"] = REGION
+
+      # Run workloads in the following slice configurations
+      for num_slices in num_slices_list:
+        wl_config = mxr.WorkloadConfig(
+            model=model,
+            num_slices=num_slices,
+            device_type=cluster_config.device_type,
+            base_output_directory=BASE_OUTPUT_DIRECTORY,
+            max_restarts=MAX_RESTARTS,
+            libtpu_type=mxr.LibTpuType.MAXTEXT,
+            libtpu_nightly_version="",
+            base_docker_image=RUNNER,
+            xpk_path=XPK_PATH,
+            num_steps=BENCHMARK_STEPS,
+            priority="medium",
+        )
+        command, name = mxr.generate_xpk_workload_cmd(
+            cluster_config=cluster_config, wl_config=wl_config
+        )
+
+        print(f"Name of the workload is: {name} \n")
+        xpk_workload_names.append(name)
+
+        print(f"XPK command to be used is: {command} \n")
+        xpk_workload_cmds.append(command)
+
+  for xpk_workload_name, xpk_workload_cmd in zip(
+      xpk_workload_names, xpk_workload_cmds
+  ):
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print(
+        f"[{timestamp}] Running workload: {xpk_workload_name} with command:"
+        f" {xpk_workload_cmd}"
+    )
+    return_code = mxr.run_command_with_updates(
+        xpk_workload_cmd, xpk_workload_name
+    )
+    if return_code != 0:
+      print(f"Unable to run xpk workload: {xpk_workload_name}")
+
+
+if __name__ == "__main__":
+  main()

benchmarks/recipes/pw_long_running_recipe.py

Lines changed: 6 additions & 3 deletions

@@ -17,11 +17,11 @@
 import datetime
 import sys
 import os
-import args_helper as helper

 parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 sys.path.append(parent_dir)

+import recipes.args_helper as helper
 import maxtext_trillium_model_configs as model_configs
 import maxtext_xpk_runner as mxr
 from xpk_configs import XpkClusterConfig
@@ -42,7 +42,7 @@
 XPK_PATH = os.path.join("~", "xpk")  # We're running this script from the maxtext directory
 USER = os.environ["USER"]
 BASE_OUTPUT_DIRECTORY = (
-    f"gs://{USER}-{PROJECT}-{COUNTRY}/pw_mcjax_benchmarking/"
+    f"gs://{USER}-{PROJECT}-{COUNTRY}/pw_long_run/"
 )

 MAX_RESTARTS = 10_000
@@ -70,8 +70,10 @@ def main() -> int:
       # model_configs.llama3_1_70b_8192_pw_lr_real_data,
       # model_configs.llama3_1_8b_8192,
       # model_configs.llama3_1_70b_8192_iter_synth_data_and_checkpointing,
-      model_configs.llama3_1_70b_8192_iter_real_data_and_checkpointing_tfds,
+      # model_configs.llama3_1_70b_8192_iter_real_data_and_checkpointing_tfds,
+      model_configs.llama3_1_70b_8192_iter_synthetic,
   ]
+
   pathways_config = mxr.PathwaysConfig(
       server_image=SERVER_IMAGE,
       proxy_server_image=PROXY_IMAGE,
@@ -104,6 +106,7 @@ def main() -> int:
       model.tuning_params["use_vertex_tensorboard"] = True
       model.tuning_params["vertex_tensorboard_project"] = PROJECT
      model.tuning_params["vertex_tensorboard_region"] = REGION
+      model.tuning_params["profiler"] = "xplane"

       # Run workloads in the following slice configurations
       for num_slices in num_slices_list:

benchmarks/upload_metrics_to_bq.py

Lines changed: 1 addition & 1 deletion

@@ -239,7 +239,7 @@ def update_config_with_tuning_params(base_config: omegaconf.DictConfig,

 def main(argv: Sequence[str]) -> None:
   is_pathways = os.environ.get('JAX_PLATFORMS', '') == 'proxy'
-  is_mcjax_0th_worker = int(os.environ['TPU_WORKER_ID']) == 0
+  is_mcjax_0th_worker = int(os.environ.get('TPU_WORKER_ID', -1)) == 0

   # Only write once for McJAX. Pathways is single controller,
   # so only can write once.
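The old form indexed os.environ directly and raised KeyError whenever TPU_WORKER_ID was not set, which presumably happens when this script runs outside a McJAX TPU worker (for instance under Pathways, where only JAX_PLATFORMS='proxy' is checked). With .get('TPU_WORKER_ID', -1) the check simply evaluates to False. A minimal sketch of the behavioral difference, manipulating the environment locally for illustration:

import os

os.environ.pop("TPU_WORKER_ID", None)  # simulate a process where the variable is unset

try:
  old_style = int(os.environ["TPU_WORKER_ID"]) == 0          # old form
except KeyError:
  old_style = "raised KeyError"

new_style = int(os.environ.get("TPU_WORKER_ID", -1)) == 0    # new form: default of -1

print(old_style)  # raised KeyError
print(new_style)  # False: the process is simply not treated as the 0th worker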
