From c449121503074caecf2b88b6fa7983ede88455f4 Mon Sep 17 00:00:00 2001
From: Bhavya Bahl <bbahl@google.com>
Date: Fri, 14 Feb 2025 22:39:41 +0000
Subject: [PATCH] Update dockerfile to print metrics

---
 training/trillium/Llama3-70B-PyTorch/GCE/README.md      | 8 ++++----
 training/trillium/Llama3-70B-PyTorch/GCE/host.sh        | 3 +--
 training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile | 5 ++---
 3 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/training/trillium/Llama3-70B-PyTorch/GCE/README.md b/training/trillium/Llama3-70B-PyTorch/GCE/README.md
index bba9572..7333c89 100644
--- a/training/trillium/Llama3-70B-PyTorch/GCE/README.md
+++ b/training/trillium/Llama3-70B-PyTorch/GCE/README.md
@@ -28,17 +28,17 @@ gcloud alpha compute tpus tpu-vm create $TPU_NAME \
 
 The following setup runs the training job with Llama 3 70B on GCE TPUs using
 the docker image from this registry
-(`us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:jan15built`).
-The docker image uses torch and torch_xla nightly build from 09/28/2024
+(`us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:feb14build`).
+The docker image uses torch and torch_xla nightly build from 02/11/2025
 and comes with all the package dependency needed to run the model training.
 All the command below should run from your own machine (not the TPU host you
-created).
+created). The Dockerfile used to build the image is at https://github.com/AI-Hypercomputer/tpu-recipes/blob/main/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile
 
 1. git clone and navigate to this README repo and run training script:
 
 ```bash
 git clone --depth 1 https://github.com/AI-Hypercomputer/tpu-recipes.git
-cd training/trillium/GCE/Llama3-70B-PyTorch
+cd tpu-recipes/training/trillium/Llama3-70B-PyTorch/GCE
 ```
 
 2. Edit `env.sh` to add the hugging face token and/or setup the training parameters.
diff --git a/training/trillium/Llama3-70B-PyTorch/GCE/host.sh b/training/trillium/Llama3-70B-PyTorch/GCE/host.sh
index 3f2d750..9f12b05 100644
--- a/training/trillium/Llama3-70B-PyTorch/GCE/host.sh
+++ b/training/trillium/Llama3-70B-PyTorch/GCE/host.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
-DOCKER_IMAGE=us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:jan15built
-
+DOCKER_IMAGE=us-central1-docker.pkg.dev/deeplearning-images/reproducibility/pytorch-xla/llama3-70b:feb14build
 worker_id=$(curl -s "http://metadata.google.internal/computeMetadata/v1/instance/attributes/agent-worker-number" -H 'Metadata-Flavor: Google')
 
 cat >> /dev/null <<EOF
diff --git a/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile b/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile
index 2b697ca..67b016e 100644
--- a/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile
+++ b/training/trillium/Llama3-70B-PyTorch/GCE/tpu.Dockerfile
@@ -1,6 +1,5 @@
 # Base package containing nightly PyTorch/XLA
-ARG BASE_IMAGE=us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm
-FROM ${BASE_IMAGE}
+FROM us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_cxx11_20250211
 
 # Install transformers library
 ARG TRANSFORMERS_REPO=https://github.com/pytorch-tpu/transformers.git
@@ -10,7 +9,7 @@ RUN git clone "${TRANSFORMERS_REPO}" transformers && cd transformers && git chec
 
 # Install transformers dependencies
 WORKDIR /workspace/transformers
-RUN pip3 install git+file://$PWD accelerate datasets evaluate "huggingface_hub[cli]" \
+RUN pip3 install git+file://$PWD accelerate datasets protobuf evaluate "huggingface_hub[cli]" \
     "torch_xla[pallas]" \
     -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html \
     -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html