Skip to content

Commit 1755edb

Browse files
committed
Maxtext unit tests with Pathways backend.
Modified Pathways workflow to run specifically on Pathways runner. Final tests on Pathways. New changes to help run tests. Trial to get Pathways working. Run Pathways containers and Maxtext as part of the same job. Move installation to script. More changes. Installing docker also as part of the script. Simplified flow test. Few more changes. New way of installation. Docker compose with Maxtext and Pathways containers. Other changes to use maxtext container in docker compose YAML. Trying to merge all services together. Adding everything as services again. Directly running docker compose. Elegant solution with Pathways containers as services. Elegant solution with Pathways containers as services. Reverting to Maxtext in docker compose. Changes to run with latest JAX SS image. Run tests with correct markers. Move installation to Github runner step, simplify command. Formalize the changes. Formalize the changes. Formalize the changes. Notify on Pathways test failures, modify test script. Error handling.
1 parent 842c231 commit 1755edb

File tree

4 files changed

+187
-2
lines changed

4 files changed

+187
-2
lines changed

.github/workflows/RunTests.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,15 @@ jobs:
6767
tf_force_gpu_allow_growth: false
6868
container_resource_option: "--privileged"
6969

# Runs the non-GPU, non-integration tests in `tests` on a v4-8 TPU runner
# through the reusable Pathways workflow, once the TPU image job has finished.
tpu_pathways_unit_tests:
  needs: tpu_image
  uses: ./.github/workflows/run_pathways_tests_internal.yml
  with:
    device_type: "tpu"
    device_name: "v4-8"
    pytest_marker: "not gpu_only and not integration_test"
    test_directory: "tests"

7079
tpu_integration_tests:
7180
needs: tpu_image
7281
uses: ./.github/workflows/run_tests_internal.yml
@@ -105,7 +114,7 @@ jobs:
105114

106115
clean_up:
107116
if: ${{ always() }} # always execute, regardless of previous jobs or steps.
108-
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
117+
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_pathways_unit_tests, tpu_integration_tests]
109118
name: "Clean up"
110119
runs-on: ["self-hosted"]
111120
permissions:
@@ -119,7 +128,7 @@ jobs:
119128

120129
notify:
121130
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
122-
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
131+
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_pathways_unit_tests, tpu_integration_tests]
123132
runs-on: ["self-hosted"]
124133
steps:
125134
- name: Check whether one of the jobs failed
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This file runs unit tests with Pathways backend.
#
# Reusable workflow (workflow_call). Inputs:
#   device_type    - runner label for the accelerator type (e.g. tpu).
#   device_name    - runner label for the accelerator name (e.g. v4-8).
#   pytest_marker  - marker expression passed to `pytest -m`.
#   test_directory - path passed to pytest, relative to the MaxText directory.

name: Run Pathways Tests

on:
  workflow_call:
    inputs:
      device_type:
        required: true
        type: string
      device_name:
        required: true
        type: string
      pytest_marker:
        required: true
        type: string
      test_directory:
        required: true
        type: string

jobs:
  start_pathways:
    name: Start Pathways tests on (${{ inputs.device_name }})
    runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
    steps:
      - name: Install prerequisites on the self-hosted runner for Pathways
        run: |
          echo "Setting up the prerequisites"
          apt-get install -y docker --quiet
          apt-get install -y docker-compose-plugin --quiet
          docker compose version # To ensure docker compose is installed
          gcloud auth configure-docker us-docker.pkg.dev --quiet
      - uses: actions/checkout@v4
      - name: Start Pathways and Maxtext containers using script
        # Inputs are handed to the shell through environment variables instead
        # of being expanded into the script text, so a value containing quotes
        # or shell metacharacters cannot alter the command run on the runner.
        env:
          TEST_DIRECTORY: ${{ inputs.test_directory }}
          PYTEST_MARKER: ${{ inputs.pytest_marker }}
        run: |
          bash docker_run_pathways_containers.sh maxtext_image=gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu command="cd MaxText ; python3 -m pytest ${TEST_DIRECTORY} -m '${PYTEST_MARKER}' -s"

docker_run_pathways_containers.sh

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
#!/bin/bash

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Brings up the docker compose stack in utils_pathways, runs the given command
# in the maxtext service, and exits with that service's exit code.
#
# Arguments (key=value):
#   maxtext_image - image used for the maxtext service.
#   command       - shell command executed inside the maxtext container.
#
# Examples -
# TPU unit tests - bash docker_run_pathways_containers.sh maxtext_image=<your Maxtext image> command="cd MaxText ; python3 -m pytest tests -m 'not gpu_only and not integration_test' -s"
# Subset of unit tests - bash docker_run_pathways_containers.sh maxtext_image=<your Maxtext image> command="cd MaxText ; python3 -m pytest tests/train_tests.py -m 'not gpu_only and not integration_test' -s"

echo "Running docker_run_pathways_containers.sh"

# Stop execution if any command exits with error
set -e

# Parse input variables: each key=value argument is exported as a variable.
for ARGUMENT in "$@"; do
    IFS='=' read -r KEY VALUE <<< "$ARGUMENT"
    export "$KEY"="$VALUE"
    echo "$KEY"="$VALUE"
done

# Fail fast, before any container is started, if a required argument is missing.
: "${maxtext_image:?maxtext_image=<image> is required}"
: "${command:?command=<command to run in the maxtext container> is required}"

# Resolve utils_pathways relative to this script so the caller's working
# directory does not matter.
cd "$(dirname "${BASH_SOURCE[0]}")/utils_pathways"

# Runs on every exit (success or failure): tears the compose stack down exactly
# once and exits with the status the script was exiting with.
clean_up () {
    overall_test_exit_code=$?
    if [ "$overall_test_exit_code" -ne 0 ]; then
        echo "Tests are not successful, exit code is $overall_test_exit_code"
    fi
    # A failed teardown must not replace the test exit code under 'set -e'.
    MAXTEXT_IMAGE="${maxtext_image}" COMMAND="${command}" docker compose down \
        || echo "Warning: 'docker compose down' failed"
    exit "$overall_test_exit_code"
}
trap clean_up EXIT

# Setting up the test setup - teardown is handled by the EXIT trap above.
MAXTEXT_IMAGE="${maxtext_image}" COMMAND="${command}" docker compose up --exit-code-from maxtext

utils_pathways/docker-compose.yml

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Compose stack with three Pathways services (resource manager, worker, proxy)
# and a maxtext service that runs ${COMMAND} with JAX pointed at the proxy.
# MAXTEXT_IMAGE and COMMAND must be set in the environment of `docker compose`.

services:

  # Pathways server started as the resource manager, serving on port 29001.
  resource_manager:
    image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest
    ports:
      - "29001:29001"
      - "29002:29002"
    entrypoint:
      - /usr/pathways/run/cloud_pathways_server_sanitized
      - --server_port=29001
      - --node_type=resource_manager
      - --instance_count=1
      - --gcs_scratch_location=gs://cloud-pathways-staging/tmp
      - --instance_type=tpuv4:2x2x1
    environment:
      HOST_ADDRESS: "resource_manager"
      TPU_SKIP_MDS_QUERY: "true"


  # Pathways server on port 29005 that registers with the resource manager.
  worker:
    image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest
    ports:
      - "29005:29005"
      - "29006:29006"
      - "8471:8471"
      - "8080:8080"
    entrypoint:
      - /usr/pathways/run/cloud_pathways_server_sanitized
      - --alsologtostderr
      - --server_port=29005
      - --resource_manager_address=resource_manager:29001
      - --gcs_scratch_location=gs://cloud-pathways-staging/tmp
    privileged: true
    depends_on:
      - resource_manager


  # Pathways proxy server on port 29008; the maxtext service connects to it.
  proxy:
    image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:latest
    ports:
      - "29008:29008"
    entrypoint:
      - /usr/pathways/run/cloud_proxy_server_sanitized
      - --server_port=29008
      - --resource_manager_address=resource_manager:29001
      - --gcs_scratch_location=gs://cloud-pathways-staging/tmp
    depends_on:
      - worker
      - resource_manager


  # Test container: mounts the parent directory at /deps and runs ${COMMAND}
  # through `bash -c`.
  maxtext:
    image: "${MAXTEXT_IMAGE}"
    privileged: true
    volumes:
      - ../:/deps
    environment:
      JAX_PLATFORMS: "proxy"
      JAX_BACKEND_TARGET: "grpc://proxy:29008"
    entrypoint:
      - "bash"
      - "-c"
    command:
      - "${COMMAND}"
    depends_on:
      - proxy

0 commit comments

Comments
 (0)