Skip to content

Commit 6283883

Browse files
committed
Formalize the changes.
1 parent dd9268e commit 6283883

10 files changed

+86
-185
lines changed

.github/workflows/RunTests.yml

Lines changed: 73 additions & 85 deletions
Original file line number | Diff line number | Diff line change
@@ -41,116 +41,104 @@ jobs:
4141
needs: prelim
4242
uses: ./.github/workflows/build_upload_internal.yml
4343
with:
44-
device_type: tpu_pathways
44+
device_type: tpu
4545
device_name: v4-8
4646
build_mode: stable_stack
4747
base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
4848

49-
# tpu_pathways_image:
50-
# needs: prelim
51-
# uses: ./.github/workflows/build_upload_internal.yml
52-
# with:
53-
# device_type: tpu_pathways
54-
# device_name: v4-8
55-
# build_mode: stable_stack
56-
# base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
57-
58-
# gpu_image:
59-
# needs: prelim
60-
# uses: ./.github/workflows/build_upload_internal.yml
61-
# with:
62-
# device_type: gpu
63-
# device_name: a100-40gb-4
64-
# build_mode: pinned
65-
# base_image: gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:latest
66-
67-
# tpu_unit_tests:
68-
# needs: tpu_image
69-
# uses: ./.github/workflows/run_tests_internal.yml
70-
# with:
71-
# device_type: tpu_pathways
72-
# device_name: v4-8
73-
# pytest_marker: 'not gpu_only and not integration_test'
74-
# test_directory: 'tests'
75-
# xla_python_client_mem_fraction: 0.75
76-
# tf_force_gpu_allow_growth: false
77-
# container_resource_option: "--privileged"
49+
gpu_image:
50+
needs: prelim
51+
uses: ./.github/workflows/build_upload_internal.yml
52+
with:
53+
device_type: gpu
54+
device_name: a100-40gb-4
55+
build_mode: pinned
56+
base_image: gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:latest
57+
58+
tpu_unit_tests:
59+
needs: tpu_image
60+
uses: ./.github/workflows/run_tests_internal.yml
61+
with:
62+
device_type: tpu
63+
device_name: v4-8
64+
pytest_marker: 'not gpu_only and not integration_test'
65+
test_directory: 'tests'
66+
xla_python_client_mem_fraction: 0.75
67+
tf_force_gpu_allow_growth: false
68+
container_resource_option: "--privileged"
7869

7970
tpu_pathways_unit_tests:
8071
needs: [tpu_image]
8172
uses: ./.github/workflows/run_pathways_tests_internal.yml
8273
with:
83-
device_type: tpu_pathways
74+
device_type: tpu
8475
device_name: v4-8
8576
pytest_marker: 'not gpu_only and not integration_test'
8677
test_directory: 'tests'
8778
# xla_python_client_mem_fraction: 0.75
8879
# tf_force_gpu_allow_growth: false
8980
# container_resource_option: "--privileged"
9081

91-
# tpu_integration_tests:
92-
# needs: tpu_image
93-
# uses: ./.github/workflows/run_tests_internal.yml
94-
# with:
95-
# device_type: tpu
96-
# device_name: v4-8
97-
# pytest_marker: 'not gpu_only and integration_test'
98-
# test_directory: 'tests/integration_tests'
99-
# xla_python_client_mem_fraction: 0.75
100-
# tf_force_gpu_allow_growth: false
101-
# container_resource_option: "--privileged"
102-
103-
104-
# gpu_unit_tests:
105-
# needs: gpu_image
106-
# uses: ./.github/workflows/run_tests_internal.yml
107-
# with:
108-
# device_type: gpu
109-
# device_name: a100-40gb-4
110-
# pytest_marker: 'not tpu_only and not integration_test'
111-
# test_directory: 'tests'
112-
# xla_python_client_mem_fraction: 0.65
113-
# tf_force_gpu_allow_growth: true
114-
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
82+
tpu_integration_tests:
83+
needs: tpu_image
84+
uses: ./.github/workflows/run_tests_internal.yml
85+
with:
86+
device_type: tpu
87+
device_name: v4-8
88+
pytest_marker: 'not gpu_only and integration_test'
89+
test_directory: 'tests/integration_tests'
90+
xla_python_client_mem_fraction: 0.75
91+
tf_force_gpu_allow_growth: false
92+
container_resource_option: "--privileged"
93+
94+
gpu_unit_tests:
95+
needs: gpu_image
96+
uses: ./.github/workflows/run_tests_internal.yml
97+
with:
98+
device_type: gpu
99+
device_name: a100-40gb-4
100+
pytest_marker: 'not tpu_only and not integration_test'
101+
test_directory: 'tests'
102+
xla_python_client_mem_fraction: 0.65
103+
tf_force_gpu_allow_growth: true
104+
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
115105

116-
# gpu_integration_tests:
117-
# needs: gpu_image
118-
# uses: ./.github/workflows/run_tests_internal.yml
119-
# with:
120-
# device_type: gpu
121-
# device_name: a100-40gb-4
122-
# pytest_marker: 'not tpu_only and integration_test'
123-
# test_directory: 'tests/integration_tests'
124-
# xla_python_client_mem_fraction: 0.65
125-
# tf_force_gpu_allow_growth: true
126-
# container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
106+
gpu_integration_tests:
107+
needs: gpu_image
108+
uses: ./.github/workflows/run_tests_internal.yml
109+
with:
110+
device_type: gpu
111+
device_name: a100-40gb-4
112+
pytest_marker: 'not tpu_only and integration_test'
113+
test_directory: 'tests/integration_tests'
114+
xla_python_client_mem_fraction: 0.65
115+
tf_force_gpu_allow_growth: true
116+
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
127117

128118
clean_up:
129119
if: ${{ always() }} # always execute, regardless of previous jobs or steps.
130-
# needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_pathways_unit_tests, tpu_integration_tests]
131-
needs: [tpu_pathways_unit_tests]
120+
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_pathways_unit_tests, tpu_integration_tests]
132121
name: "Clean up"
133-
runs-on: ["self-hosted", "tpu_pathways" ]
122+
runs-on: ["self-hosted"]
134123
permissions:
135124
contents: read
136125
issues: write # for failed-build-issue
137126
steps:
138-
# - name: Delete GPU image
139-
# run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
127+
- name: Delete GPU image
128+
run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
140129
- name: Delete TPU image
141130
run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
142131

143-
144-
# notify:
145-
# name: Notify failed build # creates an issue or modifies last open existing issue for failed build
146-
# needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
147-
# runs-on: ["self-hosted"]
148-
# steps:
149-
# - name: Check whether one of the jobs failed
150-
# if: ${{ failure() && github.event.pull_request == null }}
151-
# uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
152-
# with:
153-
# github-token: ${{ secrets.GITHUB_TOKEN }}
154-
# - name: Log message if dependent job succeeded
155-
# if: ${{ ! (failure() && github.event.pull_request == null) }}
156-
# run: echo "Conditions for creating/updating issue not met. Skipping."
132+
notify:
133+
name: Notify failed build # creates an issue or modifies last open existing issue for failed build
134+
needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
135+
runs-on: ["self-hosted"]
136+
steps:
137+
- name: Check whether one of the jobs failed
138+
if: ${{ failure() && github.event.pull_request == null }}
139+
uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
140+
with:
141+
github-token: ${{ secrets.GITHUB_TOKEN }}
142+
- name: Log message if dependent job succeeded
143+
if: ${{ ! (failure() && github.event.pull_request == null) }}
144+
run: echo "Conditions for creating/updating issue not met. Skipping."

.github/workflows/build_upload_internal.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,11 @@ jobs:
4040
- uses: actions/checkout@v4
4141
- name: Build an image
4242
run: |
43-
bash docker_build_dependency_image.sh MODE=${{ inputs.build_mode }} DEVICE=tpu BASEIMAGE=${{ inputs.base_image }}
43+
bash docker_build_dependency_image.sh MODE=${{ inputs.build_mode }} DEVICE=${{ inputs.device_type }} BASEIMAGE=${{ inputs.base_image }}
4444
- name: Tag the image
4545
run: |
46-
docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
46+
docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
4747
- name: Upload the image
4848
run: |
49-
docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
49+
docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
5050

.github/workflows/run_pathways_tests_internal.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ jobs:
3737
name: Start Pathways tests on (${{ inputs.device_name }})
3838
runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
3939
steps:
40-
- name: Install prerequisites on the self-hosted runner for Pathways.
40+
- name: Install prerequisites on the self-hosted runner for Pathways
4141
run: |
4242
echo "Setting up the prerequisites"
4343
apt-get install docker
@@ -47,4 +47,4 @@ jobs:
4747
- uses: actions/checkout@v4
4848
- name: Start Pathways and Maxtext containers using script
4949
run: |
50-
bash docker_run_pathways_containers.sh maxtext_image=gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu command="cd MaxText ; python3 -m pytest ${{ inputs.test_directory }} -m '${{ inputs.pytest_marker }}' -s"
50+
bash docker_run_pathways_containers.sh maxtext_image=gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu command="cd MaxText ; python3 -m pytest ${{ inputs.test_directory }} -m '${{ inputs.pytest_marker }}' -s"

.github/workflows/run_tests_internal.yml

Lines changed: 1 addition & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -45,48 +45,16 @@ jobs:
4545
run:
4646
runs-on: ["self-hosted", "${{ inputs.device_type }}", "${{ inputs.device_name }}"]
4747
container:
48-
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
48+
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ inputs.device_type }}
4949
volumes:
5050
- /home/runner/actions-runner/_work/maxtext/maxtext:/deps
5151
env:
5252
XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ inputs.xla_python_client_mem_fraction }}
5353
TF_FORCE_GPU_ALLOW_GROWTH: ${{ inputs.tf_force_gpu_allow_growth }}
54-
JAX_PLATFORMS: "proxy"
55-
JAX_BACKEND_TARGET: "grpc://proxy:29008"
5654
options: ${{ inputs.container_resource_option }}
5755
steps:
5856
- uses: actions/checkout@v4
5957
- name: Run Tests
6058
run: |
6159
cd MaxText
6260
python3 -m pytest ${{ inputs.test_directory }} -m "${{ inputs.pytest_marker }}"
63-
64-
services:
65-
resource_manager:
66-
image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest
67-
ports:
68-
- "29001:29001"
69-
- "29002:29002"
70-
options:
71-
--entrypoint=[/usr/pathways/run/cloud_pathways_server_sanitized, --server_port=29001, --node_type=resource_manager, --instance_count=1, --instance_type=tpuv4:2x2x1, --gcs_scratch_location=gs://cloud-pathways-staging/tmp]
72-
env:
73-
HOST_ADDRESS: resource_manager
74-
TPU_SKIP_MDS_QUERY: true
75-
76-
worker:
77-
image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:latest
78-
ports:
79-
- "29005:29005"
80-
- "29006:29006"
81-
- "8471:8471"
82-
- "8080:8080"
83-
options:
84-
--privileged
85-
--entrypoint=[/usr/pathways/run/cloud_pathways_server_sanitized, --server_port=29005, --resource_manager_address=resource_manager:29001, --gcs_scratch_location=gs://cloud-pathways-staging/tmp]
86-
87-
proxy:
88-
image: us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:latest
89-
ports:
90-
- "29008:29008"
91-
options:
92-
--entrypoint=[/usr/pathways/run/cloud_proxy_server_sanitized, --server_port=29000, --resource_manager_address=resource_manager:29001, --gcs_scratch_location=gs://cloud-pathways-staging/tmp]

MaxText/tests/hf_data_processing_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,4 +106,4 @@ def get_first_batch(iterator):
106106

107107

108108
if __name__ == "__main__":
109-
unittest.main()
109+
unittest.main()

MaxText/tests/train_tests.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,8 +166,9 @@ def test_tpu_dropout(self):
166166
def test_gpu_dropout(self):
167167
train_main(TrainTests.CONFIGS["dropout"] + ["attention=dot_product"])
168168

169-
# TODO (b/393393501) : MaxText build failure in hf_data_processing_test.py
170-
@pytest.mark.skip(reason="Tests are currently flaking / failing due to HF token issues")
169+
# # TODO (b/393393501) : MaxText build failure in hf_data_processing_test.py
170+
# @pytest.mark.skip(reason="Tests are currently flaking / failing due to HF token issues")
171+
@pytest.mark.tpu_only
171172
def test_tpu_hf_input_pipeline(self):
172173
train_main(TrainTests.CONFIGS["hf_input_pipeline"])
173174

docker_run_pathways_containers.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
# Example - bash docker_run_pathways_containers.sh maxtext_image=us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest command="cd MaxText ; python3 -m pytest tests -m 'not gpu_only and not integration_test' -s"
17+
# Example - bash docker_run_pathways_containers.sh maxtext_image=us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest command="cd MaxText ; python3 -m pytest tests -m 'not gpu_only and not integration_test' -s"
1818

1919
# Stop execution if any command exits with error
2020

@@ -23,7 +23,7 @@ echo "Running docker_run_pathways_containers.sh"
2323

2424
set -e
2525

26-
# Defaults -
26+
# Defaults -
2727
maxtext_image=us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
2828
command="cd MaxText ; python3 -m pytest tests -m 'not gpu_only and not integration_test' -s"
2929

utils_pathways/docker-compose-successful.yml

Lines changed: 0 additions & 49 deletions
This file was deleted.

utils_pathways/docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ services:
6666
maxtext:
6767
image: ${MAXTEXT_IMAGE}
6868
privileged: true
69-
volumes:
69+
volumes:
7070
- ../:/deps
7171
environment:
7272
- JAX_PLATFORMS=proxy

utils_pathways/pathways_server.Dockerfile

Lines changed: 0 additions & 7 deletions
This file was deleted.

0 commit comments

Comments
 (0)