@@ -41,116 +41,104 @@ jobs:
41
41
needs : prelim
42
42
uses : ./.github/workflows/build_upload_internal.yml
43
43
with :
44
- device_type : tpu_pathways
44
+ device_type : tpu
45
45
device_name : v4-8
46
46
build_mode : stable_stack
47
47
base_image : us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
48
48
49
- # tpu_pathways_image:
50
- # needs: prelim
51
- # uses: ./.github/workflows/build_upload_internal.yml
52
- # with:
53
- # device_type: tpu_pathways
54
- # device_name: v4-8
55
- # build_mode: stable_stack
56
- # base_image: us-docker.pkg.dev/tpu-prod-env-multipod/jax-stable-stack/candidate/tpu:latest
57
-
58
- # gpu_image:
59
- # needs: prelim
60
- # uses: ./.github/workflows/build_upload_internal.yml
61
- # with:
62
- # device_type: gpu
63
- # device_name: a100-40gb-4
64
- # build_mode: pinned
65
- # base_image: gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:latest
66
-
67
- # tpu_unit_tests:
68
- # needs: tpu_image
69
- # uses: ./.github/workflows/run_tests_internal.yml
70
- # with:
71
- # device_type: tpu_pathways
72
- # device_name: v4-8
73
- # pytest_marker: 'not gpu_only and not integration_test'
74
- # test_directory: 'tests'
75
- # xla_python_client_mem_fraction: 0.75
76
- # tf_force_gpu_allow_growth: false
77
- # container_resource_option: "--privileged"
49
# Builds and uploads the GPU image through the shared build workflow.
# Starts once `prelim` has finished; the `with` values are handed to
# build_upload_internal.yml as inputs.
gpu_image:
  needs: prelim
  uses: ./.github/workflows/build_upload_internal.yml
  with:
    device_type: gpu
    device_name: a100-40gb-4
    build_mode: pinned
    base_image: gcr.io/tpu-prod-env-multipod/maxtext_gpu_jax_pinned:latest
57
+
58
# TPU unit tests: calls the shared test workflow after `tpu_image` has
# finished. The `with` values are inputs to run_tests_internal.yml.
tpu_unit_tests:
  needs: tpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: tpu
    device_name: v4-8
    # Quoted values must not start with a space: the string is passed through
    # verbatim, so ' tests' would name a different path than 'tests'.
    pytest_marker: 'not gpu_only and not integration_test'
    test_directory: 'tests'
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    container_resource_option: "--privileged"
78
69
79
70
# Pathways variant of the TPU unit tests: calls run_pathways_tests_internal.yml
# after `tpu_image` has finished, with the same marker and test directory as
# the regular TPU unit tests.
tpu_pathways_unit_tests:
  needs: [tpu_image]
  uses: ./.github/workflows/run_pathways_tests_internal.yml
  with:
    device_type: tpu
    device_name: v4-8
    # No leading space inside the quotes: the value is passed through verbatim.
    pytest_marker: 'not gpu_only and not integration_test'
    test_directory: 'tests'
    # The inputs below are intentionally left disabled for this job.
    # xla_python_client_mem_fraction: 0.75
    # tf_force_gpu_allow_growth: false
    # container_resource_option: "--privileged"
90
81
91
- # tpu_integration_tests:
92
- # needs: tpu_image
93
- # uses: ./.github/workflows/run_tests_internal.yml
94
- # with:
95
- # device_type: tpu
96
- # device_name: v4-8
97
- # pytest_marker: 'not gpu_only and integration_test'
98
- # test_directory: 'tests/integration_tests'
99
- # xla_python_client_mem_fraction: 0.75
100
- # tf_force_gpu_allow_growth: false
101
- # container_resource_option: "--privileged"
102
-
103
-
104
- # gpu_unit_tests:
105
- # needs: gpu_image
106
- # uses: ./.github/workflows/run_tests_internal.yml
107
- # with:
108
- # device_type: gpu
109
- # device_name: a100-40gb-4
110
- # pytest_marker: 'not tpu_only and not integration_test'
111
- # test_directory: 'tests'
112
- # xla_python_client_mem_fraction: 0.65
113
- # tf_force_gpu_allow_growth: true
114
- # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
82
# TPU integration tests: calls the shared test workflow after `tpu_image` has
# finished, selecting tests marked integration_test under
# tests/integration_tests.
tpu_integration_tests:
  needs: tpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: tpu
    device_name: v4-8
    # No leading space inside the quotes: the value is passed through verbatim,
    # so ' tests/integration_tests' would name a different path.
    pytest_marker: 'not gpu_only and integration_test'
    test_directory: 'tests/integration_tests'
    xla_python_client_mem_fraction: 0.75
    tf_force_gpu_allow_growth: false
    container_resource_option: "--privileged"
93
+
94
# GPU unit tests: calls the shared test workflow after `gpu_image` has
# finished. The `with` values are inputs to run_tests_internal.yml.
gpu_unit_tests:
  needs: gpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: gpu
    device_name: a100-40gb-4
    # No leading space inside the quotes: the value is passed through verbatim.
    pytest_marker: 'not tpu_only and not integration_test'
    test_directory: 'tests'
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
    container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
115
105
116
- # gpu_integration_tests:
117
- # needs: gpu_image
118
- # uses: ./.github/workflows/run_tests_internal.yml
119
- # with:
120
- # device_type: gpu
121
- # device_name: a100-40gb-4
122
- # pytest_marker: 'not tpu_only and integration_test'
123
- # test_directory: 'tests/integration_tests'
124
- # xla_python_client_mem_fraction: 0.65
125
- # tf_force_gpu_allow_growth: true
126
- # container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
106
# GPU integration tests: calls the shared test workflow after `gpu_image` has
# finished, selecting tests marked integration_test under
# tests/integration_tests.
gpu_integration_tests:
  needs: gpu_image
  uses: ./.github/workflows/run_tests_internal.yml
  with:
    device_type: gpu
    device_name: a100-40gb-4
    # No leading space inside the quotes: the value is passed through verbatim.
    pytest_marker: 'not tpu_only and integration_test'
    test_directory: 'tests/integration_tests'
    xla_python_client_mem_fraction: 0.65
    tf_force_gpu_allow_growth: true
    container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
127
117
128
118
# Deletes the per-run GPU and TPU images (tagged with this run's id) once every
# test job listed in `needs` has finished, whatever their outcome.
clean_up:
  if: ${{ always() }}  # always execute, regardless of previous jobs or steps.
  needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_pathways_unit_tests, tpu_integration_tests]
  name: "Clean up"
  runs-on: ["self-hosted"]
  permissions:
    contents: read
    issues: write  # for failed-build-issue
  steps:
    - name: Delete GPU image
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
    - name: Delete TPU image
      # Run even when the GPU deletion step failed; otherwise a single failed
      # delete would skip this step and leave the TPU image behind.
      if: ${{ always() }}
      run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
142
131
143
-
144
- # notify:
145
- # name: Notify failed build # creates an issue or modifies last open existing issue for failed build
146
- # needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
147
- # runs-on: ["self-hosted"]
148
- # steps:
149
- # - name: Check whether one of the jobs failed
150
- # if: ${{ failure() && github.event.pull_request == null }}
151
- # uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b # v1.2.0
152
- # with:
153
- # github-token: ${{ secrets.GITHUB_TOKEN }}
154
- # - name: Log message if dependent job succeeded
155
- # if: ${{ ! (failure() && github.event.pull_request == null) }}
156
- # run: echo "Conditions for creating/updating issue not met. Skipping."
132
# Opens (or updates) a "failed build" issue when one of the jobs in `needs`
# failed on a non-pull-request run; otherwise just logs that nothing was done.
#
# A job whose dependency fails is skipped by default, so the job-level `if`
# below is required for the failure branch to ever be reached. The steps then
# inspect `needs.*.result` directly rather than relying on `failure()`.
#
# NOTE(review): tpu_pathways_unit_tests is not listed in `needs`, so its result
# is not considered here. Confirm that is intended.
notify:
  name: Notify failed build  # creates an issue or modifies last open existing issue for failed build
  needs: [gpu_unit_tests, gpu_integration_tests, tpu_unit_tests, tpu_integration_tests]
  # Keep running after a failed dependency, but not when the run was cancelled.
  if: ${{ !cancelled() }}
  runs-on: ["self-hosted"]
  steps:
    - name: Check whether one of the jobs failed
      if: ${{ contains(needs.*.result, 'failure') && github.event.pull_request == null }}
      uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b  # v1.2.0
      with:
        github-token: ${{ secrets.GITHUB_TOKEN }}
    - name: Log message if dependent job succeeded
      if: ${{ !(contains(needs.*.result, 'failure') && github.event.pull_request == null) }}
      run: echo "Conditions for creating/updating issue not met. Skipping."
0 commit comments