Skip to content

Commit a446607

Browse files
committed
Update transformer engine job manifest syntax
1 parent acad98a commit a446607

File tree

1 file changed

+48
-39
lines changed

1 file changed

+48
-39
lines changed

.github/eks-workflow-files/transformer-engine/unit-tests.yml

Lines changed: 48 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1,44 +1,53 @@
1 1
apiVersion: batch/v1
2 2
kind: Job
3 3
metadata:
4-
name: {{ JOB_NAME }}
5-
labels:
6-
kueue.x-k8s.io/queue-name: p5-queue
4+
name:
5+
'[object Object]': null
6+
labels:
7+
kueue.x-k8s.io/queue-name: p5-queue
7 8
spec:
8-
# the job will run for 20 mins, as we can't set max_steps
9-
activeDeadlineSeconds: 1200
10-
completions: 1
11-
parallelism: 1
12-
template:
13-
spec:
14-
restartPolicy: Never
15-
containers:
16-
- name: transformer-engine
17-
image: {{ IMAGE_URI }}
18-
command:
19-
- bash
20-
- -xo
21-
- pipefail
22-
- -c
23-
- |
24-
pip install pytest-reportlog pytest-xdist
25-
# Start MPS daemon
26-
nvidia-cuda-mps-control -d
27-
# TE's default is slightly different, without the hyphen
28-
export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
29-
# 1 GPU per worker, 6 workers per GPU
30-
pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh
9+
activeDeadlineSeconds: 1200
10+
completions: 1
11+
parallelism: 1
12+
template:
13+
spec:
14+
restartPolicy: Never
15+
containers:
16+
- name: transformer-engine
17+
image:
18+
'[object Object]': null
19+
command:
20+
- bash
21+
- '-xo'
22+
- pipefail
23+
- '-c'
24+
- >
25+
pip install pytest-reportlog pytest-xdist
26+
27+
# Start MPS daemon
28+
29+
nvidia-cuda-mps-control -d
30+
31+
# TE's default is slightly different, without the hyphen
32+
33+
export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE}
34+
35+
# 1 GPU per worker, 6 workers per GPU
36+
37+
pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash
38+
${TE_PATH}/qa/L0_jax_unittest/test.sh
39+
resources:
40+
limits:
41+
nvidia.com/gpu: 8
42+
requests:
43+
nvidia.com/gpu: 1
44+
volumeMounts:
45+
- name: output
46+
mountPath: /opt/output
47+
imagePullSecrets:
48+
- name:
49+
'[object Object]': null
50+
volumes:
51+
- name: output
52+
emptyDir: {}
31 53

32-
resources:
33-
limits:
34-
nvidia.com/gpu: 8
35-
requests:
36-
nvidia.com/gpu: 1
37-
volumeMounts:
38-
- name: output
39-
mountPath: /opt/output
40-
imagePullSecrets:
41-
- name: {{ IMAGE_PULL_SECRET }}
42-
volumes:
43-
- name: output
44-
emptyDir: {}

0 commit comments

Comments (0)