|
1 | 1 | apiVersion: batch/v1 |
2 | 2 | kind: Job |
3 | 3 | metadata: |
4 | | - name: {{ JOB_NAME }} |
5 | | - labels: |
6 | | - kueue.x-k8s.io/queue-name: p5-queue |
| 4 | + name: "{{ JOB_NAME }}" |
| 6 | + labels: |
| 7 | + kueue.x-k8s.io/queue-name: p5-queue |
7 | 8 | spec: |
8 | | - # the job will run for 20 mins, as we can't set max_steps |
9 | | - activeDeadlineSeconds: 1200 |
10 | | - completions: 1 |
11 | | - parallelism: 1 |
12 | | - template: |
13 | | - spec: |
14 | | - restartPolicy: Never |
15 | | - containers: |
16 | | - - name: transformer-engine |
17 | | - image: {{ IMAGE_URI }} |
18 | | - command: |
19 | | - - bash |
20 | | - - -xo |
21 | | - - pipefail |
22 | | - - -c |
23 | | - - | |
24 | | - pip install pytest-reportlog pytest-xdist |
25 | | - # Start MPS daemon |
26 | | - nvidia-cuda-mps-control -d |
27 | | - # TE's default is slightly different, without the hyphen |
28 | | - export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} |
29 | | - # 1 GPU per worker, 6 workers per GPU |
30 | | - pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash ${TE_PATH}/qa/L0_jax_unittest/test.sh |
| 9 | + activeDeadlineSeconds: 1200 |
| 10 | + completions: 1 |
| 11 | + parallelism: 1 |
| 12 | + template: |
| 13 | + spec: |
| 14 | + restartPolicy: Never |
| 15 | + containers: |
| 16 | + - name: transformer-engine |
| 17 | + image: "{{ IMAGE_URI }}" |
| 19 | + command: |
| 20 | + - bash |
| 21 | + - '-xo' |
| 22 | + - pipefail |
| 23 | + - '-c' |
| 24 | + - > |
| 25 | + pip install pytest-reportlog pytest-xdist |
| 26 | +
|
| 27 | + # Start MPS daemon |
| 28 | +
|
| 29 | + nvidia-cuda-mps-control -d |
| 30 | +
|
| 31 | + # TE's default is slightly different, without the hyphen |
| 32 | +
|
| 33 | + export TE_PATH=${SRC_PATH_TRANSFORMER_ENGINE} |
| 34 | +
|
| 35 | + # 1 GPU per worker, 6 workers per GPU |
| 36 | +
|
| 37 | + pytest-xdist.sh 1 6 pytest-report-L0-unittest.jsonl bash |
| 38 | + ${TE_PATH}/qa/L0_jax_unittest/test.sh |
| 39 | + resources: |
| 40 | + limits: |
| 41 | + nvidia.com/gpu: 8 |
| 42 | + requests: |
| 43 | + nvidia.com/gpu: 1 |
| 44 | + volumeMounts: |
| 45 | + - name: output |
| 46 | + mountPath: /opt/output |
| 47 | + imagePullSecrets: |
| 48 | + - name: "{{ IMAGE_PULL_SECRET }}" |
| 50 | + volumes: |
| 51 | + - name: output |
| 52 | + emptyDir: {} |
31 | 53 |
|
32 | | - resources: |
33 | | - limits: |
34 | | - nvidia.com/gpu: 8 |
35 | | - requests: |
36 | | - nvidia.com/gpu: 1 |
37 | | - volumeMounts: |
38 | | - - name: output |
39 | | - mountPath: /opt/output |
40 | | - imagePullSecrets: |
41 | | - - name: {{ IMAGE_PULL_SECRET }} |
42 | | - volumes: |
43 | | - - name: output |
44 | | - emptyDir: {} |
|
0 commit comments