Skip to content

Commit c4f1532

Browse files
committed
tests: add test_cd_failover.bats and support
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent 7c0b313 commit c4f1532

File tree

5 files changed

+119
-1
lines changed

5 files changed

+119
-1
lines changed

tests/bats/Dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@ FROM debian:trixie
22

33
# GNU parallel: bats may want to use that
44
# gettext-base: provides envsubst, used by nickelpie
5+
# bc: used by bash wrappers, for calculations
56
RUN apt-get update && apt-get install -y -q --no-install-recommends \
6-
parallel git ca-certificates curl make gettext-base jq && \
7+
parallel git ca-certificates curl make gettext-base jq bc yq && \
78
rm -rf /var/lib/apt/lists/*
89

910
# Set by BuildKit, of the form amd64/arm64.
@@ -15,6 +16,7 @@ RUN git clone https://github.com/bats-core/bats-core.git && cd bats-core && \
1516
git checkout 658f442f0fcdd6f9e2ea01625999217e8f7bfe7d && ./install.sh /usr/local
1617

1718
RUN mkdir -p /bats-libraries
19+
# These have been patched conservatively so far; consider pinning them in the future
1820
RUN git clone https://github.com/bats-core/bats-support /bats-libraries/bats-support
1921
RUN git clone https://github.com/bats-core/bats-assert /bats-libraries/bats-assert
2022
RUN git clone https://github.com/bats-core/bats-file /bats-libraries/bats-file

tests/bats/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,5 +110,6 @@ tests: image
110110
tests/bats/test_cd_imex_chan_inject.bats \
111111
tests/bats/test_cd_mnnvl_workload.bats \
112112
tests/bats/test_cd_logging.bats \
113+
tests/bats/test_cd_failover.bats \
113114
tests/bats/test_cd_updowngrade.bats \
114115
"

tests/bats/cleanup-from-previous-run.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ timeout -v 5 kubectl delete -f demo/specs/imex/channel-injection-all.yaml 2> /de
5656
timeout -v 5 kubectl delete jobs nickelpie-test 2> /dev/null
5757
timeout -v 5 kubectl delete computedomain nickelpie-test-compute-domain 2> /dev/null
5858
timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-1.yaml 2> /dev/null
59+
timeout -v 5 kubectl delete -f demo/specs/imex/nvbandwidth-test-job-2.yaml 2> /dev/null
60+
timeout -v 5 kubectl delete -f tests/bats/specs/nvb2.yaml 2> /dev/null
5961
timeout -v 5 kubectl delete pods -l env=batssuite 2> /dev/null
6062
timeout -v 2 kubectl delete resourceclaim batssuite-rc-bad-opaque-config --force 2> /dev/null
6163

tests/bats/specs/nvb2.yaml

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
---
2+
apiVersion: resource.nvidia.com/v1beta1
3+
kind: ComputeDomain
4+
metadata:
5+
name: test-failover-cd
6+
spec:
7+
numNodes: 2
8+
channel:
9+
resourceClaimTemplate:
10+
name: test-failover-rct-channel
11+
---
12+
apiVersion: kubeflow.org/v2beta1
13+
kind: MPIJob
14+
metadata:
15+
name: test-failover-job
16+
spec:
17+
slotsPerWorker: 2
18+
launcherCreationPolicy: WaitForWorkersReady
19+
runPolicy:
20+
cleanPodPolicy: Running
21+
sshAuthMountPath: /home/mpiuser/.ssh
22+
mpiReplicaSpecs:
23+
Launcher:
24+
replicas: 1
25+
template:
26+
metadata:
27+
labels:
28+
mpi-memcpy-dra-test-replica: mpi-launcher
29+
spec:
30+
containers:
31+
- image: ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
32+
name: mpi-launcher
33+
securityContext:
34+
runAsUser: 1000
35+
command:
36+
- mpirun
37+
args:
38+
- --show-progress
39+
- --bind-to
40+
- core
41+
- --map-by
42+
- ppr:2:node
43+
- -np
44+
- "4"
45+
- --report-bindings
46+
- -q
47+
- nvbandwidth
48+
- --verbose
49+
- --testSamples
50+
- "20"
51+
- --bufferSize
52+
- "2048"
53+
- -t
54+
- multinode_device_to_device_memcpy_read_ce
55+
affinity:
56+
nodeAffinity:
57+
requiredDuringSchedulingIgnoredDuringExecution:
58+
nodeSelectorTerms:
59+
- matchExpressions:
60+
- key: node-role.kubernetes.io/control-plane
61+
operator: Exists
62+
Worker:
63+
replicas: 2
64+
template:
65+
metadata:
66+
labels:
67+
mpi-memcpy-dra-test-replica: mpi-worker
68+
spec:
69+
containers:
70+
- image: ghcr.io/nvidia/k8s-samples:nvbandwidth-6dc12f17
71+
name: mpi-worker
72+
securityContext:
73+
runAsUser: 1000
74+
env:
75+
command:
76+
- /usr/sbin/sshd
77+
args:
78+
- -De
79+
- -f
80+
- /home/mpiuser/.sshd_config
81+
resources:
82+
limits:
83+
nvidia.com/gpu: 2
84+
claims:
85+
- name: test-failover-rc-channel
86+
resourceClaims:
87+
- name: test-failover-rc-channel
88+
resourceClaimTemplateName: test-failover-rct-channel

tests/bats/test_cd_failover.bats

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# shellcheck disable=SC2148
2+
# shellcheck disable=SC2329
3+
4+
setup_file() {
5+
load 'helpers.sh'
6+
_common_setup
7+
8+
local _iargs=("--set" "logVerbosity=6")
9+
iupgrade_wait "${TEST_CHART_REPO}" "${TEST_CHART_VERSION}" _iargs
10+
}
11+
12+
13+
@test "CD failover nvb2 0" {
14+
bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 0
15+
}
16+
17+
18+
@test "CD failover nvb2 1" {
19+
bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 1
20+
}
21+
22+
23+
@test "CD failover nvb2 2" {
24+
bash tests/bats/lib/test_cd_nvb_failover.sh tests/bats/specs/nvb2.yaml 2
25+
}

0 commit comments

Comments
 (0)