Skip to content

Commit 49764c3

Browse files
authored
Merge pull request #617 from jgehrcke/jp/numnodes-spec
api: add numNodes spec
2 parents 0690792 + d20a6a0 commit 49764c3

File tree

2 files changed

+57
-2
lines changed

2 files changed

+57
-2
lines changed

api/nvidia.com/resource/v1beta1/computedomain.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ type ComputeDomain struct {
3939
metav1.TypeMeta `json:",inline"`
4040
metav1.ObjectMeta `json:"metadata,omitempty"`
4141

42-
Spec ComputeDomainSpec `json:"spec,omitempty"`
42+
Spec ComputeDomainSpec `json:"spec,omitempty"`
43+
// Global ComputeDomain status. Can be used to guide debugging efforts.
44+
// Workload, however, should not rely on inspecting this field at any point
45+
// during its lifecycle.
4346
Status ComputeDomainStatus `json:"status,omitempty"`
4447
}
4548

@@ -57,6 +60,30 @@ type ComputeDomainList struct {
5760

5861
// ComputeDomainSpec provides the spec for a ComputeDomain.
5962
type ComputeDomainSpec struct {
63+
// Intended number of IMEX daemons (i.e., individual compute nodes) in the
64+
// ComputeDomain. Must be zero or greater.
65+
//
66+
// With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is
67+
// recommended to be set to zero. Workload must implement and consult its
68+
// own source of truth for the number of workers online before trying to
69+
// share GPU memory (and hence triggering IMEX interaction). When non-zero,
70+
// `numNodes` is used only for automatically updating the global
71+
// ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX
72+
// daemons equals `numNodes`). In this mode, a `numNodes` value greater than
73+
// zero in particular does not gate the startup of IMEX daemons: individual
74+
// IMEX daemons are started immediately without waiting for their peers, and
75+
// any workload pod gets released right after its local IMEX daemon has
76+
// started.
77+
//
78+
// With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set
79+
// to the expected number of worker nodes joining the ComputeDomain. In that
80+
// mode, all workload pods are held back (with containers in state
81+
// `ContainerCreating`) until the underlying IMEX domain has been joined by
82+
// `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to
83+
// join the ComputeDomain may lead to unexpected behavior.
84+
//
85+
// The `numNodes` parameter is deprecated and will be removed in the next
86+
// API version.
6087
NumNodes int `json:"numNodes"`
6188
Channel *ComputeDomainChannelSpec `json:"channel"`
6289
}

deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,31 @@ spec:
6666
- resourceClaimTemplate
6767
type: object
6868
numNodes:
69+
description: |-
70+
Intended number of IMEX daemons (i.e., individual compute nodes) in the
71+
ComputeDomain. Must be zero or greater.
72+
73+
With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is
74+
recommended to be set to zero. Workload must implement and consult its
75+
own source of truth for the number of workers online before trying to
76+
share GPU memory (and hence triggering IMEX interaction). When non-zero,
77+
`numNodes` is used only for automatically updating the global
78+
ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX
79+
daemons equals `numNodes`). In this mode, a `numNodes` value greater than
80+
zero in particular does not gate the startup of IMEX daemons: individual
81+
IMEX daemons are started immediately without waiting for their peers, and
82+
any workload pod gets released right after its local IMEX daemon has
83+
started.
84+
85+
With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set
86+
to the expected number of worker nodes joining the ComputeDomain. In that
87+
mode, all workload pods are held back (with containers in state
88+
`ContainerCreating`) until the underlying IMEX domain has been joined by
89+
`numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to
90+
join the ComputeDomain may lead to unexpected behavior.
91+
92+
The `numNodes` parameter is deprecated and will be removed in the next
93+
API version.
6994
type: integer
7095
required:
7196
- channel
@@ -75,7 +100,10 @@ spec:
75100
- message: A computeDomain.spec is immutable
76101
rule: self == oldSelf
77102
status:
78-
description: ComputeDomainStatus provides the status for a ComputeDomain.
103+
description: |-
104+
Global ComputeDomain status. Can be used to guide debugging efforts.
105+
Workload, however, should not rely on inspecting this field at any point
106+
during its lifecycle.
79107
properties:
80108
nodes:
81109
items:

0 commit comments

Comments
 (0)