Skip to content

Commit e7bde15

Browse files
committed
api: add numNodes spec
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent b332500 commit e7bde15

File tree

2 files changed

+31
-2
lines changed

2 files changed

+31
-2
lines changed

api/nvidia.com/resource/v1beta1/computedomain.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ type ComputeDomain struct {
3939
metav1.TypeMeta `json:",inline"`
4040
metav1.ObjectMeta `json:"metadata,omitempty"`
4141

42-
Spec ComputeDomainSpec `json:"spec,omitempty"`
42+
Spec ComputeDomainSpec `json:"spec,omitempty"`
43+
// Global ComputeDomain status. Updated only when `Spec.numNodes` is
44+
// non-zero. Can be used to guide debugging efforts. A workload, however, should
45+
// not rely on inspecting this field at any point during its lifecycle.
4346
Status ComputeDomainStatus `json:"status,omitempty"`
4447
}
4548

@@ -57,6 +60,17 @@ type ComputeDomainList struct {
5760

5861
// ComputeDomainSpec provides the spec for a ComputeDomain.
5962
type ComputeDomainSpec struct {
63+
// Intended number of IMEX daemons (i.e., individual compute nodes) in the
64+
// ComputeDomain. Must be zero or greater. Recommended to be set to zero:
65+
// each workload must implement and consult its own source of truth for the
66+
// number of workers online before trying to share GPU memory (and hence
67+
// triggering IMEX interaction). When non-zero, `numNodes` is used only for
68+
// setting the global ComputeDomain `Status` (indicating `Ready` when the
69+
// number of ready IMEX daemons equals `numNodes`). In particular,
70+
// `numNodes` does not gate the startup of IMEX daemons and their
71+
// corresponding workload pods anymore (to restore this behavior, set
72+
// `featureGates.IMEXDaemonsWithDNSNames=false`). This parameter is
73+
// deprecated and will be removed in the next API version.
6074
NumNodes int `json:"numNodes"`
6175
Channel *ComputeDomainChannelSpec `json:"channel"`
6276
}

deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,18 @@ spec:
6666
- resourceClaimTemplate
6767
type: object
6868
numNodes:
69+
description: |-
70+
Intended number of IMEX daemons (i.e., individual compute nodes) in the
71+
ComputeDomain. Must be zero or greater. Recommended to be set to zero:
72+
each workload must implement and consult its own source of truth for the
73+
number of workers online before trying to share GPU memory (and hence
74+
triggering IMEX interaction). When non-zero, `numNodes` is used only for
75+
setting the global ComputeDomain `Status` (indicating `Ready` when the
76+
number of ready IMEX daemons equals `numNodes`). In particular,
77+
`numNodes` does not gate the startup of IMEX daemons and their
78+
corresponding workload pods anymore (to restore this behavior, set
79+
`featureGates.IMEXDaemonsWithDNSNames=false`). This parameter is
80+
deprecated and will be removed in the next API version.
6981
type: integer
7082
required:
7183
- channel
@@ -75,7 +87,10 @@ spec:
7587
- message: A computeDomain.spec is immutable
7688
rule: self == oldSelf
7789
status:
78-
description: ComputeDomainStatus provides the status for a ComputeDomain.
90+
description: |-
91+
Global ComputeDomain status. Updated only when `Spec.numNodes` is
92+
non-zero. Can be used to guide debugging efforts. A workload, however, should
93+
not rely on inspecting this field at any point during its lifecycle.
7994
properties:
8095
nodes:
8196
items:

0 commit comments

Comments
 (0)