Skip to content

Commit d402d9f

Browse files
committed
api: add numNodes spec
Signed-off-by: Dr. Jan-Philip Gehrcke <[email protected]>
1 parent b332500 commit d402d9f

File tree

2 files changed

+29
-2
lines changed

2 files changed

+29
-2
lines changed

api/nvidia.com/resource/v1beta1/computedomain.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ type ComputeDomain struct {
3939
metav1.TypeMeta `json:",inline"`
4040
metav1.ObjectMeta `json:"metadata,omitempty"`
4141

42-
Spec ComputeDomainSpec `json:"spec,omitempty"`
42+
Spec ComputeDomainSpec `json:"spec,omitempty"`
43+
// Global ComputeDomain status. Updated only when `Spec.numNodes` is
44+
// non-zero. Can be used to guide debugging efforts. A workload, however, should
45+
// not rely on inspecting this field at any point during its lifecycle.
4346
Status ComputeDomainStatus `json:"status,omitempty"`
4447
}
4548

@@ -57,6 +60,16 @@ type ComputeDomainList struct {
5760

5861
// ComputeDomainSpec provides the spec for a ComputeDomain.
5962
type ComputeDomainSpec struct {
63+
// The intended number of IMEX daemons (and, hence, individual nodes) in the
64+
// ComputeDomain. Must be zero or greater. Recommended to be set to zero:
65+
// workload must implement its own source of truth about the number of
66+
// workers online before trying to share GPU memory (and hence triggering
67+
// IMEX interaction). When non-zero, `numNodes` is currently used only for
68+
// setting the global ComputeDomain `Status` (`Ready` or `NotReady`, by
69+
// keeping count of how many IMEX daemons are currently ready). In
70+
// particular, `numNodes` does not gate the startup of any IMEX daemons or
71+
// their corresponding workload pods anymore. This is a legacy field that is
72+
// going to be removed in a future API version.
6073
NumNodes int `json:"numNodes"`
6174
Channel *ComputeDomainChannelSpec `json:"channel"`
6275
}

deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,17 @@ spec:
6666
- resourceClaimTemplate
6767
type: object
6868
numNodes:
69+
description: |-
70+
The intended number of IMEX daemons (and, hence, individual nodes) in the
71+
ComputeDomain. Must be zero or greater. Recommended to be set to zero:
72+
workload must implement its own source of truth about the number of
73+
workers online before trying to share GPU memory (and hence triggering
74+
IMEX interaction). When non-zero, `numNodes` is currently used only for
75+
setting the global ComputeDomain `Status` (`Ready` or `NotReady`, by
76+
keeping count of how many IMEX daemons are currently ready). In
77+
particular, `numNodes` does not gate the startup of any IMEX daemons or
78+
their corresponding workload pods anymore. This is a legacy field that is
79+
going to be removed in a future API version.
6980
type: integer
7081
required:
7182
- channel
@@ -75,7 +86,10 @@ spec:
7586
- message: A computeDomain.spec is immutable
7687
rule: self == oldSelf
7788
status:
78-
description: ComputeDomainStatus provides the status for a ComputeDomain.
89+
description: |-
90+
Global ComputeDomain status. Updated only when `Spec.numNodes` is
91+
non-zero. Can be used to guide debugging efforts. A workload, however, should
92+
not rely on inspecting this field at any point during its lifecycle.
7993
properties:
8094
nodes:
8195
items:

0 commit comments

Comments
 (0)