From d20a6a0ff5192e7ea74f8449a342055e1c2c1d4b Mon Sep 17 00:00:00 2001 From: "Dr. Jan-Philip Gehrcke" Date: Wed, 24 Sep 2025 07:50:00 +0000 Subject: [PATCH] api: add numNodes spec Signed-off-by: Dr. Jan-Philip Gehrcke --- .../resource/v1beta1/computedomain.go | 29 +++++++++++++++++- .../resource.nvidia.com_computedomains.yaml | 30 ++++++++++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go index 7ffa8fd73..cc1268207 100644 --- a/api/nvidia.com/resource/v1beta1/computedomain.go +++ b/api/nvidia.com/resource/v1beta1/computedomain.go @@ -39,7 +39,10 @@ type ComputeDomain struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` - Spec ComputeDomainSpec `json:"spec,omitempty"` + Spec ComputeDomainSpec `json:"spec,omitempty"` + // Global ComputeDomain status. Can be used to guide debugging efforts. + // Workload, however, should not rely on inspecting this field at any point + // during its lifecycle. Status ComputeDomainStatus `json:"status,omitempty"` } @@ -57,6 +60,30 @@ type ComputeDomainList struct { // ComputeDomainSpec provides the spec for a ComputeDomain. type ComputeDomainSpec struct { + // Intended number of IMEX daemons (i.e., individual compute nodes) in the + // ComputeDomain. Must be zero or greater. + // + // With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + // recommended to be set to zero. Workload must implement and consult its + // own source of truth for the number of workers online before trying to + // share GPU memory (and hence triggering IMEX interaction). When non-zero, + // `numNodes` is used only for automatically updating the global + // ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + // daemons equals `numNodes`).
In this mode, a `numNodes` value greater than + // zero in particular does not gate the startup of IMEX daemons: individual + // IMEX daemons are started immediately without waiting for their peers, and + // any workload pod gets released right after its local IMEX daemon has + // started. + // + // With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + // to the expected number of worker nodes joining the ComputeDomain. In that + // mode, all workload pods are held back (with containers in state + // `ContainerCreating`) until the underlying IMEX domain has been joined by + // `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + // join the ComputeDomain may lead to unexpected behavior. + // + // The `numNodes` parameter is deprecated and will be removed in the next + // API version. NumNodes int `json:"numNodes"` Channel *ComputeDomainChannelSpec `json:"channel"` } diff --git a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml index 8cfdf4e2b..5a28ae17c 100644 --- a/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/helm/nvidia-dra-driver-gpu/crds/resource.nvidia.com_computedomains.yaml @@ -66,6 +66,31 @@ spec: - resourceClaimTemplate type: object numNodes: + description: |- + Intended number of IMEX daemons (i.e., individual compute nodes) in the + ComputeDomain. Must be zero or greater. + + With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is + recommended to be set to zero. Workload must implement and consult its + own source of truth for the number of workers online before trying to + share GPU memory (and hence triggering IMEX interaction). When non-zero, + `numNodes` is used only for automatically updating the global + ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX + daemons equals `numNodes`).
In this mode, a `numNodes` value greater than + zero in particular does not gate the startup of IMEX daemons: individual + IMEX daemons are started immediately without waiting for their peers, and + any workload pod gets released right after its local IMEX daemon has + started. + + With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set + to the expected number of worker nodes joining the ComputeDomain. In that + mode, all workload pods are held back (with containers in state + `ContainerCreating`) until the underlying IMEX domain has been joined by + `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to + join the ComputeDomain may lead to unexpected behavior. + + The `numNodes` parameter is deprecated and will be removed in the next + API version. type: integer required: - channel @@ -75,7 +100,10 @@ spec: - message: A computeDomain.spec is immutable rule: self == oldSelf status: - description: ComputeDomainStatus provides the status for a ComputeDomain. + description: |- + Global ComputeDomain status. Can be used to guide debugging efforts. + Workload, however, should not rely on inspecting this field at any point + during its lifecycle. properties: nodes: items: