@@ -39,7 +39,10 @@ type ComputeDomain struct {
3939 metav1.TypeMeta `json:",inline"`
4040 metav1.ObjectMeta `json:"metadata,omitempty"`
4141
42- Spec ComputeDomainSpec `json:"spec,omitempty"`
42+ Spec ComputeDomainSpec `json:"spec,omitempty"`
43+ // Global ComputeDomain status. Can be used to guide debugging efforts.
44+ // Workloads, however, should not rely on inspecting this field at any point
45+ // during their lifecycle.
4346 Status ComputeDomainStatus `json:"status,omitempty"`
4447}
4548
@@ -57,6 +60,30 @@ type ComputeDomainList struct {
5760
5861// ComputeDomainSpec provides the spec for a ComputeDomain.
5962type ComputeDomainSpec struct {
63+ // Intended number of IMEX daemons (i.e., individual compute nodes) in the
64+ // ComputeDomain. Must be zero or greater.
65+ //
66+ // With `featureGates.IMEXDaemonsWithDNSNames=true` (the default), this is
67+ // recommended to be set to zero. Workloads must implement and consult their
68+ // own source of truth for the number of workers online before trying to
69+ // share GPU memory (and hence triggering IMEX interaction). When non-zero,
70+ // `numNodes` is used only for automatically updating the global
71+ // ComputeDomain `Status` (indicating `Ready` when the number of ready IMEX
72+ // daemons equals `numNodes`). In this mode, a `numNodes` value greater than
73+ // zero does not, in particular, gate the startup of IMEX daemons: each IMEX
74+ // daemon is started immediately without waiting for its peers, and any
75+ // workload pod is released right after its local IMEX daemon has
76+ // started.
77+ //
78+ // With `featureGates.IMEXDaemonsWithDNSNames=false`, `numNodes` must be set
79+ // to the expected number of worker nodes joining the ComputeDomain. In that
80+ // mode, all workload pods are held back (with containers in state
81+ // `ContainerCreating`) until the underlying IMEX domain has been joined by
82+ // `numNodes` IMEX daemons. Pods from more than `numNodes` nodes trying to
83+ // join the ComputeDomain may lead to unexpected behavior.
84+ //
85+ // The `numNodes` parameter is deprecated and will be removed in the next
86+ // API version.
6087 NumNodes int `json:"numNodes"`
6188 Channel * ComputeDomainChannelSpec `json:"channel"`
6289}
0 commit comments