diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations.yaml
new file mode 100644
index 000000000000..0432dd84c7c6
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations.yaml
@@ -0,0 +1,388 @@
+apiVersion: config.karmada.io/v1alpha1
+kind: ResourceInterpreterCustomization
+metadata:
+  name: declarative-configuration-rayjob
+spec:
+  target:
+    apiVersion: ray.io/v1
+    kind: RayJob
+  customizations:
+    componentResource:
+      luaScript: |
+        local kube = require("kube")
+
+        local function isempty(s)
+          return s == nil or s == ''
+        end
+
+        function GetComponents(desiredObj)
+          local components = {}
+
+          -- RayJob contains a RayCluster spec
+          if desiredObj.spec == nil or desiredObj.spec.rayClusterSpec == nil then
+            return components
+          end
+
+          local clusterSpec = desiredObj.spec.rayClusterSpec
+
+          -- Head component (always 1 replica)
+          if clusterSpec.headGroupSpec ~= nil and clusterSpec.headGroupSpec.template ~= nil then
+            local headRequires = kube.accuratePodRequirements(clusterSpec.headGroupSpec.template)
+            local headComponent = {
+              name = "ray-head",
+              replicas = 1,
+              replicaRequirements = headRequires
+            }
+            table.insert(components, headComponent)
+          end
+
+          -- Worker group components
+          if clusterSpec.workerGroupSpecs ~= nil then
+            for i, workerGroup in ipairs(clusterSpec.workerGroupSpecs) do
+              local name = workerGroup.groupName
+              if isempty(name) then
+                name = "worker-" .. tostring(i)
+              end
+              local replicas = workerGroup.replicas or 0
+              local requires = nil
+              if workerGroup.template ~= nil then
+                requires = kube.accuratePodRequirements(workerGroup.template)
+              end
+              local wgComponent = {
+                name = name,
+                replicas = replicas,
+                replicaRequirements = requires
+              }
+              table.insert(components, wgComponent)
+            end
+          end
+
+          return components
+        end
+    healthInterpretation:
+      luaScript: >
+        function InterpretHealth(observedObj)
+          if observedObj.status == nil then
+            return false
+          end
+
+          local jobDeploymentStatus = observedObj.status.jobDeploymentStatus
+          local jobStatus = observedObj.status.jobStatus
+
+          -- Consider healthy if the job is running or completed successfully
+          if jobDeploymentStatus == 'Running' then
+            return true
+          end
+
+          if jobDeploymentStatus == 'Complete' and jobStatus == 'SUCCEEDED' then
+            return true
+          end
+
+          -- Unhealthy states
+          if jobDeploymentStatus == 'Failed' or jobDeploymentStatus == 'ValidationFailed' then
+            return false
+          end
+
+          if jobStatus == 'FAILED' or jobStatus == 'STOPPED' then
+            return false
+          end
+
+          -- Initializing, Waiting, Retrying, Suspending, Suspended are transitional - not fully healthy yet
+          return false
+        end
+    statusAggregation:
+      luaScript: >
+        function AggregateStatus(desiredObj, statusItems)
+          if statusItems == nil then
+            return desiredObj
+          end
+          if desiredObj.status == nil then
+            desiredObj.status = {}
+          end
+
+          -- If only one item, use it directly
+          if #statusItems == 1 then
+            desiredObj.status = statusItems[1].status
+            return desiredObj
+          end
+
+          -- Initialize aggregated values
+          local jobStatus = nil
+          local jobDeploymentStatus = nil
+          local reason = nil
+          local message = nil
+
+          -- Priority for jobDeploymentStatus (worst state wins)
+          local deploymentStatusPriority = {
+            [""] = 0,
+            ["Initializing"] = 1,
+            ["Waiting"] = 2,
+            ["Suspending"] = 3,
+            ["Suspended"] = 3,
+            ["Retrying"] = 4,
+            ["Running"] = 5,
+            ["Complete"] = 6,
["ValidationFailed"] = 7, + ["Failed"] = 8, + } + + -- Priority for jobStatus (worst/most critical state wins) + local jobStatusPriority = { + [""] = 0, + ["PENDING"] = 1, + ["RUNNING"] = 2, + ["SUCCEEDED"] = 3, + ["STOPPED"] = 4, + ["FAILED"] = 5, + } + + + local worstDeploymentPriority = -1 + local worstJobStatusPriority = -1 + local worstClusterStatePriority = -1 + + -- Aggregate status from all member clusters + for i = 1, #statusItems do + local currentStatus = statusItems[i].status + if currentStatus ~= nil then + -- Take the worst jobDeploymentStatus + if currentStatus.jobDeploymentStatus ~= nil then + local priority = deploymentStatusPriority[currentStatus.jobDeploymentStatus] or 0 + if priority > worstDeploymentPriority then + worstDeploymentPriority = priority + jobDeploymentStatus = currentStatus.jobDeploymentStatus + reason = currentStatus.reason + message = currentStatus.message + end + end + + -- Take the worst jobStatus + if currentStatus.jobStatus ~= nil and currentStatus.jobStatus ~= "" then + local priority = jobStatusPriority[currentStatus.jobStatus] or 0 + if priority > worstJobStatusPriority then + worstJobStatusPriority = priority + jobStatus = currentStatus.jobStatus + end + end + end + end + + -- Set aggregated status + desiredObj.status.jobStatus = jobStatus + desiredObj.status.jobDeploymentStatus = jobDeploymentStatus + desiredObj.status.reason = reason + desiredObj.status.message = message + return desiredObj + end + dependencyInterpretation: + luaScript: > + function GetDependencies(desiredObj) + dependentConfigMaps = {} + dependentSecrets = {} + dependentSas = {} + dependentPVCs = {} + refs = {} + local idx = 1 + + -- Helper function to extract dependencies from a pod template spec + local function extractDependenciesFromPodSpec(podSpec) + if podSpec == nil then + return + end + + -- Service account + if podSpec.serviceAccountName ~= nil and podSpec.serviceAccountName ~= '' and podSpec.serviceAccountName ~= 'default' then + dependentSas[podSpec.serviceAccountName] = true + end + + -- Image pull secrets + if podSpec.imagePullSecrets ~= nil then + for _, secretRef in pairs(podSpec.imagePullSecrets) do + if secretRef.name ~= nil and secretRef.name ~= '' then + dependentSecrets[secretRef.name] = true + end + end + end + + -- Volumes + if podSpec.volumes ~= nil then + for _, volume in pairs(podSpec.volumes) do + -- ConfigMap volumes + if volume.configMap ~= nil and volume.configMap.name ~= nil and volume.configMap.name ~= '' then + dependentConfigMaps[volume.configMap.name] = true + end + -- Secret volumes + if volume.secret ~= nil and volume.secret.secretName ~= nil and volume.secret.secretName ~= '' then + dependentSecrets[volume.secret.secretName] = true + end + -- Projected volumes + if volume.projected ~= nil and volume.projected.sources ~= nil then + for _, source in pairs(volume.projected.sources) do + if source.configMap ~= nil and source.configMap.name ~= nil and source.configMap.name ~= '' then + dependentConfigMaps[source.configMap.name] = true + end + if source.secret ~= nil and source.secret.name ~= nil and source.secret.name ~= '' then + dependentSecrets[source.secret.name] = true + end + if source.serviceAccountToken ~= nil then + -- ServiceAccount tokens don't need explicit dependency tracking + end + end + end + -- PVC volumes + if volume.persistentVolumeClaim ~= nil and volume.persistentVolumeClaim.claimName ~= nil and volume.persistentVolumeClaim.claimName ~= '' then + dependentPVCs[volume.persistentVolumeClaim.claimName] = true + end + -- Other 
+                if volume.azureFile ~= nil and volume.azureFile.secretName ~= nil and volume.azureFile.secretName ~= '' then
+                  dependentSecrets[volume.azureFile.secretName] = true
+                end
+                if volume.cephfs ~= nil and volume.cephfs.secretRef ~= nil and volume.cephfs.secretRef.name ~= nil and volume.cephfs.secretRef.name ~= '' then
+                  dependentSecrets[volume.cephfs.secretRef.name] = true
+                end
+                if volume.cinder ~= nil and volume.cinder.secretRef ~= nil and volume.cinder.secretRef.name ~= nil and volume.cinder.secretRef.name ~= '' then
+                  dependentSecrets[volume.cinder.secretRef.name] = true
+                end
+                if volume.flexVolume ~= nil and volume.flexVolume.secretRef ~= nil and volume.flexVolume.secretRef.name ~= nil and volume.flexVolume.secretRef.name ~= '' then
+                  dependentSecrets[volume.flexVolume.secretRef.name] = true
+                end
+                if volume.rbd ~= nil and volume.rbd.secretRef ~= nil and volume.rbd.secretRef.name ~= nil and volume.rbd.secretRef.name ~= '' then
+                  dependentSecrets[volume.rbd.secretRef.name] = true
+                end
+                if volume.scaleIO ~= nil and volume.scaleIO.secretRef ~= nil and volume.scaleIO.secretRef.name ~= nil and volume.scaleIO.secretRef.name ~= '' then
+                  dependentSecrets[volume.scaleIO.secretRef.name] = true
+                end
+                if volume.iscsi ~= nil and volume.iscsi.secretRef ~= nil and volume.iscsi.secretRef.name ~= nil and volume.iscsi.secretRef.name ~= '' then
+                  dependentSecrets[volume.iscsi.secretRef.name] = true
+                end
+                if volume.storageos ~= nil and volume.storageos.secretRef ~= nil and volume.storageos.secretRef.name ~= nil and volume.storageos.secretRef.name ~= '' then
+                  dependentSecrets[volume.storageos.secretRef.name] = true
+                end
+                if volume.csi ~= nil and volume.csi.nodePublishSecretRef ~= nil and volume.csi.nodePublishSecretRef.name ~= nil and volume.csi.nodePublishSecretRef.name ~= '' then
+                  dependentSecrets[volume.csi.nodePublishSecretRef.name] = true
+                end
+              end
+            end
+
+            -- Container envFrom references
+            if podSpec.containers ~= nil then
+              for _, container in pairs(podSpec.containers) do
+                if container.envFrom ~= nil then
+                  for _, envFromSource in pairs(container.envFrom) do
+                    if envFromSource.configMapRef ~= nil and envFromSource.configMapRef.name ~= nil and envFromSource.configMapRef.name ~= '' then
+                      dependentConfigMaps[envFromSource.configMapRef.name] = true
+                    end
+                    if envFromSource.secretRef ~= nil and envFromSource.secretRef.name ~= nil and envFromSource.secretRef.name ~= '' then
+                      dependentSecrets[envFromSource.secretRef.name] = true
+                    end
+                  end
+                end
+                -- Container env valueFrom references
+                if container.env ~= nil then
+                  for _, envVar in pairs(container.env) do
+                    if envVar.valueFrom ~= nil then
+                      if envVar.valueFrom.configMapKeyRef ~= nil and envVar.valueFrom.configMapKeyRef.name ~= nil and envVar.valueFrom.configMapKeyRef.name ~= '' then
+                        dependentConfigMaps[envVar.valueFrom.configMapKeyRef.name] = true
+                      end
+                      if envVar.valueFrom.secretKeyRef ~= nil and envVar.valueFrom.secretKeyRef.name ~= nil and envVar.valueFrom.secretKeyRef.name ~= '' then
+                        dependentSecrets[envVar.valueFrom.secretKeyRef.name] = true
+                      end
+                    end
+                  end
+                end
+              end
+            end
+
+            -- Init containers
+            if podSpec.initContainers ~= nil then
+              for _, container in pairs(podSpec.initContainers) do
+                if container.envFrom ~= nil then
+                  for _, envFromSource in pairs(container.envFrom) do
+                    if envFromSource.configMapRef ~= nil and envFromSource.configMapRef.name ~= nil and envFromSource.configMapRef.name ~= '' then
+                      dependentConfigMaps[envFromSource.configMapRef.name] = true
+                    end
+                    if envFromSource.secretRef ~= nil and envFromSource.secretRef.name ~= nil and envFromSource.secretRef.name ~= '' then
+                      dependentSecrets[envFromSource.secretRef.name] = true
+                    end
+                  end
+                end
+                if container.env ~= nil then
+                  for _, envVar in pairs(container.env) do
+                    if envVar.valueFrom ~= nil then
+                      if envVar.valueFrom.configMapKeyRef ~= nil and envVar.valueFrom.configMapKeyRef.name ~= nil and envVar.valueFrom.configMapKeyRef.name ~= '' then
+                        dependentConfigMaps[envVar.valueFrom.configMapKeyRef.name] = true
+                      end
+                      if envVar.valueFrom.secretKeyRef ~= nil and envVar.valueFrom.secretKeyRef.name ~= nil and envVar.valueFrom.secretKeyRef.name ~= '' then
+                        dependentSecrets[envVar.valueFrom.secretKeyRef.name] = true
+                      end
+                    end
+                  end
+                end
+              end
+            end
+          end
+
+          -- Extract dependencies from rayClusterSpec
+          if desiredObj.spec ~= nil and desiredObj.spec.rayClusterSpec ~= nil then
+            local clusterSpec = desiredObj.spec.rayClusterSpec
+
+            -- Head group
+            if clusterSpec.headGroupSpec ~= nil and clusterSpec.headGroupSpec.template ~= nil and clusterSpec.headGroupSpec.template.spec ~= nil then
+              extractDependenciesFromPodSpec(clusterSpec.headGroupSpec.template.spec)
+            end
+
+            -- Worker groups
+            if clusterSpec.workerGroupSpecs ~= nil then
+              for _, workerGroup in pairs(clusterSpec.workerGroupSpecs) do
+                if workerGroup.template ~= nil and workerGroup.template.spec ~= nil then
+                  extractDependenciesFromPodSpec(workerGroup.template.spec)
+                end
+              end
+            end
+          end
+
+          -- Extract dependencies from submitterPodTemplate
+          if desiredObj.spec ~= nil and desiredObj.spec.submitterPodTemplate ~= nil and desiredObj.spec.submitterPodTemplate.spec ~= nil then
+            extractDependenciesFromPodSpec(desiredObj.spec.submitterPodTemplate.spec)
+          end
+
+          -- Build dependency references array
+          for key, _ in pairs(dependentConfigMaps) do
+            local dependObj = {}
+            dependObj.apiVersion = 'v1'
+            dependObj.kind = 'ConfigMap'
+            dependObj.name = key
+            dependObj.namespace = desiredObj.metadata.namespace
+            refs[idx] = dependObj
+            idx = idx + 1
+          end
+          for key, _ in pairs(dependentSecrets) do
+            local dependObj = {}
+            dependObj.apiVersion = 'v1'
+            dependObj.kind = 'Secret'
+            dependObj.name = key
+            dependObj.namespace = desiredObj.metadata.namespace
+            refs[idx] = dependObj
+            idx = idx + 1
+          end
+          for key, _ in pairs(dependentSas) do
+            local dependObj = {}
+            dependObj.apiVersion = 'v1'
+            dependObj.kind = 'ServiceAccount'
+            dependObj.name = key
+            dependObj.namespace = desiredObj.metadata.namespace
+            refs[idx] = dependObj
+            idx = idx + 1
+          end
+          for key, _ in pairs(dependentPVCs) do
+            local dependObj = {}
+            dependObj.apiVersion = 'v1'
+            dependObj.kind = 'PersistentVolumeClaim'
+            dependObj.name = key
+            dependObj.namespace = desiredObj.metadata.namespace
+            refs[idx] = dependObj
+            idx = idx + 1
+          end
+
+          return refs
+        end
diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations_tests.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations_tests.yaml
new file mode 100644
index 000000000000..176eef22fe24
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/customizations_tests.yaml
@@ -0,0 +1,11 @@
+tests:
+  - desiredInputPath: testdata/desired-rayjob.yaml
+    statusInputPath: testdata/status-file.yaml
+    operation: AggregateStatus
+  - observedInputPath: testdata/observed-rayjob.yaml
+    operation: InterpretHealth
+  - observedInputPath: testdata/observed-rayjob.yaml
+    operation: InterpretComponent
+  - desiredInputPath: testdata/desired-rayjob-with-dependencies.yaml
+    operation: InterpretDependency
+
diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob-with-dependencies.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob-with-dependencies.yaml
new file mode 100644
index 000000000000..7f9590fe5ac5
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob-with-dependencies.yaml
@@ -0,0 +1,125 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: rayjob-with-dependencies
+  namespace: default
+spec:
+  entrypoint: python /home/ray/samples/sample_code.py
+  shutdownAfterJobFinishes: false
+  submissionMode: K8sJobMode
+  runtimeEnvYAML: |
+    pip:
+      - requests==2.26.0
+    env_vars:
+      counter_name: "test_counter"
+  rayClusterSpec:
+    rayVersion: '2.52.0'
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        spec:
+          serviceAccountName: ray-head-sa
+          imagePullSecrets:
+            - name: registry-secret
+          containers:
+            - name: ray-head
+              image: rayproject/ray:2.52.0
+              ports:
+                - containerPort: 6379
+                  name: gcs-server
+                - containerPort: 8265
+                  name: dashboard
+              resources:
+                limits:
+                  cpu: "1"
+                  memory: 2Gi
+              volumeMounts:
+                - mountPath: /home/ray/samples
+                  name: code-sample
+                - mountPath: /etc/config
+                  name: app-config
+                - mountPath: /etc/tls
+                  name: tls-certs
+              envFrom:
+                - configMapRef:
+                    name: ray-env-config
+                - secretRef:
+                    name: ray-env-secrets
+              env:
+                - name: API_KEY
+                  valueFrom:
+                    secretKeyRef:
+                      name: api-credentials
+                      key: api-key
+                - name: DB_CONFIG
+                  valueFrom:
+                    configMapKeyRef:
+                      name: db-config
+                      key: connection-string
+          volumes:
+            - name: code-sample
+              configMap:
+                name: ray-job-code-sample
+            - name: app-config
+              configMap:
+                name: app-config
+            - name: tls-certs
+              secret:
+                secretName: tls-certs
+            - name: projected-vol
+              projected:
+                sources:
+                  - configMap:
+                      name: projected-config
+                  - secret:
+                      name: projected-secret
+    workerGroupSpecs:
+      - replicas: 2
+        minReplicas: 1
+        maxReplicas: 5
+        groupName: small-group
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: ray-worker
+                image: rayproject/ray:2.52.0
+                resources:
+                  limits:
+                    cpu: "1"
+                    memory: 1Gi
+                volumeMounts:
+                  - mountPath: /data
+                    name: worker-data
+                  - mountPath: /cache
+                    name: worker-cache
+            initContainers:
+              - name: init-worker
+                image: busybox:latest
+                envFrom:
+                  - configMapRef:
+                      name: init-config
+            volumes:
+              - name: worker-data
+                secret:
+                  secretName: worker-data-secret
+              - name: worker-cache
+                persistentVolumeClaim:
+                  claimName: worker-cache-pvc
+  submitterPodTemplate:
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: ray-job-submitter
+          image: rayproject/ray:2.52.0
+          envFrom:
+            - secretRef:
+                name: submitter-secrets
+          volumeMounts:
+            - mountPath: /config
+              name: submitter-config
+      volumes:
+        - name: submitter-config
+          configMap:
+            name: submitter-config
+
diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob.yaml
new file mode 100644
index 000000000000..91813ffbca60
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/desired-rayjob.yaml
@@ -0,0 +1,51 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: sample-rayjob
+  namespace: default
+spec:
+  entrypoint: python /home/ray/samples/sample_code.py
+  shutdownAfterJobFinishes: true
+  ttlSecondsAfterFinished: 60
+  rayClusterSpec:
+    rayVersion: '2.46.0'
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        spec:
+          containers:
+            - name: ray-head
+              image: rayproject/ray:2.46.0
+              ports:
+                - containerPort: 6379
+                  name: gcs-server
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+              resources:
+                limits:
+                  cpu: "2"
+                  memory: 4Gi
+                requests:
+                  cpu: "2"
+                  memory: 4Gi
+    workerGroupSpecs:
+      - groupName: small-workers
+        replicas: 2
+        minReplicas: 1
+        maxReplicas: 5
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: ray-worker
+                image: rayproject/ray:2.46.0
+                resources:
+                  limits:
+                    cpu: "1"
+                    memory: 2Gi
+                  requests:
+                    cpu: "1"
+                    memory: 2Gi
+
diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/observed-rayjob.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/observed-rayjob.yaml
new file mode 100644
index 000000000000..32d5b39f36e4
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/observed-rayjob.yaml
@@ -0,0 +1,69 @@
+apiVersion: ray.io/v1
+kind: RayJob
+metadata:
+  name: sample-rayjob
+  namespace: default
+  generation: 1
+spec:
+  entrypoint: python /home/ray/samples/sample_code.py
+  shutdownAfterJobFinishes: true
+  ttlSecondsAfterFinished: 60
+  rayClusterSpec:
+    rayVersion: '2.46.0'
+    headGroupSpec:
+      rayStartParams: {}
+      template:
+        spec:
+          containers:
+            - name: ray-head
+              image: rayproject/ray:2.46.0
+              resources:
+                limits:
+                  cpu: "2"
+                  memory: 4Gi
+    workerGroupSpecs:
+      - groupName: small-workers
+        replicas: 2
+        minReplicas: 1
+        maxReplicas: 5
+        rayStartParams: {}
+        template:
+          spec:
+            containers:
+              - name: ray-worker
+                image: rayproject/ray:2.46.0
+                resources:
+                  limits:
+                    cpu: "1"
+                    memory: 2Gi
+status:
+  jobId: raysubmit_12345abcde
+  rayClusterName: sample-rayjob-raycluster-abc12
+  dashboardURL: "sample-rayjob-raycluster-abc12-head-svc.default.svc.cluster.local:8265"
+  jobStatus: RUNNING
+  jobDeploymentStatus: Running
+  startTime: "2025-11-22T10:30:00Z"
+  succeeded: 0
+  failed: 0
+  observedGeneration: 1
+  rayJobInfo:
+    startTime: "2025-11-22T10:30:15Z"
+  rayClusterStatus:
+    availableWorkerReplicas: 2
+    conditions:
+      - lastTransitionTime: "2025-11-22T10:29:30Z"
+        message: ""
+        reason: HeadPodRunningAndReady
+        status: "True"
+        type: HeadPodReady
+      - lastTransitionTime: "2025-11-22T10:29:45Z"
+        message: All Ray Pods are ready for the first time
+        reason: AllPodRunningAndReadyFirstTime
+        status: "True"
+        type: RayClusterProvisioned
+    desiredCPU: "4"
+    desiredMemory: 8Gi
+    desiredWorkerReplicas: 2
+    readyWorkerReplicas: 2
+    state: ready
+
diff --git a/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/status-file.yaml b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/status-file.yaml
new file mode 100644
index 000000000000..d9fcfd5ca573
--- /dev/null
+++ b/pkg/resourceinterpreter/default/thirdparty/resourcecustomizations/ray.io/v1/RayJob/testdata/status-file.yaml
@@ -0,0 +1,40 @@
+applied: true
+clusterName: member1
+status:
+  jobId: raysubmit_12345abcde
+  rayClusterName: sample-rayjob-raycluster-abc12
+  dashboardURL: "sample-rayjob-raycluster-abc12-head-svc.default.svc.cluster.local:8265"
+  jobStatus: RUNNING
+  jobDeploymentStatus: Running
+  startTime: "2025-11-22T10:30:00Z"
+  succeeded: 0
+  failed: 0
+  observedGeneration: 1
+  rayJobInfo:
+    startTime: "2025-11-22T10:30:15Z"
+  rayClusterStatus:
+    availableWorkerReplicas: 2
+    desiredCPU: "4"
+    desiredMemory: 8Gi
+    readyWorkerReplicas: 2
+---
+applied: true
+clusterName: member2
+status:
+  jobId: raysubmit_12345abcde
+  rayClusterName: sample-rayjob-raycluster-xyz78
+  dashboardURL: "sample-rayjob-raycluster-xyz78-head-svc.default.svc.cluster.local:8265"
+  jobStatus: RUNNING
+  jobDeploymentStatus: Running
+  startTime: "2025-11-22T10:31:00Z"
+  succeeded: 0
+  failed: 0
+  observedGeneration: 2
+  rayJobInfo:
+    startTime: "2025-11-22T10:31:20Z"
+  rayClusterStatus:
+    availableWorkerReplicas: 3
+    desiredCPU: "6"
+    desiredMemory: 12Gi
+    readyWorkerReplicas: 3
+