Commit 4a9fa77

update

Signed-off-by: You-Cheng Lin (Owen) <[email protected]>
1 parent 1d5f925 commit 4a9fa77

5 files changed, +398 -0 lines changed
Lines changed: 229 additions & 0 deletions
@@ -0,0 +1,229 @@
apiVersion: config.karmada.io/v1alpha1
kind: ResourceInterpreterCustomization
metadata:
  name: declarative-configuration-rayjob
spec:
  target:
    apiVersion: ray.io/v1
    kind: RayJob
  customizations:
    componentResource:
      luaScript: |
        local kube = require("kube")

        local function isempty(s)
          return s == nil or s == ''
        end

        function GetComponents(desiredObj)
          local components = {}

          -- RayJob contains a RayCluster spec
          if desiredObj.spec == nil or desiredObj.spec.rayClusterSpec == nil then
            return components
          end

          local clusterSpec = desiredObj.spec.rayClusterSpec

          -- Head component (always 1 replica)
          if clusterSpec.headGroupSpec ~= nil and clusterSpec.headGroupSpec.template ~= nil then
            local headRequires = kube.accuratePodRequirements(clusterSpec.headGroupSpec.template)
            local headComponent = {
              name = "ray-head",
              replicas = 1,
              replicaRequirements = headRequires
            }
            table.insert(components, headComponent)
          end

          -- Worker group components
          if clusterSpec.workerGroupSpecs ~= nil then
            for i, workerGroup in ipairs(clusterSpec.workerGroupSpecs) do
              local name = workerGroup.groupName
              if isempty(name) then
                name = "worker-" .. tostring(i)
              end
              local replicas = workerGroup.replicas or 0
              local requires = nil
              if workerGroup.template ~= nil then
                requires = kube.accuratePodRequirements(workerGroup.template)
              end
              local wgComponent = {
                name = name,
                replicas = replicas,
                replicaRequirements = requires
              }
              table.insert(components, wgComponent)
            end
          end

          return components
        end
    healthInterpretation:
      luaScript: >
        function InterpretHealth(observedObj)
          if observedObj.status == nil then
            return false
          end

          local jobDeploymentStatus = observedObj.status.jobDeploymentStatus
          local jobStatus = observedObj.status.jobStatus

          -- Consider healthy if job is running or completed successfully
          if jobDeploymentStatus == 'Running' then
            return true
          end

          if jobDeploymentStatus == 'Complete' and jobStatus == 'SUCCEEDED' then
            return true
          end

          -- Unhealthy states
          if jobDeploymentStatus == 'Failed' or jobDeploymentStatus == 'ValidationFailed' then
            return false
          end

          if jobStatus == 'FAILED' or jobStatus == 'STOPPED' then
            return false
          end

          -- Initializing, Waiting, Retrying, Suspending, Suspended are transitional - not fully healthy yet
          return false
        end
    statusAggregation:
      luaScript: >
        function AggregateStatus(desiredObj, statusItems)
          if statusItems == nil then
            return desiredObj
          end
          if desiredObj.status == nil then
            desiredObj.status = {}
          end

          -- If only one item, use it directly
          if #statusItems == 1 then
            desiredObj.status = statusItems[1].status
            return desiredObj
          end

          -- Initialize aggregated values
          local jobId = nil
          local rayClusterName = nil
          local dashboardURL = nil
          local jobStatus = nil
          local jobDeploymentStatus = nil
          local reason = nil
          local message = nil
          local startTime = nil
          local endTime = nil
          local succeeded = 0
          local failed = 0
          local observedGeneration = nil
          local rayJobStatusInfo = nil
          local rayClusterStatus = nil

          -- Priority for jobDeploymentStatus (worst state wins)
          local deploymentStatusPriority = {
            [""] = 0,
            ["Initializing"] = 1,
            ["Waiting"] = 2,
            ["Suspending"] = 3,
            ["Suspended"] = 3,
            ["Retrying"] = 4,
            ["Running"] = 5,
            ["Complete"] = 6,
            ["ValidationFailed"] = 7,
            ["Failed"] = 8,
          }

          local worstDeploymentPriority = -1

          -- Aggregate status from all member clusters
          for i = 1, #statusItems do
            local currentStatus = statusItems[i].status
            if currentStatus ~= nil then
              -- Take the worst jobDeploymentStatus
              if currentStatus.jobDeploymentStatus ~= nil then
                local priority = deploymentStatusPriority[currentStatus.jobDeploymentStatus] or 0
                if priority > worstDeploymentPriority then
                  worstDeploymentPriority = priority
                  jobDeploymentStatus = currentStatus.jobDeploymentStatus
                  reason = currentStatus.reason
                  message = currentStatus.message
                end
              end

              -- Take first non-nil jobStatus (should be same across clusters)
              if jobStatus == nil and currentStatus.jobStatus ~= nil and currentStatus.jobStatus ~= "" then
                jobStatus = currentStatus.jobStatus
              end

              -- Take first non-nil identifiers
              if jobId == nil and currentStatus.jobId ~= nil then
                jobId = currentStatus.jobId
              end
              if rayClusterName == nil and currentStatus.rayClusterName ~= nil then
                rayClusterName = currentStatus.rayClusterName
              end
              if dashboardURL == nil and currentStatus.dashboardURL ~= nil then
                dashboardURL = currentStatus.dashboardURL
              end

              -- Take earliest startTime
              if currentStatus.startTime ~= nil then
                if startTime == nil or currentStatus.startTime < startTime then
                  startTime = currentStatus.startTime
                end
              end

              -- Take latest endTime
              if currentStatus.endTime ~= nil then
                if endTime == nil or currentStatus.endTime > endTime then
                  endTime = currentStatus.endTime
                end
              end

              -- Sum succeeded and failed counts
              if currentStatus.succeeded ~= nil then
                succeeded = succeeded + currentStatus.succeeded
              end
              if currentStatus.failed ~= nil then
                failed = failed + currentStatus.failed
              end

              -- Take minimum observedGeneration (most conservative)
              if currentStatus.observedGeneration ~= nil then
                if observedGeneration == nil or currentStatus.observedGeneration < observedGeneration then
                  observedGeneration = currentStatus.observedGeneration
                end
              end

              -- Take first non-nil rayJobStatusInfo and rayClusterStatus
              if rayJobStatusInfo == nil and currentStatus.rayJobInfo ~= nil then
                rayJobStatusInfo = currentStatus.rayJobInfo
              end
              if rayClusterStatus == nil and currentStatus.rayClusterStatus ~= nil then
                rayClusterStatus = currentStatus.rayClusterStatus
              end
            end
          end

          -- Set aggregated status
          desiredObj.status.jobId = jobId
          desiredObj.status.rayClusterName = rayClusterName
          desiredObj.status.dashboardURL = dashboardURL
          desiredObj.status.jobStatus = jobStatus
          desiredObj.status.jobDeploymentStatus = jobDeploymentStatus
          desiredObj.status.reason = reason
          desiredObj.status.message = message
          desiredObj.status.startTime = startTime
          desiredObj.status.endTime = endTime
          desiredObj.status.succeeded = succeeded
          desiredObj.status.failed = failed
          desiredObj.status.observedGeneration = observedGeneration
          desiredObj.status.rayJobInfo = rayJobStatusInfo
          desiredObj.status.rayClusterStatus = rayClusterStatus

          return desiredObj
        end
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
tests:
  - desiredInputPath: testdata/desired-rayjob.yaml
    statusInputPath: testdata/status-file.yaml
    operation: AggregateStatus
  - observedInputPath: testdata/observed-rayjob.yaml
    operation: InterpretHealth
  - observedInputPath: testdata/observed-rayjob.yaml
    operation: InterpretComponent
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: sample-rayjob
  namespace: default
spec:
  entrypoint: python /home/ray/samples/sample_code.py
  shutdownAfterJobFinishes: true
  ttlSecondsAfterFinished: 60
  rayClusterSpec:
    rayVersion: '2.46.0'
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.46.0
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
              resources:
                limits:
                  cpu: "2"
                  memory: 4Gi
                requests:
                  cpu: "2"
                  memory: 4Gi
    workerGroupSpecs:
      - groupName: small-workers
        replicas: 2
        minReplicas: 1
        maxReplicas: 5
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray:2.46.0
                resources:
                  limits:
                    cpu: "1"
                    memory: 2Gi
                  requests:
                    cpu: "1"
                    memory: 2Gi
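For orientation (not part of the commit itself): given this sample RayJob, the componentResource script above should report two components, one ray-head replica and two small-workers replicas. The Lua sketch below spells out that expected result; the replicaRequirements/resourceRequest layout is an assumption about what kube.accuratePodRequirements returns, not something taken from this diff.

-- Illustrative sketch only: the components GetComponents is expected to report
-- for the sample RayJob above. The resourceRequest layout is an assumption
-- about kube.accuratePodRequirements, not confirmed by this commit.
local expectedComponents = {
  {
    name = "ray-head",
    replicas = 1,
    replicaRequirements = { resourceRequest = { cpu = "2", memory = "4Gi" } },
  },
  {
    name = "small-workers",
    replicas = 2,
    replicaRequirements = { resourceRequest = { cpu = "1", memory = "2Gi" } },
  },
}

for _, c in ipairs(expectedComponents) do
  print(c.name, c.replicas)  -- ray-head 1, small-workers 2
end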
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: sample-rayjob
  namespace: default
  generation: 1
spec:
  entrypoint: python /home/ray/samples/sample_code.py
  shutdownAfterJobFinishes: true
  ttlSecondsAfterFinished: 60
  rayClusterSpec:
    rayVersion: '2.46.0'
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.46.0
              resources:
                limits:
                  cpu: "2"
                  memory: 4Gi
    workerGroupSpecs:
      - groupName: small-workers
        replicas: 2
        minReplicas: 1
        maxReplicas: 5
        rayStartParams: {}
        template:
          spec:
            containers:
              - name: ray-worker
                image: rayproject/ray:2.46.0
                resources:
                  limits:
                    cpu: "1"
                    memory: 2Gi
status:
  jobId: raysubmit_12345abcde
  rayClusterName: sample-rayjob-raycluster-abc12
  dashboardURL: "sample-rayjob-raycluster-abc12-head-svc.default.svc.cluster.local:8265"
  jobStatus: RUNNING
  jobDeploymentStatus: Running
  startTime: "2025-11-22T10:30:00Z"
  succeeded: 0
  failed: 0
  observedGeneration: 1
  rayJobInfo:
    startTime: "2025-11-22T10:30:15Z"
  rayClusterStatus:
    availableWorkerReplicas: 2
    conditions:
      - lastTransitionTime: "2025-11-22T10:29:30Z"
        message: ""
        reason: HeadPodRunningAndReady
        status: "True"
        type: HeadPodReady
      - lastTransitionTime: "2025-11-22T10:29:45Z"
        message: All Ray Pods are ready for the first time
        reason: AllPodRunningAndReadyFirstTime
        status: "True"
        type: RayClusterProvisioned
    desiredCPU: "4"
    desiredMemory: 8Gi
    desiredWorkerReplicas: 2
    readyWorkerReplicas: 2
    state: ready
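As a quick sanity check of the health script against this observed object: jobDeploymentStatus is Running, so InterpretHealth should return true. The standalone Lua sketch below inlines only the healthy branches of the script above; it is an illustration, not the loaded customization itself.

-- Standalone sketch: mirrors only the "healthy" branches of InterpretHealth
-- from the customization in this commit, applied to the observed status above.
local observed = {
  status = {
    jobStatus = "RUNNING",
    jobDeploymentStatus = "Running",
  },
}

local function interpretHealthSketch(obj)
  if obj.status == nil then
    return false
  end
  -- Running, or Complete with SUCCEEDED, counts as healthy; everything else does not.
  if obj.status.jobDeploymentStatus == 'Running' then
    return true
  end
  if obj.status.jobDeploymentStatus == 'Complete' and obj.status.jobStatus == 'SUCCEEDED' then
    return true
  end
  return false
end

print(interpretHealthSketch(observed))  -- expected output: true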
