Skip to content

Commit ebd956d

Browse files
authored
Merge pull request #6766 from owenowenisme/lfx/add-intepreter-for-ray-cluster
Add custom interpreter for RayCluster
2 parents 9734f66 + 6fa5a24 commit ebd956d

File tree

5 files changed

+443
-0
lines changed

5 files changed

+443
-0
lines changed
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
apiVersion: config.karmada.io/v1alpha1
2+
kind: ResourceInterpreterCustomization
3+
metadata:
4+
name: declarative-configuration-raycluster
5+
spec:
6+
target:
7+
apiVersion: ray.io/v1
8+
kind: RayCluster
9+
customizations:
10+
componentResource:
11+
luaScript: |
12+
local kube = require("kube")

-- True when s is nil or the empty string.
local function isempty(s)
  return s == nil or s == ''
end

-- Builds the component list for a RayCluster: one "ray-head" component plus
-- one component per worker group. Per-replica resource requirements are
-- computed from the pod template by the interpreter-provided kube helper.
function GetComponents(desiredObj)
  local components = {}
  local spec = desiredObj.spec

  -- The head group always contributes exactly one replica.
  if spec ~= nil and spec.headGroupSpec ~= nil and spec.headGroupSpec.template ~= nil then
    components[#components + 1] = {
      name = "ray-head",
      replicas = 1,
      replicaRequirements = kube.accuratePodRequirements(spec.headGroupSpec.template),
    }
  end

  -- One component per worker group; fall back to a positional name when
  -- groupName is absent or empty.
  if spec ~= nil and spec.workerGroupSpecs ~= nil then
    for idx, group in ipairs(spec.workerGroupSpecs) do
      local groupName = group.groupName
      if isempty(groupName) then
        groupName = "worker-" .. tostring(idx)
      end
      local requirements = nil
      if group.template ~= nil then
        requirements = kube.accuratePodRequirements(group.template)
      end
      components[#components + 1] = {
        name = groupName,
        replicas = group.replicas or 0,
        replicaRequirements = requirements,
      }
    end
  end

  return components
end
55+
healthInterpretation:
56+
luaScript: >
57+
-- Health check for a RayCluster: healthy when the head pod is ready and the
-- cluster is provisioned, and no replica-failure condition reports 'True'.
-- Returns false when the status or its conditions list is missing.
function InterpretHealth(observedObj)
  local status = observedObj.status
  if status == nil or status.conditions == nil then
    return false
  end

  local headReady = false
  local provisioned = false
  local failed = false

  for _, cond in ipairs(status.conditions) do
    local isTrue = (cond.status == 'True')
    if cond.type == 'HeadPodReady' then
      headReady = headReady or isTrue
    elseif cond.type == 'RayClusterProvisioned' then
      provisioned = provisioned or isTrue
    elseif cond.type == 'RayClusterReplicaFailure' then
      failed = failed or isTrue
    end
  end

  return headReady and provisioned and not failed
end
78+
statusAggregation:
79+
luaScript: >
80+
-- Aggregates RayCluster status collected from member clusters into the
-- federated object's status:
--   * conditions from every cluster are concatenated
--   * numeric replica counters are summed
--   * resource quantities (desiredCPU/GPU/TPU, numeric strings) are summed
--   * desiredMemory sums the numeric prefix and re-appends a "G" suffix
--   * endpoints/head/state come from the first cluster reporting them
-- Returns desiredObj with its .status replaced by the aggregate.
function AggregateStatus(desiredObj, statusItems)
  if statusItems == nil then
    return desiredObj
  end
  if desiredObj.status == nil then
    desiredObj.status = {}
  end

  -- Single member cluster: adopt its status verbatim.
  if #statusItems == 1 then
    desiredObj.status = statusItems[1].status
    return desiredObj
  end

  local conditions = {}
  local readyWorkerReplicas = 0
  local availableWorkerReplicas = 0
  local maxWorkerReplicas = 0
  local minWorkerReplicas = 0
  local desiredWorkerReplicas = 0
  local desiredCPU = 0
  local desiredGPU = 0
  local desiredMemory = 0
  local desiredTPU = 0
  local lastUpdateTime = nil
  local endpoints = nil
  local head = nil
  local state = nil
  local stateTransitionTimes = nil

  -- Parse a quantity stored as a numeric string ("2", "0", ...).
  -- Fix: tonumber() returns nil for non-plain-number quantities such as
  -- "500m", and nil + number raises a runtime error that would abort the
  -- whole aggregation; treat unparseable values as 0 instead.
  local function quantity(v)
    if v == nil or v == "" then
      return 0
    end
    return tonumber(v) or 0
  end

  -- Aggregate status from all member clusters.
  for i = 1, #statusItems do
    local s = statusItems[i].status
    if s ~= nil then
      -- Merge conditions from all clusters.
      if s.conditions ~= nil then
        for _, condition in ipairs(s.conditions) do
          table.insert(conditions, condition)
        end
      end

      -- Sum numeric replica counters (missing fields count as 0).
      readyWorkerReplicas = readyWorkerReplicas + (s.readyWorkerReplicas or 0)
      availableWorkerReplicas = availableWorkerReplicas + (s.availableWorkerReplicas or 0)
      maxWorkerReplicas = maxWorkerReplicas + (s.maxWorkerReplicas or 0)
      minWorkerReplicas = minWorkerReplicas + (s.minWorkerReplicas or 0)
      desiredWorkerReplicas = desiredWorkerReplicas + (s.desiredWorkerReplicas or 0)

      -- Sum resource quantities.
      desiredCPU = desiredCPU + quantity(s.desiredCPU)
      desiredGPU = desiredGPU + quantity(s.desiredGPU)
      desiredTPU = desiredTPU + quantity(s.desiredTPU)

      -- For memory, sum the numeric prefix (e.g. "3G" -> 3).
      if s.desiredMemory ~= nil and s.desiredMemory ~= "" then
        local memNum = tonumber(string.match(s.desiredMemory, "%d+"))
        if memNum ~= nil then
          desiredMemory = desiredMemory + memNum
        end
      end

      -- Take the most recent lastUpdateTime (RFC3339 timestamps compare
      -- correctly as strings).
      if s.lastUpdateTime ~= nil then
        if lastUpdateTime == nil or s.lastUpdateTime > lastUpdateTime then
          lastUpdateTime = s.lastUpdateTime
        end
      end

      -- Keep endpoints and head from the first cluster reporting them
      -- (likely the one hosting the head pod).
      if endpoints == nil and s.endpoints ~= nil then
        endpoints = s.endpoints
      end
      if head == nil and s.head ~= nil then
        head = s.head
      end

      -- Keep state and stateTransitionTimes for backward compatibility
      -- (deprecated fields).
      if state == nil and s.state ~= nil then
        state = s.state
        stateTransitionTimes = s.stateTransitionTimes
      end
    end
  end

  -- Set aggregated status.
  desiredObj.status.conditions = conditions
  desiredObj.status.readyWorkerReplicas = readyWorkerReplicas
  desiredObj.status.availableWorkerReplicas = availableWorkerReplicas
  desiredObj.status.maxWorkerReplicas = maxWorkerReplicas
  desiredObj.status.minWorkerReplicas = minWorkerReplicas
  desiredObj.status.desiredWorkerReplicas = desiredWorkerReplicas
  desiredObj.status.desiredCPU = tostring(desiredCPU)
  desiredObj.status.desiredGPU = tostring(desiredGPU)
  desiredObj.status.desiredTPU = tostring(desiredTPU)
  -- Reconstruct memory with its unit suffix.
  if desiredMemory > 0 then
    desiredObj.status.desiredMemory = tostring(desiredMemory) .. "G"
  end
  desiredObj.status.lastUpdateTime = lastUpdateTime
  desiredObj.status.endpoints = endpoints
  desiredObj.status.head = head
  desiredObj.status.state = state
  desiredObj.status.stateTransitionTimes = stateTransitionTimes

  return desiredObj
end
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
tests:
2+
- desiredInputPath: testdata/desired-raycluster.yaml
3+
statusInputPath: testdata/status-file.yaml
4+
operation: AggregateStatus
5+
- observedInputPath: testdata/observed-raycluster.yaml
6+
operation: InterpretHealth
7+
- observedInputPath: testdata/observed-raycluster.yaml
8+
operation: InterpretComponent
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
apiVersion: ray.io/v1
2+
kind: RayCluster
3+
metadata:
4+
name: sample
5+
spec:
6+
rayVersion: '2.46.0'
7+
headGroupSpec:
8+
rayStartParams: {}
9+
template:
10+
spec:
11+
containers:
12+
- name: ray-head
13+
image: rayproject/ray:2.46.0
14+
resources:
15+
limits:
16+
cpu: 1
17+
memory: 2G
18+
requests:
19+
cpu: 1
20+
memory: 2G
21+
ports:
22+
- containerPort: 6379
23+
name: gcs-server
24+
- containerPort: 8265
25+
name: dashboard
26+
- containerPort: 10001
27+
name: client
28+
workerGroupSpecs:
29+
- replicas: 1
30+
minReplicas: 1
31+
maxReplicas: 5
32+
groupName: workergroup
33+
rayStartParams: {}
34+
template:
35+
spec:
36+
containers:
37+
- name: ray-worker
image: rayproject/ray:2.46.0
38+
resources:
39+
limits:
40+
cpu: 1
41+
memory: 1G
42+
requests:
43+
cpu: 1
44+
memory: 1G
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
apiVersion: ray.io/v1
2+
kind: RayCluster
3+
metadata:
4+
annotations:
5+
kubectl.kubernetes.io/last-applied-configuration: |
6+
{"apiVersion":"ray.io/v1","kind":"RayCluster","metadata":{"annotations":{},"name":"sample","namespace":"default"},"spec":{"headGroupSpec":{"rayStartParams":{},"template":{"spec":{"containers":[{"image":"rayproject/ray:2.46.0","name":"ray-head","ports":[{"containerPort":6379,"name":"gcs-server"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":1,"memory":"2G"},"requests":{"cpu":1,"memory":"2G"}}}]}}},"rayVersion":"2.46.0","workerGroupSpecs":[{"groupName":"workergroup","maxReplicas":5,"minReplicas":1,"rayStartParams":{},"replicas":1,"template":{"spec":{"containers":[{"image":"rayproject/ray:2.46.0","name":"ray-worker","resources":{"limits":{"cpu":1,"memory":"1G"},"requests":{"cpu":1,"memory":"1G"}}}]}}}]}}
7+
creationTimestamp: "2025-09-21T03:54:44Z"
8+
generation: 1
9+
name: sample
10+
namespace: default
11+
resourceVersion: "850"
12+
uid: 040acb09-4e53-4a23-a8b2-28b4300af70b
13+
spec:
14+
headGroupSpec:
15+
rayStartParams: {}
16+
template:
17+
spec:
18+
containers:
19+
- image: rayproject/ray:2.46.0
20+
name: ray-head
21+
ports:
22+
- containerPort: 6379
23+
name: gcs-server
24+
protocol: TCP
25+
- containerPort: 8265
26+
name: dashboard
27+
protocol: TCP
28+
- containerPort: 10001
29+
name: client
30+
protocol: TCP
31+
resources:
32+
limits:
33+
cpu: 1
34+
memory: 2G
35+
requests:
36+
cpu: 1
37+
memory: 2G
38+
rayVersion: 2.46.0
39+
workerGroupSpecs:
40+
- groupName: workergroup
41+
maxReplicas: 5
42+
minReplicas: 1
43+
numOfHosts: 1
44+
rayStartParams: {}
45+
replicas: 1
46+
template:
47+
spec:
48+
containers:
49+
- image: rayproject/ray:2.46.0
50+
name: ray-worker
51+
resources:
52+
limits:
53+
cpu: 1
54+
memory: 1G
55+
requests:
56+
cpu: 1
57+
memory: 1G
58+
status:
59+
availableWorkerReplicas: 1
60+
conditions:
61+
- lastTransitionTime: "2025-09-21T03:55:30Z"
62+
message: ""
63+
reason: HeadPodRunningAndReady
64+
status: "True"
65+
type: HeadPodReady
66+
- lastTransitionTime: "2025-09-21T03:55:45Z"
67+
message: All Ray Pods are ready for the first time
68+
reason: AllPodRunningAndReadyFirstTime
69+
status: "True"
70+
type: RayClusterProvisioned
71+
- lastTransitionTime: "2025-09-21T03:54:44Z"
72+
message: ""
73+
reason: RayClusterSuspended
74+
status: "False"
75+
type: RayClusterSuspended
76+
- lastTransitionTime: "2025-09-21T03:54:44Z"
77+
message: ""
78+
reason: RayClusterSuspending
79+
status: "False"
80+
type: RayClusterSuspending
81+
desiredCPU: "2"
82+
desiredGPU: "0"
83+
desiredMemory: 3G
84+
desiredTPU: "0"
85+
desiredWorkerReplicas: 1
86+
endpoints:
87+
client: "10001"
88+
dashboard: "8265"
89+
gcs-server: "6379"
90+
metrics: "8080"
91+
head:
92+
podIP: 10.244.0.6
93+
podName: sample-head-9cvfc
94+
serviceIP: 10.244.0.6
95+
serviceName: sample-head-svc
96+
lastUpdateTime: "2025-09-21T03:55:45Z"
97+
maxWorkerReplicas: 5
98+
minWorkerReplicas: 1
99+
observedGeneration: 1
100+
readyWorkerReplicas: 1
101+
state: ready
102+
stateTransitionTimes:
103+
ready: "2025-09-21T03:55:45Z"

0 commit comments

Comments
 (0)