88#
99# See the LICENSE file in the source distribution for further information.
1010
11+ import base64
1112import json
1213import os
1314from sos .report .plugins import (Plugin , RedHatPlugin , DebianPlugin ,
@@ -35,6 +36,14 @@ class DocaDpf(Plugin):
3536 """
3637 This plugin will capture information related to the DOCA Platform Framework
3738 resources and configurations in the system.
39+
40+ By default, it collects resources from the host Kubernetes cluster.
41+
42+ When the 'collect-dpu-clusters' option is enabled, it will automatically
43+ detect all DPU clusters (dpucluster resources) in the host cluster,
44+ retrieve their kubeconfigs from Kubernetes secrets, and collect resources
45+ from each child DPU cluster. The collected data is organized in a directory
46+ structure: dpu-clusters/{namespace}/{cluster-name}/
3847 """
3948 short_desc = 'DOCA Platform Framework resources and configurations'
4049 plugin_name = "doca_dpf"
@@ -90,7 +99,9 @@ class DocaDpf(Plugin):
9099 PluginOpt ('all' , default = True ,
91100 desc = 'collect all namespace output separately' ),
92101 PluginOpt ('describe' , default = False ,
93- desc = 'collect describe output of all resources' )
102+ desc = 'collect describe output of all resources' ),
103+ PluginOpt ('collect-dpu-clusters' , default = True ,
104+ desc = 'collect resources from DPU child clusters' )
94105 ]
95106
96107 kube_cmd = "kubectl"
@@ -115,8 +126,13 @@ def setup(self):
115126 if not self .check_is_master ():
116127 return
117128
129+ # Collect host cluster resources
118130 self .collect_per_resource_details ()
119131
132+ # Collect DPU cluster resources if enabled
133+ if self .get_option ('collect-dpu-clusters' ):
134+ self ._collect_all_dpu_clusters ()
135+
120136 def collect_per_resource_details (self ):
121137 """ Collect details about each resource in all namespaces """
122138 # get all namespaces in use
@@ -154,95 +170,140 @@ def collect_per_resource_details(self):
154170
155171 def _discover_dpu_clusters (self ):
156172 """Discover all dpucluster objects in the host cluster.
157-
158- Returns a list of dicts with cluster name and namespace.
173+
174+ Returns a list of dicts with cluster name, namespace, and
175+ kubeconfig secret name.
159176 """
160177 result = self .collect_cmd_output (
161178 f"{ self .kube_cmd } get dpucluster -A -o json" ,
162179 subdir = 'cluster-info'
163180 )
164-
181+
165182 if result ['status' ] != 0 :
166- self ._log_warn ("Failed to discover DPU clusters" )
167183 return []
168-
184+
169185 try :
170186 data = json .loads (result ['output' ])
171187 clusters = []
172188 for item in data .get ('items' , []):
173189 cluster_name = item ['metadata' ]['name' ]
174190 namespace = item ['metadata' ]['namespace' ]
191+ kubeconfig = item .get ('spec' , {}).get ('kubeconfig' )
192+
193+ # Skip clusters without kubeconfig specified
194+ if not kubeconfig :
195+ continue
196+
175197 clusters .append ({
176198 'name' : cluster_name ,
177- 'namespace' : namespace
199+ 'namespace' : namespace ,
200+ 'kubeconfig' : kubeconfig
178201 })
179-
180- self ._log_info (f"Discovered { len (clusters )} DPU cluster(s)" )
202+
181203 return clusters
182204 except (json .JSONDecodeError , KeyError ) as e :
183- self ._log_warn (f"Failed to parse DPU clusters : { e } " )
205+ self ._log_error (f"Failed to parse dpucluster data : { e } " )
184206 return []
185207
186208 def _collect_dpu_cluster_resources (self , cluster_info ):
187209 """Collect resources from a single DPU cluster.
188-
189- cluster_info: Dict with 'name' and 'namespace ' keys
210+
211+ cluster_info: Dict with 'name', 'namespace', and 'kubeconfig ' keys
190212 """
191213 cluster_name = cluster_info ['name' ]
192214 namespace = cluster_info ['namespace' ]
193- # expected secret name to be found in the namespace
194- secret_name = f" { cluster_name } -admin- kubeconfig"
215+ # Get secret name from dpucluster spec.kubeconfig
216+ secret_name = cluster_info [ ' kubeconfig' ]
195217 subdir_base = f'dpu-clusters/{ namespace } /{ cluster_name } '
196218
197219 # Create unique temp kubeconfig path
198220 mktemp_ret = self .exec_cmd ('mktemp /tmp/sos-dpu-kc.XXXXXX' )
199221 if mktemp_ret ['status' ] != 0 :
200- self ._log_warn (
201- f"Failed to create temporary kubeconfig for { cluster_name } "
222+ self ._log_error (
223+ f"Failed to create temp kubeconfig for { cluster_name } "
202224 )
203225 return
204226 kc_path = mktemp_ret ['output' ].strip ()
227+ self ._log_debug (f"Created temp kubeconfig at: { kc_path } " )
205228
229+ # Extract base64-encoded kubeconfig from secret
206230 extract_cmd = (
207231 f"{ self .kube_cmd } get secret { secret_name } -n { namespace } "
208- f"-o jsonpath='{{.data.admin\\ .conf}}' | base64 -d > { kc_path } "
232+ f"-o jsonpath='{{.data.admin\\ .conf}}'"
209233 )
210234 extract_result = self .exec_cmd (extract_cmd )
211- if extract_result ['status' ] != 0 :
212- self ._log_warn (
213- f"Failed to retrieve kubeconfig for DPU cluster "
214- f"{ cluster_name } in namespace { namespace } "
235+ kubeconfig_b64 = extract_result ['output' ].strip ()
236+
237+ if extract_result ['status' ] != 0 or not kubeconfig_b64 :
238+ self ._log_error (
239+ f"Failed to extract kubeconfig from secret "
240+ f"{ secret_name } in namespace { namespace } "
241+ )
242+ self ._log_error (
243+ f"Command output: { extract_result .get ('output' , 'N/A' )} "
215244 )
216- self .exec_cmd (f"rm -f { kc_path } " )
217245 return
218246
219- dpu_kube_cmd = f"kubectl --kubeconfig={ kc_path } --request-timeout=10s"
247+ # Decode base64 data
248+ try :
249+ kubeconfig_content = base64 .b64decode (
250+ kubeconfig_b64
251+ ).decode ('utf-8' )
252+ except Exception as e :
253+ self ._log_error (f"Failed to decode kubeconfig base64 data: { e } " )
254+ return
255+
256+ # Write kubeconfig to temp file
257+ try :
258+ with open (kc_path , 'w' ) as f :
259+ f .write (kubeconfig_content )
260+ except IOError as e :
261+ self ._log_error (f"Failed to write kubeconfig to { kc_path } : { e } " )
262+ return
263+
264+ dpu_kube_cmd = (
265+ f"kubectl --kubeconfig={ kc_path } --request-timeout=10s"
266+ )
267+ self ._log_debug (
268+ f"Fetching namespaces from DPU cluster { cluster_name } "
269+ )
220270 kns_result = self .collect_cmd_output (
221271 f"{ dpu_kube_cmd } get namespaces -o json" ,
222272 subdir = subdir_base
223273 )
224274
225275 if kns_result ['status' ] != 0 :
226- self ._log_warn (f"Failed to access DPU cluster { cluster_name } " )
227- self .exec_cmd (f"rm -f { kc_path } " )
276+ self ._log_error (
277+ f"Failed to get namespaces from DPU cluster { cluster_name } "
278+ )
279+ self ._log_error (
280+ f"Command output: { kns_result .get ('output' , 'N/A' )} "
281+ )
228282 return
229283
230284 try :
231285 ns_data = json .loads (kns_result ['output' ])
232- namespaces = [n ['metadata' ]['name' ] for n in ns_data .get ('items' , [])]
286+ namespaces = [
287+ n ['metadata' ]['name' ] for n in ns_data .get ('items' , [])
288+ ]
233289 except (json .JSONDecodeError , KeyError ) as e :
234- self ._log_warn (
235- f"Failed to parse namespaces for { cluster_name } : { e } "
290+ self ._log_error (
291+ f"Failed to parse namespaces from DPU cluster "
292+ f"{ cluster_name } : { e } "
236293 )
237- self .exec_cmd (f"rm -f { kc_path } " )
238294 return
239295
240296 # Collect resources from each namespace
241297 for nspace in namespaces :
298+ self ._log_debug (
299+ f"Collecting resources from namespace { nspace } "
300+ f"in DPU cluster { cluster_name } "
301+ )
242302 nspace_arg = f'--namespace={ nspace } '
243303 if self .get_option ('all' ):
244304 k_cmd = (
245- f"{ dpu_kube_cmd } get -o json { nspace_arg } --ignore-not-found"
305+ f"{ dpu_kube_cmd } get -o json { nspace_arg } "
306+ f"--ignore-not-found"
246307 )
247308 for res in self .resources :
248309 self .add_cmd_output (
@@ -251,23 +312,32 @@ def _collect_dpu_cluster_resources(self, cluster_info):
251312 )
252313
253314 if self .get_option ('describe' ):
254- k_base = f"{ dpu_kube_cmd } { nspace_arg } --ignore-not-found"
315+ k_base = (
316+ f"{ dpu_kube_cmd } { nspace_arg } --ignore-not-found"
317+ )
255318 for res in self .resources :
256319 ret = self .exec_cmd (f"{ k_base } get { res } " )
257320 if ret ['status' ] == 0 :
258321 items = [
259- l .split ()[0 ] for l in ret ['output' ].splitlines ()[1 :]
260- if l .strip ()
322+ line .split ()[0 ]
323+ for line in ret ['output' ].splitlines ()[1 :]
324+ if line .strip ()
261325 ]
262326 for item in items :
263327 self .add_cmd_output (
264328 f"{ k_base } describe { res } { item } " ,
265329 subdir = f"{ subdir_base } /{ nspace } /{ res } "
266330 )
267331
268- # Clean up temporary kubeconfig
269- self .exec_cmd (f"rm -f { kc_path } " )
270- self ._log_info (f"Collected resources from DPU cluster { cluster_name } " )
332+ def _collect_all_dpu_clusters (self ):
333+ """Main orchestration method for DPU cluster collection."""
334+ clusters = self ._discover_dpu_clusters ()
335+
336+ if not clusters :
337+ return
338+
339+ for cluster in clusters :
340+ self ._collect_dpu_cluster_resources (cluster )
271341
272342
273343class RedHatKubernetes (DocaDpf , RedHatPlugin ):
0 commit comments