Skip to content

Commit 3bcdaaa

Browse files
soulebfilanov
authored and committed
Enable collection dpuCluster resources
Add a new option to collect dpuclusters resources defaulted to true.

Signed-off-by: Soule BA <[email protected]>
1 parent 8774140 commit 3bcdaaa

File tree

1 file changed

+106
-36
lines changed

1 file changed

+106
-36
lines changed

sos/report/plugins/doca_dpf.py

Lines changed: 106 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
#
99
# See the LICENSE file in the source distribution for further information.
1010

11+
import base64
1112
import json
1213
import os
1314
from sos.report.plugins import (Plugin, RedHatPlugin, DebianPlugin,
@@ -35,6 +36,14 @@ class DocaDpf(Plugin):
3536
"""
3637
This plugin will capture information related to the DOCA Platform Framework
3738
resources and configurations in the system.
39+
40+
By default, it collects resources from the host Kubernetes cluster.
41+
42+
When the 'collect-dpu-clusters' option is enabled, it will automatically
43+
detect all DPU clusters (dpucluster resources) in the host cluster,
44+
retrieve their kubeconfigs from Kubernetes secrets, and collect resources
45+
from each child DPU cluster. The collected data is organized in a directory
46+
structure: dpu-clusters/{namespace}/{cluster-name}/
3847
"""
3948
short_desc = 'DOCA Platform Framework resources and configurations'
4049
plugin_name = "doca_dpf"
@@ -90,7 +99,9 @@ class DocaDpf(Plugin):
9099
PluginOpt('all', default=True,
91100
desc='collect all namespace output separately'),
92101
PluginOpt('describe', default=False,
93-
desc='collect describe output of all resources')
102+
desc='collect describe output of all resources'),
103+
PluginOpt('collect-dpu-clusters', default=True,
104+
desc='collect resources from DPU child clusters')
94105
]
95106

96107
kube_cmd = "kubectl"
@@ -115,8 +126,13 @@ def setup(self):
115126
if not self.check_is_master():
116127
return
117128

129+
# Collect host cluster resources
118130
self.collect_per_resource_details()
119131

132+
# Collect DPU cluster resources if enabled
133+
if self.get_option('collect-dpu-clusters'):
134+
self._collect_all_dpu_clusters()
135+
120136
def collect_per_resource_details(self):
121137
""" Collect details about each resource in all namespaces """
122138
# get all namespaces in use
@@ -154,95 +170,140 @@ def collect_per_resource_details(self):
154170

155171
def _discover_dpu_clusters(self):
156172
"""Discover all dpucluster objects in the host cluster.
157-
158-
Returns a list of dicts with cluster name and namespace.
173+
174+
Returns a list of dicts with cluster name, namespace, and
175+
kubeconfig secret name.
159176
"""
160177
result = self.collect_cmd_output(
161178
f"{self.kube_cmd} get dpucluster -A -o json",
162179
subdir='cluster-info'
163180
)
164-
181+
165182
if result['status'] != 0:
166-
self._log_warn("Failed to discover DPU clusters")
167183
return []
168-
184+
169185
try:
170186
data = json.loads(result['output'])
171187
clusters = []
172188
for item in data.get('items', []):
173189
cluster_name = item['metadata']['name']
174190
namespace = item['metadata']['namespace']
191+
kubeconfig = item.get('spec', {}).get('kubeconfig')
192+
193+
# Skip clusters without kubeconfig specified
194+
if not kubeconfig:
195+
continue
196+
175197
clusters.append({
176198
'name': cluster_name,
177-
'namespace': namespace
199+
'namespace': namespace,
200+
'kubeconfig': kubeconfig
178201
})
179-
180-
self._log_info(f"Discovered {len(clusters)} DPU cluster(s)")
202+
181203
return clusters
182204
except (json.JSONDecodeError, KeyError) as e:
183-
self._log_warn(f"Failed to parse DPU clusters: {e}")
205+
self._log_error(f"Failed to parse dpucluster data: {e}")
184206
return []
185207

186208
def _collect_dpu_cluster_resources(self, cluster_info):
187209
"""Collect resources from a single DPU cluster.
188-
189-
cluster_info: Dict with 'name' and 'namespace' keys
210+
211+
cluster_info: Dict with 'name', 'namespace', and 'kubeconfig' keys
190212
"""
191213
cluster_name = cluster_info['name']
192214
namespace = cluster_info['namespace']
193-
# expected secret name to be found in the namespace
194-
secret_name = f"{cluster_name}-admin-kubeconfig"
215+
# Get secret name from dpucluster spec.kubeconfig
216+
secret_name = cluster_info['kubeconfig']
195217
subdir_base = f'dpu-clusters/{namespace}/{cluster_name}'
196218

197219
# Create unique temp kubeconfig path
198220
mktemp_ret = self.exec_cmd('mktemp /tmp/sos-dpu-kc.XXXXXX')
199221
if mktemp_ret['status'] != 0:
200-
self._log_warn(
201-
f"Failed to create temporary kubeconfig for {cluster_name}"
222+
self._log_error(
223+
f"Failed to create temp kubeconfig for {cluster_name}"
202224
)
203225
return
204226
kc_path = mktemp_ret['output'].strip()
227+
self._log_debug(f"Created temp kubeconfig at: {kc_path}")
205228

229+
# Extract base64-encoded kubeconfig from secret
206230
extract_cmd = (
207231
f"{self.kube_cmd} get secret {secret_name} -n {namespace} "
208-
f"-o jsonpath='{{.data.admin\\.conf}}' | base64 -d > {kc_path}"
232+
f"-o jsonpath='{{.data.admin\\.conf}}'"
209233
)
210234
extract_result = self.exec_cmd(extract_cmd)
211-
if extract_result['status'] != 0:
212-
self._log_warn(
213-
f"Failed to retrieve kubeconfig for DPU cluster "
214-
f"{cluster_name} in namespace {namespace}"
235+
kubeconfig_b64 = extract_result['output'].strip()
236+
237+
if extract_result['status'] != 0 or not kubeconfig_b64:
238+
self._log_error(
239+
f"Failed to extract kubeconfig from secret "
240+
f"{secret_name} in namespace {namespace}"
241+
)
242+
self._log_error(
243+
f"Command output: {extract_result.get('output', 'N/A')}"
215244
)
216-
self.exec_cmd(f"rm -f {kc_path}")
217245
return
218246

219-
dpu_kube_cmd = f"kubectl --kubeconfig={kc_path} --request-timeout=10s"
247+
# Decode base64 data
248+
try:
249+
kubeconfig_content = base64.b64decode(
250+
kubeconfig_b64
251+
).decode('utf-8')
252+
except Exception as e:
253+
self._log_error(f"Failed to decode kubeconfig base64 data: {e}")
254+
return
255+
256+
# Write kubeconfig to temp file
257+
try:
258+
with open(kc_path, 'w') as f:
259+
f.write(kubeconfig_content)
260+
except IOError as e:
261+
self._log_error(f"Failed to write kubeconfig to {kc_path}: {e}")
262+
return
263+
264+
dpu_kube_cmd = (
265+
f"kubectl --kubeconfig={kc_path} --request-timeout=10s"
266+
)
267+
self._log_debug(
268+
f"Fetching namespaces from DPU cluster {cluster_name}"
269+
)
220270
kns_result = self.collect_cmd_output(
221271
f"{dpu_kube_cmd} get namespaces -o json",
222272
subdir=subdir_base
223273
)
224274

225275
if kns_result['status'] != 0:
226-
self._log_warn(f"Failed to access DPU cluster {cluster_name}")
227-
self.exec_cmd(f"rm -f {kc_path}")
276+
self._log_error(
277+
f"Failed to get namespaces from DPU cluster {cluster_name}"
278+
)
279+
self._log_error(
280+
f"Command output: {kns_result.get('output', 'N/A')}"
281+
)
228282
return
229283

230284
try:
231285
ns_data = json.loads(kns_result['output'])
232-
namespaces = [n['metadata']['name'] for n in ns_data.get('items', [])]
286+
namespaces = [
287+
n['metadata']['name'] for n in ns_data.get('items', [])
288+
]
233289
except (json.JSONDecodeError, KeyError) as e:
234-
self._log_warn(
235-
f"Failed to parse namespaces for {cluster_name}: {e}"
290+
self._log_error(
291+
f"Failed to parse namespaces from DPU cluster "
292+
f"{cluster_name}: {e}"
236293
)
237-
self.exec_cmd(f"rm -f {kc_path}")
238294
return
239295

240296
# Collect resources from each namespace
241297
for nspace in namespaces:
298+
self._log_debug(
299+
f"Collecting resources from namespace {nspace} "
300+
f"in DPU cluster {cluster_name}"
301+
)
242302
nspace_arg = f'--namespace={nspace}'
243303
if self.get_option('all'):
244304
k_cmd = (
245-
f"{dpu_kube_cmd} get -o json {nspace_arg} --ignore-not-found"
305+
f"{dpu_kube_cmd} get -o json {nspace_arg} "
306+
f"--ignore-not-found"
246307
)
247308
for res in self.resources:
248309
self.add_cmd_output(
@@ -251,23 +312,32 @@ def _collect_dpu_cluster_resources(self, cluster_info):
251312
)
252313

253314
if self.get_option('describe'):
254-
k_base = f"{dpu_kube_cmd} {nspace_arg} --ignore-not-found"
315+
k_base = (
316+
f"{dpu_kube_cmd} {nspace_arg} --ignore-not-found"
317+
)
255318
for res in self.resources:
256319
ret = self.exec_cmd(f"{k_base} get {res}")
257320
if ret['status'] == 0:
258321
items = [
259-
l.split()[0] for l in ret['output'].splitlines()[1:]
260-
if l.strip()
322+
line.split()[0]
323+
for line in ret['output'].splitlines()[1:]
324+
if line.strip()
261325
]
262326
for item in items:
263327
self.add_cmd_output(
264328
f"{k_base} describe {res} {item}",
265329
subdir=f"{subdir_base}/{nspace}/{res}"
266330
)
267331

268-
# Clean up temporary kubeconfig
269-
self.exec_cmd(f"rm -f {kc_path}")
270-
self._log_info(f"Collected resources from DPU cluster {cluster_name}")
332+
def _collect_all_dpu_clusters(self):
333+
"""Main orchestration method for DPU cluster collection."""
334+
clusters = self._discover_dpu_clusters()
335+
336+
if not clusters:
337+
return
338+
339+
for cluster in clusters:
340+
self._collect_dpu_cluster_resources(cluster)
271341

272342

273343
class RedHatKubernetes(DocaDpf, RedHatPlugin):

0 commit comments

Comments
 (0)