Skip to content

Commit c614e61

Browse files
authored
Merge pull request #633 from jgehrcke/jp/verbosity-vs-debuggability-improvements
Add `logVerbosity` Helm chart parameter, reduce default log verbosity
2 parents a79a9fd + 4ced422 commit c614e61

File tree

17 files changed

+304
-24
lines changed

17 files changed

+304
-24
lines changed

cmd/compute-domain-controller/cleanup.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ func (m *CleanupManager[T]) cleanup(ctx context.Context) {
123123
continue
124124
}
125125

126-
klog.Infof("Cleanup: stale %T found for ComputeDomain '%s', running callback", *new(T), uid)
126+
klog.V(1).Infof("Cleanup: stale %T found for ComputeDomain '%s', running callback", *new(T), uid)
127127
if err := m.callback(ctx, uid); err != nil {
128128
klog.Errorf("error running CleanupManager callback: %v", err)
129129
continue

cmd/compute-domain-controller/computedomain.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
232232
return fmt.Errorf("failed to cast to ComputeDomain")
233233
}
234234

235-
klog.Infof("Processing added or updated ComputeDomain: %s/%s/%s", cd.Namespace, cd.Name, cd.UID)
235+
klog.V(2).Infof("Processing added or updated ComputeDomain: %s/%s/%s", cd.Namespace, cd.Name, cd.UID)
236236

237237
if cd.GetDeletionTimestamp() != nil {
238238
if err := m.resourceClaimTemplateManager.Delete(ctx, string(cd.UID)); err != nil {

cmd/compute-domain-controller/controller.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ import (
2121
"context"
2222
"fmt"
2323

24+
"k8s.io/klog/v2"
25+
2426
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
2527
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue"
2628
)
@@ -50,6 +52,10 @@ type ManagerConfig struct {
5052
// additionalNamespaces is a list of additional namespaces
5153
// where the driver can manage resources
5254
additionalNamespaces []string
55+
56+
// logVerbosityCDDaemon controls the log verbosity for dynamically launched
57+
// ComputeDomain daemons.
58+
logVerbosityCDDaemon int
5359
}
5460

5561
// Controller manages the lifecycle of the DRA driver and its components.
@@ -77,8 +83,12 @@ func (c *Controller) Run(ctx context.Context) error {
7783
maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
7884
clientsets: c.config.clientsets,
7985
workQueue: workQueue,
86+
logVerbosityCDDaemon: c.config.flags.logVerbosityCDDaemon,
8087
}
8188

89+
// TODO: log full, nested cliFlags structure.
90+
klog.Infof("controller manager config: %+v", managerConfig)
91+
8292
cdManager := NewComputeDomainManager(managerConfig)
8393

8494
if err := cdManager.Start(ctx); err != nil {

cmd/compute-domain-controller/daemonset.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ type DaemonSetTemplateData struct {
5252
ImageName string
5353
MaxNodesPerIMEXDomain int
5454
FeatureGates map[string]bool
55+
LogVerbosity int
5556
}
5657

5758
type DaemonSetManager struct {
@@ -118,7 +119,7 @@ func (m *DaemonSetManager) Start(ctx context.Context) (rerr error) {
118119
}()
119120

120121
if err := addComputeDomainLabelIndexer[*appsv1.DaemonSet](m.informer); err != nil {
121-
return fmt.Errorf("error adding indexer for MulitNodeEnvironment label: %w", err)
122+
return fmt.Errorf("error adding indexer for MultiNodeEnvironment label: %w", err)
122123
}
123124

124125
m.mutationCache = cache.NewIntegerResourceVersionMutationCache(
@@ -207,6 +208,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain)
207208
ImageName: m.config.imageName,
208209
MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain,
209210
FeatureGates: featuregates.ToMap(),
211+
LogVerbosity: m.config.logVerbosityCDDaemon,
210212
}
211213

212214
tmpl, err := template.ParseFiles(DaemonSetTemplatePath)
@@ -363,7 +365,7 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
363365
return fmt.Errorf("failed to cast to DaemonSet")
364366
}
365367

366-
klog.Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name)
368+
klog.V(2).Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name)
367369

368370
cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey])
369371
if err != nil {

cmd/compute-domain-controller/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ type Flags struct {
6262
namespace string
6363
imageName string
6464
maxNodesPerIMEXDomain int
65+
logVerbosityCDDaemon int
6566

6667
httpEndpoint string
6768
metricsPath string
@@ -111,6 +112,13 @@ func newApp() *cli.App {
111112
Destination: &flags.imageName,
112113
EnvVars: []string{"IMAGE_NAME"},
113114
},
115+
&cli.IntFlag{
116+
Name: "log-verbosity-cd-daemon",
117+
Usage: "Log verbosity for dynamically launched CD daemon pods",
118+
Required: true,
119+
EnvVars: []string{"LOG_VERBOSITY_CD_DAEMON"},
120+
Destination: &flags.logVerbosityCDDaemon,
121+
},
114122
&cli.IntFlag{
115123
Name: "max-nodes-per-imex-domain",
116124
Usage: "The maximum number of possible nodes per IMEX domain",

cmd/compute-domain-daemon/main.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
294294
// IMEX daemon nodes config file and (re)starting the IMEX daemon process.
295295
func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error {
296296
for {
297-
klog.Infof("wait for nodes update")
297+
klog.V(1).Infof("wait for nodes update")
298298
select {
299299
case <-ctx.Done():
300300
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithIPs")
@@ -327,7 +327,7 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
327327
// unexpectedly and expectedly).
328328
func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controller, processManager *ProcessManager, dnsNameManager *DNSNameManager) error {
329329
for {
330-
klog.Infof("wait for nodes update")
330+
klog.V(1).Infof("wait for nodes update")
331331
select {
332332
case <-ctx.Done():
333333
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames")

cmd/compute-domain-kubelet-plugin/device_state.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimRef kubeletplugin.Name
212212
// device was never prepared or has already been unprepared (assume that
213213
// Prepare+Checkpoint are done transactionally). Note that
214214
// claimRef.String() contains namespace, name, UID.
215-
klog.Infof("unprepare noop: claim not found in checkpoint data: %v", claimRef.String())
215+
klog.V(2).Infof("Unprepare noop: claim not found in checkpoint data: %v", claimRef.String())
216216
return nil
217217
}
218218

cmd/compute-domain-kubelet-plugin/driver.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ func (d *driver) UnprepareResourceClaims(ctx context.Context, claimRefs []kubele
202202
if done {
203203
results[claim.UID] = err
204204
wg.Done()
205+
if err != nil {
206+
klog.V(0).Infof("Permanent error unpreparing devices for claim %v: %v", claim.UID, err)
207+
}
205208
return nil
206209
}
207210
return fmt.Errorf("%w", err)
@@ -251,13 +254,14 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
251254
Err: fmt.Errorf("error preparing devices for claim %s/%s:%s: %w", claim.Namespace, claim.Name, claim.UID, err),
252255
}
253256
if isPermanentError(err) {
254-
klog.V(6).Infof("Permanent error preparing devices for claim %v: %v", claim.UID, err)
257+
klog.Infof("Permanent error preparing devices for claim %v: %v", claim.UID, err)
255258
return true, res
256259
}
257260
return false, res
258261
}
259262

260-
klog.Infof("prepared devices for claim '%s/%s:%s': %v", claim.Namespace, claim.Name, claim.UID, devs)
263+
klog.V(1).Infof("prepared devices for claim '%s/%s:%s': %v", claim.Namespace, claim.Name, claim.UID, devs)
264+
261265
return true, kubeletplugin.PrepareResult{Devices: devs}
262266
}
263267

@@ -272,7 +276,7 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claimRef kubeletplug
272276
return isPermanentError(err), fmt.Errorf("error unpreparing devices for claim '%v': %w", claimRef.String(), err)
273277
}
274278

275-
klog.Infof("unprepared devices for claim '%v'", claimRef.String())
279+
klog.V(1).Infof("Unprepared devices for claim '%v'", claimRef.String())
276280
return true, nil
277281
}
278282

cmd/compute-domain-kubelet-plugin/health.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,14 +130,14 @@ func (h *healthcheck) Check(ctx context.Context, req *grpc_health_v1.HealthCheck
130130
klog.ErrorS(err, "failed to call GetInfo")
131131
return status, nil
132132
}
133-
klog.V(6).Infof("Successfully invoked GetInfo: %v", info)
133+
klog.V(7).Infof("Successfully invoked GetInfo: %v", info)
134134

135135
_, err = h.draClient.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{})
136136
if err != nil {
137137
klog.ErrorS(err, "failed to call NodePrepareResources")
138138
return status, nil
139139
}
140-
klog.V(6).Info("Successfully invoked NodePrepareResources")
140+
klog.V(7).Info("Successfully invoked NodePrepareResources")
141141

142142
status.Status = grpc_health_v1.HealthCheckResponse_SERVING
143143
return status, nil

deployments/helm/nvidia-dra-driver-gpu/templates/controller.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,20 @@ spec:
5252
{{- toYaml .Values.controller.containers.computeDomain.securityContext | nindent 10 }}
5353
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
5454
imagePullPolicy: {{ .Values.image.pullPolicy }}
55-
command: ["compute-domain-controller", "-v", "6"]
55+
command: ["compute-domain-controller", "-v", "$(LOG_VERBOSITY)"]
5656
resources:
5757
{{- toYaml .Values.controller.containers.computeDomain.resources | nindent 10 }}
5858
env:
59+
# LOG_VERBOSITY is the source of truth for this program's klog
60+
# configuration. Currently injected via CLI argument (see above) because
61+
# klog's verbosity for now cannot be sanely set from an env var.
62+
- name: LOG_VERBOSITY
63+
value: "{{ .Values.logVerbosity }}"
64+
# LOG_VERBOSITY_CD_DAEMON controls the verbosity of dynamically launched
65+
# CD daemons (their pod spec is not rendered by Helm, but by this
66+
# controller).
67+
- name: LOG_VERBOSITY_CD_DAEMON
68+
value: "{{ .Values.logVerbosity }}"
5969
- name: POD_NAME
6070
valueFrom:
6171
fieldRef:

0 commit comments

Comments
 (0)