2 changes: 1 addition & 1 deletion cmd/compute-domain-controller/cleanup.go
@@ -123,7 +123,7 @@ func (m *CleanupManager[T]) cleanup(ctx context.Context) {
continue
}

klog.Infof("Cleanup: stale %T found for ComputeDomain '%s', running callback", *new(T), uid)
klog.V(1).Infof("Cleanup: stale %T found for ComputeDomain '%s', running callback", *new(T), uid)
if err := m.callback(ctx, uid); err != nil {
klog.Errorf("error running CleanupManager callback: %v", err)
continue
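Context for the V(n) changes throughout this diff: klog gates V(n) messages on the process-wide verbosity set via -v, while plain Infof/Errorf calls are always emitted. A minimal sketch of that behavior (illustrative only, not part of this change; assumes stock k8s.io/klog/v2 semantics):

```go
package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	// klog registers its flags (including -v) on the standard flag set.
	klog.InitFlags(nil)
	_ = flag.Set("v", "1") // same effect as passing `-v 1` on the command line
	flag.Parse()
	defer klog.Flush()

	klog.Infof("always emitted, regardless of -v")
	klog.V(1).Infof("emitted: verbosity is 1")
	klog.V(2).Infof("suppressed: verbosity is below 2")
}
```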
2 changes: 1 addition & 1 deletion cmd/compute-domain-controller/computedomain.go
@@ -232,7 +232,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
return fmt.Errorf("failed to cast to ComputeDomain")
}

klog.Infof("Processing added or updated ComputeDomain: %s/%s/%s", cd.Namespace, cd.Name, cd.UID)
klog.V(2).Infof("Processing added or updated ComputeDomain: %s/%s/%s", cd.Namespace, cd.Name, cd.UID)

if cd.GetDeletionTimestamp() != nil {
if err := m.resourceClaimTemplateManager.Delete(ctx, string(cd.UID)); err != nil {
10 changes: 10 additions & 0 deletions cmd/compute-domain-controller/controller.go
@@ -21,6 +21,8 @@ import (
"context"
"fmt"

"k8s.io/klog/v2"

"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/flags"
"github.com/NVIDIA/k8s-dra-driver-gpu/pkg/workqueue"
)
@@ -50,6 +52,10 @@ type ManagerConfig struct {
// additionalNamespaces is a list of additional namespaces
// where the driver can manage resources
additionalNamespaces []string

// logVerbosityCDDaemon controls the log verbosity for dynamically launched
// ComputeDomain daemons.
logVerbosityCDDaemon int
}

// Controller manages the lifecycle of the DRA driver and its components.
@@ -77,8 +83,12 @@ func (c *Controller) Run(ctx context.Context) error {
maxNodesPerIMEXDomain: c.config.flags.maxNodesPerIMEXDomain,
clientsets: c.config.clientsets,
workQueue: workQueue,
logVerbosityCDDaemon: c.config.flags.logVerbosityCDDaemon,
}

// TODO: log full, nested cliFlags structure.
klog.Infof("controller manager config: %+v", managerConfig)

cdManager := NewComputeDomainManager(managerConfig)

if err := cdManager.Start(ctx); err != nil {
6 changes: 4 additions & 2 deletions cmd/compute-domain-controller/daemonset.go
@@ -52,6 +52,7 @@ type DaemonSetTemplateData struct {
ImageName string
MaxNodesPerIMEXDomain int
FeatureGates map[string]bool
LogVerbosity int
}

type DaemonSetManager struct {
@@ -118,7 +119,7 @@ func (m *DaemonSetManager) Start(ctx context.Context) (rerr error) {
}()

if err := addComputeDomainLabelIndexer[*appsv1.DaemonSet](m.informer); err != nil {
return fmt.Errorf("error adding indexer for MulitNodeEnvironment label: %w", err)
return fmt.Errorf("error adding indexer for MultiNodeEnvironment label: %w", err)
}

m.mutationCache = cache.NewIntegerResourceVersionMutationCache(
@@ -207,6 +208,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain)
ImageName: m.config.imageName,
MaxNodesPerIMEXDomain: m.config.maxNodesPerIMEXDomain,
FeatureGates: featuregates.ToMap(),
LogVerbosity: m.config.logVerbosityCDDaemon,
}

tmpl, err := template.ParseFiles(DaemonSetTemplatePath)
@@ -363,7 +365,7 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
return fmt.Errorf("failed to cast to DaemonSet")
}

klog.Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name)
klog.V(2).Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name)

cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey])
if err != nil {
8 changes: 8 additions & 0 deletions cmd/compute-domain-controller/main.go
@@ -62,6 +62,7 @@ type Flags struct {
namespace string
imageName string
maxNodesPerIMEXDomain int
logVerbosityCDDaemon int

httpEndpoint string
metricsPath string
@@ -111,6 +112,13 @@ func newApp() *cli.App {
Destination: &flags.imageName,
EnvVars: []string{"IMAGE_NAME"},
},
&cli.IntFlag{
Name: "log-verbosity-cd-daemon",
Usage: "Log verbosity for dynamically launched CD daemon pods",
Required: true,
EnvVars: []string{"LOG_VERBOSITY_CD_DAEMON"},
Destination: &flags.logVerbosityCDDaemon,
},
&cli.IntFlag{
Name: "max-nodes-per-imex-domain",
Usage: "The maximum number of possible nodes per IMEX domain",
4 changes: 2 additions & 2 deletions cmd/compute-domain-daemon/main.go
@@ -294,7 +294,7 @@ func run(ctx context.Context, cancel context.CancelFunc, flags *Flags) error {
// IMEX daemon nodes config file and (re)starting the IMEX daemon process.
func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cliqueID string, pm *ProcessManager) error {
for {
klog.Infof("wait for nodes update")
klog.V(1).Infof("wait for nodes update")
select {
case <-ctx.Done():
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithIPs")
@@ -327,7 +327,7 @@ func IMEXDaemonUpdateLoopWithIPs(ctx context.Context, controller *Controller, cl
// unexpectedly and expectedly).
func IMEXDaemonUpdateLoopWithDNSNames(ctx context.Context, controller *Controller, processManager *ProcessManager, dnsNameManager *DNSNameManager) error {
for {
klog.Infof("wait for nodes update")
klog.V(1).Infof("wait for nodes update")
select {
case <-ctx.Done():
klog.Infof("shutdown: stop IMEXDaemonUpdateLoopWithDNSNames")
2 changes: 1 addition & 1 deletion cmd/compute-domain-kubelet-plugin/device_state.go
@@ -212,7 +212,7 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimRef kubeletplugin.Name
// device was never prepared or has already been unprepared (assume that
// Prepare+Checkpoint are done transactionally). Note that
// claimRef.String() contains namespace, name, UID.
klog.Infof("unprepare noop: claim not found in checkpoint data: %v", claimRef.String())
klog.V(2).Infof("Unprepare noop: claim not found in checkpoint data: %v", claimRef.String())
return nil
}

10 changes: 7 additions & 3 deletions cmd/compute-domain-kubelet-plugin/driver.go
@@ -202,6 +202,9 @@ func (d *driver) UnprepareResourceClaims(ctx context.Context, claimRefs []kubele
if done {
results[claim.UID] = err
wg.Done()
if err != nil {
klog.V(0).Infof("Permanent error unpreparing devices for claim %v: %v", claim.UID, err)
Review thread (inline comments on the added log line):

[Reviewer] Can the V(0) be dropped?

[Reviewer] Also, can the log be done inside nodeUnprepareResource so as not to clutter things here?

[Author] Yes, level 0 is implicit when doing klog.Infof().

[Author, @jgehrcke, Oct 9, 2025] On "Can the V(0) be dropped?": I remember -- I did this to make the level explicit, to set a precedent for always using an explicit level and to improve code readability. It is a question one naturally has when reading code: at what level does Infof() log by default? One needs that additional knowledge. But I will remove it again now to eliminate a potential point of friction.

[Author, @jgehrcke, Oct 9, 2025] On "can the log be done inside nodeUnprepareResource": In d.nodeUnprepareResource(ctx, claim) we return isPermanentError(err) directly without inspecting its return value; we only look at done here, at the call site. We can change this, of course, but let's not do it here.
}
return nil
}
return fmt.Errorf("%w", err)
@@ -251,13 +254,14 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *resourceapi.Res
Err: fmt.Errorf("error preparing devices for claim %s/%s:%s: %w", claim.Namespace, claim.Name, claim.UID, err),
}
if isPermanentError(err) {
klog.V(6).Infof("Permanent error preparing devices for claim %v: %v", claim.UID, err)
klog.Infof("Permanent error preparing devices for claim %v: %v", claim.UID, err)
return true, res
}
return false, res
}

klog.Infof("prepared devices for claim '%s/%s:%s': %v", claim.Namespace, claim.Name, claim.UID, devs)
klog.V(1).Infof("prepared devices for claim '%s/%s:%s': %v", claim.Namespace, claim.Name, claim.UID, devs)

return true, kubeletplugin.PrepareResult{Devices: devs}
}

@@ -272,7 +276,7 @@ func (d *driver) nodeUnprepareResource(ctx context.Context, claimRef kubeletplug
return isPermanentError(err), fmt.Errorf("error unpreparing devices for claim '%v': %w", claimRef.String(), err)
}

klog.Infof("unprepared devices for claim '%v'", claimRef.String())
klog.V(1).Infof("Unprepared devices for claim '%v'", claimRef.String())
return true, nil
}

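On the review exchange above about V(0): in stock klog/v2, V(0) is always enabled, so klog.Infof and klog.V(0).Infof produce the same output and differ only in how explicitly the level is stated. A small sketch (assumption: standard klog/v2 behavior; not code from this repository):

```go
package main

import (
	"fmt"

	"k8s.io/klog/v2"
)

func main() {
	defer klog.Flush()
	err := fmt.Errorf("example permanent error")

	// Both lines are emitted no matter what -v is set to; the explicit V(0)
	// only documents the intended level for readers of the code.
	klog.Infof("Permanent error unpreparing devices for claim %v: %v", "uid-123", err)
	klog.V(0).Infof("Permanent error unpreparing devices for claim %v: %v", "uid-123", err)

	// V(0) is unconditionally enabled; higher levels depend on the -v flag.
	fmt.Println(klog.V(0).Enabled()) // true
}
```

The second reviewer question is answered by the call-site structure visible above: nodeUnprepareResource returns isPermanentError(err) as its done value without inspecting it further, so the permanence check and the corresponding log line naturally live at the call site.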
4 changes: 2 additions & 2 deletions cmd/compute-domain-kubelet-plugin/health.go
@@ -130,14 +130,14 @@ func (h *healthcheck) Check(ctx context.Context, req *grpc_health_v1.HealthCheck
klog.ErrorS(err, "failed to call GetInfo")
return status, nil
}
klog.V(6).Infof("Successfully invoked GetInfo: %v", info)
klog.V(7).Infof("Successfully invoked GetInfo: %v", info)

_, err = h.draClient.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{})
if err != nil {
klog.ErrorS(err, "failed to call NodePrepareResources")
return status, nil
}
klog.V(6).Info("Successfully invoked NodePrepareResources")
klog.V(7).Info("Successfully invoked NodePrepareResources")

status.Status = grpc_health_v1.HealthCheckResponse_SERVING
return status, nil
12 changes: 11 additions & 1 deletion deployments/helm/nvidia-dra-driver-gpu/templates/controller.yaml
@@ -52,10 +52,20 @@ spec:
{{- toYaml .Values.controller.containers.computeDomain.securityContext | nindent 10 }}
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
command: ["compute-domain-controller", "-v", "6"]
command: ["compute-domain-controller", "-v", "$(LOG_VERBOSITY)"]
resources:
{{- toYaml .Values.controller.containers.computeDomain.resources | nindent 10 }}
env:
# LOG_VERBOSITY is the source of truth for this program's klog
# configuration. Currently injected via CLI argument (see above) because
# klog's verbosity for now cannot be sanely set from an env var.
- name: LOG_VERBOSITY
value: "{{ .Values.logVerbosity }}"
# LOG_VERBOSITY_CD_DAEMON controls the verbosity of dynamically launched
# CD daemons (their pod spec is not rendered by Helm, but by this
# controller).
- name: LOG_VERBOSITY_CD_DAEMON
value: "{{ .Values.logVerbosity }}"
- name: POD_NAME
valueFrom:
fieldRef:
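The LOG_VERBOSITY comments above explain why verbosity is passed as a -v argument rather than read from the environment: klog only exposes its verbosity through flags. A hypothetical way to bridge an env var to klog inside the binary would look roughly like this (a sketch under that assumption, not code from this chart or repository):

```go
package main

import (
	"flag"
	"os"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)

	// Hypothetical bridge: copy LOG_VERBOSITY into klog's -v flag before
	// parsing. The chart sidesteps this by expanding $(LOG_VERBOSITY) into
	// the container command, so the binary only needs plain flag handling.
	if v := os.Getenv("LOG_VERBOSITY"); v != "" {
		_ = flag.Set("v", v)
	}
	flag.Parse()
	defer klog.Flush()

	klog.V(1).Infof("running with LOG_VERBOSITY=%q", os.Getenv("LOG_VERBOSITY"))
}
```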
(kubelet-plugin DaemonSet template)
@@ -99,7 +99,7 @@ spec:
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
fi
compute-domain-kubelet-plugin -v 6
compute-domain-kubelet-plugin -v $(LOG_VERBOSITY)
resources:
{{- toYaml .Values.kubeletPlugin.containers.computeDomains.resources | nindent 10 }}
{{/*
@@ -116,6 +116,12 @@ spec:
periodSeconds: 10
{{- end }}
env:
# LOG_VERBOSITY is the source of truth for this program's klog
# configuration. Currently injected via CLI argument (see above) because
# klog's verbosity for now cannot be sanely set from an environment
# variable.
- name: LOG_VERBOSITY
value: "{{ .Values.logVerbosity }}"
- name: MASK_NVIDIA_DRIVER_PARAMS
value: "{{ .Values.maskNvidiaDriverParams }}"
- name: NVIDIA_DRIVER_ROOT
@@ -150,6 +156,9 @@ spec:
- name: HEALTHCHECK_PORT
value: {{ .Values.kubeletPlugin.containers.computeDomains.healthcheckPort | quote }}
{{- end }}
{{- with .Values.kubeletPlugin.containers.computeDomains.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
volumeMounts:
- name: plugins-registry
mountPath: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }}
@@ -181,7 +190,7 @@ spec:
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
fi
gpu-kubelet-plugin -v 6
gpu-kubelet-plugin -v $(LOG_VERBOSITY)
resources:
{{- toYaml .Values.kubeletPlugin.containers.gpus.resources | nindent 10 }}
{{/*
@@ -198,6 +207,12 @@ spec:
periodSeconds: 10
{{- end }}
env:
# LOG_VERBOSITY is the source of truth for this program's klog
# configuration. Currently injected via CLI argument (see above) because
# klog's verbosity for now cannot be sanely set from an environment
# variable.
- name: LOG_VERBOSITY
value: "{{ .Values.logVerbosity }}"
- name: MASK_NVIDIA_DRIVER_PARAMS
value: "{{ .Values.maskNvidiaDriverParams }}"
- name: NVIDIA_DRIVER_ROOT
Expand Down Expand Up @@ -234,6 +249,9 @@ spec:
- name: HEALTHCHECK_PORT
value: {{ .Values.kubeletPlugin.containers.gpus.healthcheckPort | quote }}
{{- end }}
{{- with .Values.kubeletPlugin.containers.gpus.env }}
{{- toYaml . | nindent 8 }}
{{- end }}
volumeMounts:
- name: plugins-registry
mountPath: {{ .Values.kubeletPlugin.kubeletRegistrarDirectoryPath | quote }}
50 changes: 50 additions & 0 deletions deployments/helm/nvidia-dra-driver-gpu/values.yaml
@@ -69,6 +69,54 @@ resources:
# LoggingBetaOptions: true # Kubernetes logging beta features
featureGates: {}

# Log verbosity for all components. Zero or greater; a higher number means
# higher verbosity. Regardless of this setting, messages of type Error,
# Warning, and Info (level 0) are always logged. Can also be set for individual
# components via environment variable (which takes precedence); see
# https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Troubleshooting#controlling-log-verbosity
#
# An (incomplete) representation of which types of messages to expect with
# increasing verbosity level:
#
# Level 0:
# - Configuration detail (during process startup)
# - Kubelet plugins:
# - Permanent errors during device Prepare() and Unprepare()
#
# Level 1:
# - CD controller:
# - Confirm cleanup of stale objects
# - k8s client-go: feature gates
# - Kubelet plugins:
# - Device (un)prepare confirmation, with resource claim UID
# - Workqueue reconciliation failures (noisy: mainly expected, retryable
# errors)
# - CD daemon:
# - explicit 'wait for nodes update'
#
# Level 2:
# - reflector.go informer state: "Caches populated"
# - Kubelet plugins:
# - Acknowledge when Unprepare is a noop
# - CD controller:
# - Added/updated API object callback confirmation
#
# Level 3:
# - reflector.go informer state: "Listing and watching"
#
# Level 6:
# - round_trippers.go output (API server request/response detail)
# - Kubelet plugins:
# - GRPC request/response detail
# - Checkpoint file update confirmation
# - CD daemon:
# - explicit 'IP set did not change'
#
# Level 7:
# - Kubelet plugins:
# - Health check
logVerbosity: "4"

# Webhook configuration
webhook:
enabled: false
@@ -160,13 +208,15 @@ kubeletPlugin:
securityContext: {}
resources: {}
computeDomains:
env: []
securityContext:
privileged: true
resources: {}
# Port running a gRPC health service checked by a livenessProbe.
# Set to a negative value to disable the service and the probe.
healthcheckPort: 51515
gpus:
env: []
securityContext:
privileged: true
resources: {}
5 changes: 4 additions & 1 deletion pkg/workqueue/workqueue.go
@@ -155,7 +155,10 @@ func (q *WorkQueue) processNextWorkItem(ctx context.Context) {

err := q.reconcile(ctx, workItem)
if err != nil {
klog.Errorf("Failed to reconcile work item: %v", err)
// Most often, this is an expected, retryable error in the context of an
// eventually consistent system. Hence, do not log an error level. Rely
// on inner business logic to log unexpected errors on error level.
klog.V(1).Infof("Reconcile: %v", err)
// Only retry if we're still the current operation for this key
q.Lock()
if q.activeOps[workItem.Key] != nil && q.activeOps[workItem.Key] != workItem {
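The workqueue change encodes a division of responsibility: the generic retry loop logs reconcile failures at low verbosity because most of them are expected in an eventually consistent system, while inner business logic is expected to log genuinely unexpected problems at error level. A minimal sketch of that pattern (illustrative; the function names here are hypothetical, not from this package):

```go
package main

import (
	"context"
	"fmt"

	"k8s.io/klog/v2"
)

// reconcile represents the inner business logic: it logs truly unexpected
// problems at error level itself and otherwise returns retryable errors.
func reconcile(ctx context.Context, key string) error {
	if key == "corrupted" {
		err := fmt.Errorf("unexpected state for %q", key)
		klog.Errorf("reconcile %q: %v", key, err) // unexpected: error level
		return err
	}
	// Expected in an eventually consistent system (e.g. cache not yet synced).
	return fmt.Errorf("%q not yet observable, retrying later", key)
}

// processNextWorkItem mirrors the call site touched in this diff: failures are
// logged at V(1) and the key would be re-enqueued with backoff, not escalated.
func processNextWorkItem(ctx context.Context, key string) {
	if err := reconcile(ctx, key); err != nil {
		klog.V(1).Infof("Reconcile: %v", err)
		// re-enqueue with rate limiting would happen here
	}
}

func main() {
	defer klog.Flush()
	processNextWorkItem(context.Background(), "computedomain-abc")
}
```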
12 changes: 8 additions & 4 deletions templates/compute-domain-daemon.tmpl.yaml
@@ -24,8 +24,12 @@ spec:
# Run the compute domain daemon
- name: compute-domain-daemon
image: {{ .ImageName }}
command: ["compute-domain-daemon", "-v", "6", "run"]
command: ["compute-domain-daemon", "-v", "$(LOG_VERBOSITY)", "run"]
env:
# LOG_VERBOSITY is the source of truth; it's injected via the CLI argument
# above because of klog's limited configuration interface.
- name: LOG_VERBOSITY
value: "{{ .LogVerbosity }}"
- name: MAX_NODES_PER_IMEX_DOMAIN
value: "{{ .MaxNodesPerIMEXDomain }}"
- name: NODE_NAME
@@ -63,22 +67,22 @@ spec:
# if/when necessary.
startupProbe:
exec:
command: ["compute-domain-daemon", "-v", "6", "check"]
command: ["compute-domain-daemon", "-v", "$(LOG_VERBOSITY)", "check"]
initialDelaySeconds: 0
periodSeconds: 1
timeoutSeconds: 10
failureThreshold: 1200 # (1s*1200s)=20min
successThreshold: 1
livenessProbe:
exec:
command: ["compute-domain-daemon", "-v", "6", "check"]
command: ["compute-domain-daemon", "-v", "$(LOG_VERBOSITY)", "check"]
periodSeconds: 60
timeoutSeconds: 10
failureThreshold: 20 # (60s*20)=20min
successThreshold: 1
readinessProbe:
exec:
command: ["compute-domain-daemon", "-v", "6", "check"]
command: ["compute-domain-daemon", "-v", "$(LOG_VERBOSITY)", "check"]
periodSeconds: 10
timeoutSeconds: 10
failureThreshold: 1