Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions cmd/agent/dist/conf.d/lock_contention.d/conf.yaml.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
init_config:

instances:

-

## @param tags - list of strings following the pattern: "key:value" - optional
## List of tags to attach to every metric, event, and service check emitted by this integration.
##
## Learn more about tagging: https://docs.datadoghq.com/tagging/
#
# tags:
# - <KEY_1>:<VALUE_1>
# - <KEY_2>:<VALUE_2>
2 changes: 2 additions & 0 deletions cmd/agent/dist/conf.d/pressure.d/conf.yaml.default
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
instances:
- {}
2 changes: 2 additions & 0 deletions cmd/agent/dist/conf.d/syscall_latency.d/conf.yaml.default
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
instances:
- {}
69 changes: 69 additions & 0 deletions cmd/system-probe/modules/lock_contention_check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2025-present Datadog, Inc.

//go:build linux && linux_bpf

package modules

import (
"fmt"
"net/http"
"sync/atomic"
"time"

"github.com/DataDog/datadog-agent/pkg/collector/corechecks/ebpf/probe/lockcontentioncheck"
"github.com/DataDog/datadog-agent/pkg/ebpf"
"github.com/DataDog/datadog-agent/pkg/system-probe/api/module"
"github.com/DataDog/datadog-agent/pkg/system-probe/config"
sysconfigtypes "github.com/DataDog/datadog-agent/pkg/system-probe/config/types"
"github.com/DataDog/datadog-agent/pkg/system-probe/utils"
"github.com/DataDog/datadog-agent/pkg/util/log"
)

func init() { registerModule(LockContentionCheck) }

// LockContentionCheck is the system-probe module factory for the lock
// contention check. Building the module loads the eBPF probe; a probe
// failure is wrapped and returned so the module is not registered.
var LockContentionCheck = &module.Factory{
	Name: config.LockContentionCheckModule,
	// The module is backed by an eBPF probe, so eBPF support is mandatory.
	NeedsEBPF: func() bool { return true },
	Fn: func(_ *sysconfigtypes.Config, _ module.FactoryDependencies) (module.Module, error) {
		log.Infof("Starting the lock contention check module")

		probe, err := lockcontentioncheck.NewProbe(ebpf.NewConfig())
		if err != nil {
			return nil, fmt.Errorf("unable to start the lock contention check probe: %w", err)
		}

		// lastCheck starts at zero and is updated on every /check request.
		mod := &lockContentionCheckModule{
			Probe:     probe,
			lastCheck: new(atomic.Int64),
		}
		return mod, nil
	},
}

// Compile-time check that the pointer type satisfies module.Module.
var _ module.Module = &lockContentionCheckModule{}

// lockContentionCheckModule wraps the lock contention eBPF probe and exposes
// it as a system-probe module.
type lockContentionCheckModule struct {
// Embedded probe; its methods are promoted onto the module.
*lockcontentioncheck.Probe
// lastCheck holds the Unix time (seconds) of the most recent /check request.
lastCheck *atomic.Int64
}

// GetStats implements module.Module.GetStats.
//
// It reports the Unix timestamp (in seconds) of the most recent /check
// request under the "last_check" key; the value is 0 until the first request
// has been served.
//
// A pointer receiver is used to match how the module is constructed and
// asserted (&lockContentionCheckModule{}) and to stay consistent with the
// other check modules in this package.
func (m *lockContentionCheckModule) GetStats() map[string]interface{} {
	return map[string]interface{}{
		"last_check": m.lastCheck.Load(),
	}
}

// Register implements module.Module.Register.
//
// It installs the /check endpoint on httpMux. Each request records the
// current time in lastCheck, drains the probe's accumulated statistics with
// GetAndFlush and writes them back as JSON. The handler is wrapped with a
// concurrency limit of 1 so two requests never flush the probe at once.
// Register itself always returns nil.
func (m *lockContentionCheckModule) Register(httpMux *module.Router) error {
	httpMux.HandleFunc("/check", utils.WithConcurrencyLimit(1, func(w http.ResponseWriter, req *http.Request) {
		m.lastCheck.Store(time.Now().Unix())
		stats := m.Probe.GetAndFlush()
		utils.WriteAsJSON(req, w, stats, utils.GetPrettyPrintFromQueryParams(req))
	}))

	return nil
}
2 changes: 2 additions & 0 deletions cmd/system-probe/modules/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ var moduleOrder = []types.ModuleName{
config.InjectorModule,
config.NoisyNeighborModule,
config.LogonDurationModule,
config.LockContentionCheckModule,
config.SyscallLatencyCheckModule,
}

// nolint: deadcode, unused // may be unused with certain build tag combinations
Expand Down
68 changes: 68 additions & 0 deletions cmd/system-probe/modules/syscall_latency_check.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2025-present Datadog, Inc.

//go:build linux && linux_bpf

package modules

import (
"fmt"
"net/http"
"sync/atomic"
"time"

"github.com/DataDog/datadog-agent/pkg/collector/corechecks/ebpf/probe/syscalllatency"
"github.com/DataDog/datadog-agent/pkg/ebpf"
"github.com/DataDog/datadog-agent/pkg/system-probe/api/module"
"github.com/DataDog/datadog-agent/pkg/system-probe/config"
sysconfigtypes "github.com/DataDog/datadog-agent/pkg/system-probe/config/types"
"github.com/DataDog/datadog-agent/pkg/system-probe/utils"
"github.com/DataDog/datadog-agent/pkg/util/log"
)

func init() { registerModule(SyscallLatencyCheck) }

// SyscallLatencyCheck is the system-probe module factory for the syscall
// latency check. Building the module loads the eBPF probe; a probe failure
// is wrapped and returned so the module is not registered.
var SyscallLatencyCheck = &module.Factory{
	Name: config.SyscallLatencyCheckModule,
	// The module is backed by an eBPF probe, so eBPF support is mandatory.
	NeedsEBPF: func() bool { return true },
	Fn: func(_ *sysconfigtypes.Config, _ module.FactoryDependencies) (module.Module, error) {
		log.Infof("Starting the syscall latency check module")

		probe, err := syscalllatency.NewProbe(ebpf.NewConfig())
		if err != nil {
			return nil, fmt.Errorf("unable to start the syscall latency probe: %w", err)
		}

		// lastCheck starts at zero and is updated on every /check request.
		mod := &syscallLatencyModule{
			Probe:     probe,
			lastCheck: new(atomic.Int64),
		}
		return mod, nil
	},
}

// Compile-time check that the pointer type satisfies module.Module.
var _ module.Module = &syscallLatencyModule{}

// syscallLatencyModule wraps the syscall latency eBPF probe and exposes it
// as a system-probe module.
type syscallLatencyModule struct {
// Embedded probe; its methods are promoted onto the module.
*syscalllatency.Probe
// lastCheck holds the Unix time (seconds) of the most recent /check request.
lastCheck *atomic.Int64
}

// GetStats implements module.Module.
//
// The returned map has a single "last_check" entry holding the Unix
// timestamp (seconds) of the latest /check request, or 0 if there has been
// none yet.
func (m *syscallLatencyModule) GetStats() map[string]interface{} {
	lastCheck := m.lastCheck.Load()

	stats := make(map[string]interface{}, 1)
	stats["last_check"] = lastCheck
	return stats
}

// Register implements module.Module.
//
// It mounts the /check endpoint on httpMux. Serving a request stamps
// lastCheck with the current Unix time, drains the probe through
// GetAndFlush and writes the result as JSON. The handler is limited to one
// in-flight request at a time. Register itself always returns nil.
func (m *syscallLatencyModule) Register(httpMux *module.Router) error {
	serveCheck := func(w http.ResponseWriter, req *http.Request) {
		now := time.Now().Unix()
		m.lastCheck.Store(now)

		flushed := m.Probe.GetAndFlush()
		pretty := utils.GetPrettyPrintFromQueryParams(req)
		utils.WriteAsJSON(req, w, flushed, pretty)
	}

	httpMux.HandleFunc("/check", utils.WithConcurrencyLimit(1, serveCheck))
	return nil
}
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,8 @@ require (
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
go.uber.org/goleak v1.3.0
go.uber.org/zap/exp v0.3.0
go.yaml.in/yaml/v2 v2.4.3
go.yaml.in/yaml/v3 v3.0.4 // indirect
go4.org/unsafe/assume-no-moving-gc v0.0.0-20231121144256-b99613f794b6 // indirect
golang.org/x/exp/typeparams v0.0.0-20251125195548-87e1e737ad39 // indirect
golang.org/x/lint v0.0.0-20241112194109-818c5a804067 // indirect
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef __LOCK_CONTENTION_CHECK_KERN_USER_H
#define __LOCK_CONTENTION_CHECK_KERN_USER_H

#include "ktypes.h"

// lock_type_key_t classifies kernel lock types derived from LCB_F_* flags
// in the contention_begin tracepoint.
//
// The values are used as indices into the lock_contention_stats per-CPU
// array, so they must stay dense and start at 0. LOCK_TYPE_MAX is the number
// of entries in that array, not a lock type of its own.
//
// NOTE(review): upstream perf appears to pair LCB_F_PERCPU with READ/WRITE
// for percpu-rwsem rather than with a spinning lock; confirm that the
// PCPU_SPINLOCK bucket name is what is intended.
typedef enum {
LOCK_TYPE_SPINLOCK = 0,
LOCK_TYPE_MUTEX = 1,
LOCK_TYPE_RWSEM_READ = 2,
LOCK_TYPE_RWSEM_WRITE = 3,
LOCK_TYPE_RWLOCK_READ = 4,
LOCK_TYPE_RWLOCK_WRITE = 5,
LOCK_TYPE_RT_MUTEX = 6,
LOCK_TYPE_PCPU_SPINLOCK = 7,
LOCK_TYPE_OTHER = 8,
LOCK_TYPE_MAX = 9,
} lock_type_key_t;

// lock_contention_stats_t holds aggregated contention statistics per lock type.
// Stored in a per-CPU array map indexed by lock_type_key_t.
// The BPF side only ever adds to total_time_ns and count and raises
// max_time_ns; it never resets any field itself.
typedef struct {
__u64 total_time_ns; // cumulative nanoseconds spent waiting
__u64 count; // number of contention events
__u64 max_time_ns; // max single-event wait time (reset per flush interval)
} lock_contention_stats_t;

#endif
166 changes: 166 additions & 0 deletions pkg/collector/corechecks/ebpf/c/runtime/lock-contention-check-kern.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#include "vmlinux.h"
#include "bpf_helpers.h"
#include "bpf_tracing.h"
#include "map-defs.h"
#include "lock-contention-check-kern-user.h"
#include "bpf_metadata.h"
#include "bpf_telemetry.h"

/* Capacity of the per-TID tstamp hash map, i.e. the maximum number of
 * threads that can have an in-flight sleeping-lock contention tracked at
 * the same time. */
#define MAX_TSTAMP_ENTRIES 8192

/* Lock contention flags from include/trace/events/lock.h.
 * They are passed as the second argument of the contention_begin tracepoint
 * and may be combined (e.g. LCB_F_SPIN | LCB_F_READ). */
#define LCB_F_SPIN (1U << 0)
#define LCB_F_READ (1U << 1)
#define LCB_F_WRITE (1U << 2)
#define LCB_F_RT (1U << 3)
#define LCB_F_PERCPU (1U << 4)
#define LCB_F_MUTEX (1U << 5)

/* Per-task timestamp data stored in contention_begin, consumed in contention_end.
 * The same layout is used for both the per-CPU slot and the per-TID hash
 * entries. */
struct tstamp_data {
__u64 timestamp_ns; /* bpf_ktime_get_ns() value taken at contention_begin */
__u64 lock; /* lock address — non-zero means slot is occupied */
__u32 flags; /* LCB_F_* flags seen at contention_begin, used to classify the lock at contention_end */
};

/* Per-TID hash map for sleeping lock timestamps (mutex, rwsem, rt_mutex).
 * Must NOT be per-CPU: a task blocked on a sleeping lock can be migrated to
 * another CPU, so contention_end may run on a different CPU than
 * contention_begin. */
BPF_HASH_MAP(tstamp, __u32, struct tstamp_data, MAX_TSTAMP_ENTRIES)

/* Per-CPU array for spinlock timestamps (one slot per CPU, preemption disabled) */
BPF_PERCPU_ARRAY_MAP(tstamp_cpu, struct tstamp_data, 1)

/* Per-CPU array of aggregated stats, indexed by lock_type_key_t */
BPF_PERCPU_ARRAY_MAP(lock_contention_stats, lock_contention_stats_t, LOCK_TYPE_MAX)

/* Classify LCB_F_* flags into lock_type_key_t.
 *
 * LCB_F_MUTEX is tested before LCB_F_SPIN: the mutex slow path reports its
 * optimistic-spin phase as LCB_F_MUTEX | LCB_F_SPIN, and that contention
 * belongs to the mutex bucket, not the spinlock one.
 *
 * Returns one of the LOCK_TYPE_* values strictly below LOCK_TYPE_MAX, so the
 * result is always a valid index into lock_contention_stats.
 */
static __always_inline __u32 classify_lock_type(__u32 flags) {
    /* Any mutex contention, whether spinning or sleeping. */
    if (flags & LCB_F_MUTEX)
        return LOCK_TYPE_MUTEX;

    /* Pure spinning locks: spinlock and the rwlock variants. */
    if (flags & LCB_F_SPIN) {
        if (flags & LCB_F_PERCPU)
            return LOCK_TYPE_PCPU_SPINLOCK;
        if (flags & LCB_F_READ)
            return LOCK_TYPE_RWLOCK_READ;
        if (flags & LCB_F_WRITE)
            return LOCK_TYPE_RWLOCK_WRITE;
        return LOCK_TYPE_SPINLOCK;
    }

    /* Remaining sleeping locks. */
    if (flags & LCB_F_RT)
        return LOCK_TYPE_RT_MUTEX;
    if (flags & LCB_F_READ)
        return LOCK_TYPE_RWSEM_READ;
    if (flags & LCB_F_WRITE)
        return LOCK_TYPE_RWSEM_WRITE;

    /* flags == 0: pre-6.2 kernels where mutex has no dedicated flag */
    if (flags == 0)
        return LOCK_TYPE_MUTEX;

    return LOCK_TYPE_OTHER;
}

/* Get or create the timestamp slot for a contention_begin event.
 *
 * Pure spinning locks (spinlock, rwlock) use the single per-CPU slot, because
 * they run with preemption disabled and therefore end on the CPU they
 * started on. Everything else uses the per-TID hash map.
 *
 * A mutex in its optimistic-spin phase is reported as
 * LCB_F_MUTEX | LCB_F_SPIN. It must NOT take the per-CPU slot: if spinning
 * fails the task goes to sleep and may finish on another CPU, which would
 * leave the original CPU's slot occupied forever and block all later
 * tracking on that CPU. The condition below mirrors upstream perf
 * (lock_contention.bpf.c).
 *
 * Returns NULL when the slot is already occupied (nested contention is not
 * tracked, the outermost event wins) or when a hash entry cannot be created.
 */
static __always_inline struct tstamp_data *get_tstamp_elem(__u32 flags) {
    struct tstamp_data *pelem;

    /* Per-CPU slot only for spinlock/rwlock, never for a spinning mutex. */
    if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
        __u32 idx = 0;

        pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
        /* Do not overwrite for nested lock contention */
        if (pelem && pelem->lock)
            return NULL;
        return pelem;
    }

    /* The lower 32 bits of pid_tgid are the thread id. */
    __u32 tid = bpf_get_current_pid_tgid();

    pelem = bpf_map_lookup_elem(&tstamp, &tid);
    /* Do not overwrite for nested lock contention */
    if (pelem && pelem->lock)
        return NULL;

    if (pelem == NULL) {
        struct tstamp_data zero = {};

        /* BPF_NOEXIST: never clobber an entry created concurrently. */
        if (bpf_map_update_elem(&tstamp, &tid, &zero, BPF_NOEXIST) < 0)
            return NULL;

        pelem = bpf_map_lookup_elem(&tstamp, &tid);
        if (pelem == NULL)
            return NULL;
    }
    return pelem;
}

/* contention_begin handler: ctx[0] is the lock address, ctx[1] the LCB_F_*
 * flags. Records when the wait started in the slot chosen by
 * get_tstamp_elem(); does nothing when no slot is available (nested
 * contention or map failure). Always returns 0. */
SEC("tp_btf/contention_begin")
int tracepoint__contention_begin(u64 *ctx)
{
    const __u32 lcb_flags = (__u32)ctx[1];
    struct tstamp_data *slot = get_tstamp_elem(lcb_flags);

    if (!slot)
        return 0;

    /* A non-zero lock address marks the slot as occupied. */
    slot->timestamp_ns = bpf_ktime_get_ns();
    slot->lock = ctx[0];
    slot->flags = lcb_flags;

    return 0;
}

/* contention_end handler: ctx[0] is the lock address.
 * Finds the timestamp recorded by contention_begin for this lock, adds the
 * elapsed wait time to the per-lock-type stats, then releases the timestamp
 * slot. Events with no matching begin record are ignored. Always returns 0. */
SEC("tp_btf/contention_end")
int tracepoint__contention_end(u64 *ctx)
{
struct tstamp_data *pelem;
__u32 tid = 0, idx = 0;
/* Set only when pelem comes from the per-TID hash, whose entry must be removed. */
bool need_delete = false;
__u64 duration;

/*
* contention_end does not carry flags, so we cannot know whether the
* lock was a spinlock or sleeping lock from the tracepoint args alone.
*
* Strategy (same as upstream perf lock contention):
* 1. Check per-CPU map first (spinlocks cannot sleep, so if there's
* an active entry it must be for this event).
* 2. If no per-CPU entry, check per-TID hash (sleeping locks).
* 3. Verify the lock address matches.
*/
pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
if (pelem && pelem->lock) {
/* An active per-CPU entry for another lock: skip without consulting the hash. */
if (pelem->lock != (__u64)ctx[0])
return 0;
} else {
tid = bpf_get_current_pid_tgid();
pelem = bpf_map_lookup_elem(&tstamp, &tid);
if (!pelem || pelem->lock != (__u64)ctx[0])
return 0;
need_delete = true;
}

duration = bpf_ktime_get_ns() - pelem->timestamp_ns;
/* A negative delta means the end time is before the recorded start;
* drop the sample but still release the slot. */
if ((__s64)duration < 0) {
pelem->lock = 0;
if (need_delete)
bpf_map_delete_elem(&tstamp, &tid);
return 0;
}

/* Classify and update stats */
__u32 lock_type = classify_lock_type(pelem->flags);
lock_contention_stats_t *stats = bpf_map_lookup_elem(&lock_contention_stats, &lock_type);
if (stats) {
stats->total_time_ns += duration;
stats->count += 1;
/* max_time_ns: not atomic, but acceptable — worst case we miss
* an update, which is fine for a gauge that resets each interval */
if (stats->max_time_ns < duration)
stats->max_time_ns = duration;
}

/* Clear the timestamp slot */
pelem->lock = 0;
if (need_delete)
bpf_map_delete_elem(&tstamp, &tid);
return 0;
}

char _license[] SEC("license") = "GPL";
Loading
Loading