Skip to content

Commit bbdf1fb

Browse files
kalleep, stegosaurus21, QuentinBisson, thampiotr, rafaelroquetto
authored
Create v1.11.0-rc.1 (#4465)
* fix(prometheus.exporter.redis): pass MaxDistinctKeyGroups option to exporter (#4463) Changes the `MaxDistinctKeyGroups` configuration option to be passed into the Redis exporter instance. Previously it was accepted in Alloy's configuration but not passed through, which would cause it to default to zero regardless of how Alloy was configured. Note that this would cause a cardinality increase in generated metrics for instances of this component with key group metrics enabled. Signed-off-by: stegosaurus21 <[email protected]> Co-authored-by: Karl Persson <[email protected]> * fix(loki.source.podlogs): enable proper log collection from Kubernetes Jobs and CronJobs (#4349) * fix(loki.source.podlogs): enable proper log collection from Kubernetes Jobs and CronJobs - Add job-aware termination logic with extended grace periods (10-60s) to ensure all job logs are captured - Implement proper handling of pod deletion and race conditions between job completion and controller cleanup - Separate concerns: containerTerminated() for regular pods, shouldStopTailingJobContainer() for jobs - Enhance deduplication mechanisms to prevent duplicate log collection while ensuring comprehensive coverage - Add comprehensive test coverage including unit tests and deduplication validation - Fix race condition where job logs were missed for fast-completing or terminated jobs This resolves the issue where the podlogs component would fail to scrape logs from short-lived or terminated Kubernetes jobs due to premature termination of log collection. * rename container terminated * refactor pod info extraction * fix linting errors * loki.source.journal: dont fail if journal files do not exist (#4462) * Update changelog * Upgrade Beyla to v2.6.5 (#4468) --------- Signed-off-by: stegosaurus21 <[email protected]> Co-authored-by: Jason Liu <[email protected]> Co-authored-by: Quentin Bisson <[email protected]> Co-authored-by: Piotr <[email protected]> Co-authored-by: Rafael Roquetto <[email protected]>
1 parent 4b6e017 commit bbdf1fb

File tree

8 files changed

+678
-53
lines changed

8 files changed

+678
-53
lines changed

CHANGELOG.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ internal API changes are not present.
1010
Main (unreleased)
1111
-----------------
1212

13-
v1.11.0-rc.0
13+
v1.11.0-rc.1
1414
-----------------
1515

1616
### Breaking changes
@@ -176,6 +176,18 @@ v1.11.0-rc.0
176176

177177
- Fix race conditions in `loki.source.syslog` where it could deadlock or cause port bind errors during config reload or shutdown. (@thampiotr)
178178

179+
- **Fix `loki.source.podlogs` component to properly collect logs from Kubernetes Jobs and CronJobs.** Previously, the component would fail to scrape logs from short-lived or terminated jobs due to race conditions between job completion and pod discovery. The fix includes:
180+
- Job-aware termination logic with extended grace periods (10-60 seconds) to ensure all logs are captured
181+
- Proper handling of pod deletion and race conditions between job completion and controller cleanup
182+
- Separation of concerns: `shouldStopTailingContainer()` handles standard Kubernetes restart policies for regular pods, while `shouldStopTailingJobContainer()` handles job-specific lifecycle with grace periods
183+
- Enhanced deduplication mechanisms to prevent duplicate log collection while ensuring comprehensive coverage
184+
- Comprehensive test coverage including unit tests and deduplication validation
185+
This resolves the issue where job logs were being missed, particularly for fast-completing jobs or jobs that terminated before discovery. (@QuentinBisson)
186+
187+
- Fix `prometheus.exporter.redis` component so that it no longer ignores the `MaxDistinctKeyGroups` configuration option. If key group metrics are enabled, this will increase the cardinality of the generated metrics. (@stegosaurus21)
188+
189+
- Fix `loki.source.journal` creation failing with an error when the journal file is not found. (@thampiotr)
190+
179191
v1.10.2
180192
-----------------
181193

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ require (
6363
github.com/gorilla/mux v1.8.1
6464
github.com/grafana/alloy-remote-config v0.0.11
6565
github.com/grafana/alloy/syntax v0.1.0
66-
github.com/grafana/beyla/v2 v2.6.4
66+
github.com/grafana/beyla/v2 v2.6.5
6767
github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20250218151502-6e97feaee761
6868
github.com/grafana/ckit v0.0.0-20250514165824-dd4adf36ad34
6969
github.com/grafana/cloudflare-go v0.0.0-20230110200409-c627cf6792f2
@@ -1068,7 +1068,7 @@ exclude (
10681068
k8s.io/client-go v12.0.0+incompatible
10691069
)
10701070

1071-
replace go.opentelemetry.io/obi => github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.4
1071+
replace go.opentelemetry.io/obi => github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.5
10721072

10731073
replace go.opentelemetry.io/ebpf-profiler => github.com/grafana/opentelemetry-ebpf-profiler v0.0.202537-0.20250916114748-f2ff2fc6048c
10741074

go.sum

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,8 +1124,8 @@ github.com/gosnmp/gosnmp v1.41.0 h1:6RI78g2ZsbLvpvJegcV98LapszRQnbvYNKSa5WbCll4=
11241124
github.com/gosnmp/gosnmp v1.41.0/go.mod h1:CxVS6bXqmWZlafUj9pZUnQX5e4fAltqPcijxWpCitDo=
11251125
github.com/grafana/alloy-remote-config v0.0.11 h1:IwYdQ38Tv/tEZuask8l29fdIcIzUP4HhY+VhMp9lqEk=
11261126
github.com/grafana/alloy-remote-config v0.0.11/go.mod h1:kHE1usYo2WAVCikQkIXuoG1Clz8BSdiz3kF+DZSCQ4k=
1127-
github.com/grafana/beyla/v2 v2.6.4 h1:nN4ljXuETUePYE4n8oOZjvRExaPW8cLjFTznD1hNl1E=
1128-
github.com/grafana/beyla/v2 v2.6.4/go.mod h1:6X62WG9uxJedecIzmAYvK8iJXuM3xq1eRXOyz1fkAa4=
1127+
github.com/grafana/beyla/v2 v2.6.5 h1:w2EiD8T8z2TS9dSG8axF4VGnmTN9qf9mrUoIdkH+efA=
1128+
github.com/grafana/beyla/v2 v2.6.5/go.mod h1:kBxtZcdaDQ/a04ZNsbTii/uRkiQI3lTadhGbZ8gw1lo=
11291129
github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2 h1:ju6EcY2aEobeBg185ETtFCKj5WzaQ48qfkbsSRRQrF4=
11301130
github.com/grafana/cadvisor v0.0.0-20240729082359-1f04a91701e2/go.mod h1:8sLW/G7rcFe1CKMaA4pYT4mX3P1xQVGqM6luzEzx/2g=
11311131
github.com/grafana/catchpoint-prometheus-exporter v0.0.0-20250218151502-6e97feaee761 h1:dPJOIEwtQ8uR3Qa79pb/lsSFJQ6j4P9vpCUQ4fKimG4=
@@ -1166,8 +1166,8 @@ github.com/grafana/node_exporter v0.18.1-grafana-r01.0.20250806062222-612bdf9540
11661166
github.com/grafana/node_exporter v0.18.1-grafana-r01.0.20250806062222-612bdf9540ec/go.mod h1:1qoZ74W1muNrTyeP2fUzUbg2UKD8AeX435xSCJNub7Y=
11671167
github.com/grafana/opentelemetry-collector/featuregate v0.0.0-20240325174506-2fd1623b2ca0 h1:i/Ne0XwoRokYj52ZcSmnvuyID3h/uA91n0Ycg/grHU8=
11681168
github.com/grafana/opentelemetry-collector/featuregate v0.0.0-20240325174506-2fd1623b2ca0/go.mod h1:mm8+xyQfgDmqhyegZRNIQmoKsNnDTwWKFLsdMoXAb7A=
1169-
github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.4 h1:2dIzC3i612KVMiG6A4sq53JS3YrCg5x4SXuP8zcLTrs=
1170-
github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.4/go.mod h1:EFQ1reX/fEsmFeyZn+G/lzMwGcEKlWBOIP8pECIvXZc=
1169+
github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.5 h1:PbcVxweZqvoIb58Yk0BFmCCeXsaqZI05XMHzZyQ8K7o=
1170+
github.com/grafana/opentelemetry-ebpf-instrumentation v1.2.5/go.mod h1:EFQ1reX/fEsmFeyZn+G/lzMwGcEKlWBOIP8pECIvXZc=
11711171
github.com/grafana/opentelemetry-ebpf-profiler v0.0.202537-0.20250916114748-f2ff2fc6048c h1:G+o7MkTXTgpEebJM5ctNMZw8DOPOatRM6lOLeqRPL50=
11721172
github.com/grafana/opentelemetry-ebpf-profiler v0.0.202537-0.20250916114748-f2ff2fc6048c/go.mod h1:ajmdC82d8daScIWPT0Mmq95lvGmoBNMdAUBlWv/Hwbg=
11731173
github.com/grafana/otel-profiling-go v0.5.1 h1:stVPKAFZSa7eGiqbYuG25VcqYksR6iWvF3YH66t4qL8=
@@ -1194,8 +1194,6 @@ github.com/grafana/tail v0.0.0-20230510142333-77b18831edf0 h1:bjh0PVYSVVFxzINqPF
11941194
github.com/grafana/tail v0.0.0-20230510142333-77b18831edf0/go.mod h1:7t5XR+2IA8P2qggOAHTj/GCZfoLBle3OvNSYh1VkRBU=
11951195
github.com/grafana/vmware_exporter v0.0.5-beta.0.20250218170317-73398ba08329 h1:Rs4H1yv2Abk3xE82qpyhMGGA8rswAOA0HQZde/BYkFo=
11961196
github.com/grafana/vmware_exporter v0.0.5-beta.0.20250218170317-73398ba08329/go.mod h1:Z28219aViNlsLlPvuCnlgHDagRdZBAZ7JOnQg1b3eWg=
1197-
github.com/grafana/walqueue v0.0.0-20250915204108-c3ca0631af46 h1:asmom2PA/hWPhzD6lx5+aK9HvqJnYzfB7NqmgVRuTgI=
1198-
github.com/grafana/walqueue v0.0.0-20250915204108-c3ca0631af46/go.mod h1:LJm4P3SayTHSbHBYepsAf3WqlY/gwSYQyMs7OLLAi6A=
11991197
github.com/grafana/walqueue v0.0.0-20250916201216-152b1f10cca2 h1:6HRYKHfWwdIoBF5jvVAYnARFHJiKe+++j1Oxh6A1mNw=
12001198
github.com/grafana/walqueue v0.0.0-20250916201216-152b1f10cca2/go.mod h1:LJm4P3SayTHSbHBYepsAf3WqlY/gwSYQyMs7OLLAi6A=
12011199
github.com/grobie/gomemcache v0.0.0-20230213081705-239240bbc445 h1:FlKQKUYPZ5yDCN248M3R7x8yu2E3yEZ0H7aLomE4EoE=

internal/component/loki/source/journal/journal.go

Lines changed: 67 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ package journal
44

55
import (
66
"context"
7+
"fmt"
78
"os"
89
"path/filepath"
910
"sync"
@@ -12,13 +13,13 @@ import (
1213
"github.com/grafana/loki/v3/clients/pkg/promtail/scrapeconfig"
1314
"github.com/prometheus/common/model"
1415

16+
"github.com/grafana/alloy/internal/component"
1517
"github.com/grafana/alloy/internal/component/common/loki"
1618
"github.com/grafana/alloy/internal/component/common/loki/positions"
1719
alloy_relabel "github.com/grafana/alloy/internal/component/common/relabel"
1820
"github.com/grafana/alloy/internal/component/loki/source/journal/internal/target"
1921
"github.com/grafana/alloy/internal/featuregate"
20-
21-
"github.com/grafana/alloy/internal/component"
22+
"github.com/grafana/alloy/internal/runtime/logging/level"
2223
)
2324

2425
func init() {
@@ -37,13 +38,16 @@ var _ component.Component = (*Component)(nil)
3738

3839
// Component represents reading from a journal
3940
type Component struct {
40-
mut sync.RWMutex
41-
t *target.JournalTarget
42-
metrics *target.Metrics
43-
o component.Options
44-
handler chan loki.Entry
45-
positions positions.Positions
46-
receivers []loki.LogsReceiver
41+
mut sync.RWMutex
42+
t *target.JournalTarget
43+
metrics *target.Metrics
44+
o component.Options
45+
handler chan loki.Entry
46+
positions positions.Positions
47+
receivers []loki.LogsReceiver
48+
argsUpdated chan struct{}
49+
args Arguments
50+
healthErr error
4751
}
4852

4953
// New creates a new component.
@@ -69,11 +73,13 @@ func New(o component.Options, args Arguments) (*Component, error) {
6973
}
7074

7175
c := &Component{
72-
metrics: target.NewMetrics(o.Registerer),
73-
o: o,
74-
handler: make(chan loki.Entry),
75-
positions: positionsFile,
76-
receivers: args.Receivers,
76+
metrics: target.NewMetrics(o.Registerer),
77+
o: o,
78+
handler: make(chan loki.Entry),
79+
positions: positionsFile,
80+
receivers: args.Receivers,
81+
argsUpdated: make(chan struct{}, 1),
82+
args: args,
7783
}
7884
err = c.Update(args)
7985
return c, err
@@ -84,7 +90,10 @@ func (c *Component) Run(ctx context.Context) error {
8490
defer func() {
8591
c.mut.RLock()
8692
if c.t != nil {
87-
c.t.Stop()
93+
err := c.t.Stop()
94+
if err != nil {
95+
level.Warn(c.o.Logger).Log("msg", "error stopping journal target", "err", err)
96+
}
8897
}
8998
c.mut.RUnlock()
9099

@@ -103,6 +112,27 @@ func (c *Component) Run(ctx context.Context) error {
103112
r.Chan() <- lokiEntry
104113
}
105114
c.mut.RUnlock()
115+
case <-c.argsUpdated:
116+
c.mut.Lock()
117+
if c.t != nil {
118+
err := c.t.Stop()
119+
if err != nil {
120+
level.Error(c.o.Logger).Log("msg", "error stopping journal target", "err", err)
121+
}
122+
c.t = nil
123+
}
124+
rcs := alloy_relabel.ComponentToPromRelabelConfigs(c.args.RelabelRules)
125+
entryHandler := loki.NewEntryHandler(c.handler, func() {})
126+
127+
newTarget, err := target.NewJournalTarget(c.metrics, c.o.Logger, entryHandler, c.positions, c.o.ID, rcs, convertArgs(c.o.ID, c.args))
128+
if err != nil {
129+
level.Error(c.o.Logger).Log("msg", "error creating journal target", "err", err, "path", c.args.Path)
130+
c.healthErr = fmt.Errorf("error creating journal target: %w", err)
131+
} else {
132+
c.t = newTarget
133+
c.healthErr = nil
134+
}
135+
c.mut.Unlock()
106136
}
107137
}
108138
}
@@ -112,21 +142,31 @@ func (c *Component) Update(args component.Arguments) error {
112142
newArgs := args.(Arguments)
113143
c.mut.Lock()
114144
defer c.mut.Unlock()
115-
if c.t != nil {
116-
err := c.t.Stop()
117-
if err != nil {
118-
return err
119-
}
145+
c.args = newArgs
146+
select {
147+
case c.argsUpdated <- struct{}{}:
148+
default: // Update notification already sent
120149
}
121-
rcs := alloy_relabel.ComponentToPromRelabelConfigs(newArgs.RelabelRules)
122-
entryHandler := loki.NewEntryHandler(c.handler, func() {})
150+
return nil
151+
}
123152

124-
newTarget, err := target.NewJournalTarget(c.metrics, c.o.Logger, entryHandler, c.positions, c.o.ID, rcs, convertArgs(c.o.ID, newArgs))
125-
if err != nil {
126-
return err
153+
// CurrentHealth implements component.HealthComponent. It returns an unhealthy
154+
// status if the most recent attempt to create the journal target failed.
155+
func (c *Component) CurrentHealth() component.Health {
156+
c.mut.RLock()
157+
defer c.mut.RUnlock()
158+
if c.healthErr == nil {
159+
return component.Health{
160+
Health: component.HealthTypeHealthy,
161+
Message: "journal target is running",
162+
UpdateTime: time.Now(),
163+
}
164+
}
165+
return component.Health{
166+
Health: component.HealthTypeUnhealthy,
167+
Message: c.healthErr.Error(),
168+
UpdateTime: time.Now(),
127169
}
128-
c.t = newTarget
129-
return nil
130170
}
131171

132172
func convertArgs(job string, a Arguments) *scrapeconfig.JournalTargetConfig {

0 commit comments

Comments
 (0)