@@ -15,7 +15,6 @@ import (
15
15
"github.com/grafana/pyroscope/ebpf/pprof"
16
16
"github.com/grafana/pyroscope/ebpf/sd"
17
17
"github.com/grafana/pyroscope/ebpf/symtab"
18
- "github.com/oklog/run"
19
18
20
19
"github.com/grafana/alloy/internal/component"
21
20
"github.com/grafana/alloy/internal/component/pyroscope"
@@ -61,7 +60,7 @@ func New(opts component.Options, args Arguments) (component.Component, error) {
61
60
args : args ,
62
61
targetFinder : targetFinder ,
63
62
session : session ,
64
- argsUpdate : make (chan Arguments ),
63
+ argsUpdate : make (chan Arguments , 4 ),
65
64
}
66
65
res .metrics .targetsActive .Set (float64 (len (res .targetFinder .DebugInfo ())))
67
66
return res , nil
@@ -104,48 +103,89 @@ type Component struct {
104
103
debugInfo DebugInfo
105
104
debugInfoLock sync.Mutex
106
105
metrics * metrics
106
+
107
+ healthMut sync.RWMutex
108
+ health component.Health
107
109
}
108
110
109
111
func (c * Component ) Run (ctx context.Context ) error {
110
- err := c .session .Start ()
111
- if err != nil {
112
- return fmt .Errorf ("ebpf profiling session start: %w" , err )
113
- }
114
- defer c .session .Stop ()
115
-
116
- var g run.Group
117
- g .Add (func () error {
118
- collectInterval := c .args .CollectInterval
119
- t := time .NewTicker (collectInterval )
120
- defer t .Stop ()
121
- for {
122
- select {
123
- case <- ctx .Done ():
124
- return nil
125
- case newArgs := <- c .argsUpdate :
126
- c .args = newArgs
127
- c .session .UpdateTargets (targetsOptionFromArgs (c .args ))
128
- c .metrics .targetsActive .Set (float64 (len (c .targetFinder .DebugInfo ())))
129
- err := c .session .Update (convertSessionOptions (c .args , c .metrics ))
130
- if err != nil {
131
- return nil
132
- }
133
- c .appendable .UpdateChildren (newArgs .ForwardTo )
134
- if c .args .CollectInterval != collectInterval {
135
- t .Reset (c .args .CollectInterval )
136
- collectInterval = c .args .CollectInterval
137
- }
138
- case <- t .C :
139
- err := c .collectProfiles ()
112
+ var (
113
+ sessionStarted = false
114
+ sessionErrors = 0
115
+ sessionMaxErrors = 3
116
+ )
117
+
118
+ collectInterval := c .args .CollectInterval
119
+ t := time .NewTicker (collectInterval )
120
+ defer t .Stop ()
121
+ for {
122
+ select {
123
+ case <- ctx .Done ():
124
+ return nil
125
+ case newArgs := <- c .argsUpdate :
126
+ // ensure there are no other updates queued. this might happen if the collection takes a very long time
127
+ newArgs = getLatestArgsFromChannel (c .argsUpdate , newArgs )
128
+
129
+ // update targets
130
+ c .args = newArgs
131
+ c .session .UpdateTargets (targetsOptionFromArgs (c .args ))
132
+ c .metrics .targetsActive .Set (float64 (len (c .targetFinder .DebugInfo ())))
133
+ err := c .session .Update (convertSessionOptions (c .args , c .metrics ))
134
+ if err != nil {
135
+ level .Error (c .options .Logger ).Log ("msg" , "failed to update profiling session" , "err" , err )
136
+ c .reportUnhealthy (err )
137
+ continue
138
+ }
139
+ c .appendable .UpdateChildren (newArgs .ForwardTo )
140
+ if c .args .CollectInterval != collectInterval {
141
+ t .Reset (c .args .CollectInterval )
142
+ collectInterval = c .args .CollectInterval
143
+ }
144
+ case <- t .C :
145
+ if ! sessionStarted {
146
+ err := c .session .Start ()
140
147
if err != nil {
141
- c .metrics .profilingSessionsFailingTotal .Inc ()
142
- return err
148
+ sessionErrors ++
149
+ if sessionErrors > sessionMaxErrors {
150
+ level .Error (c .options .Logger ).Log ("msg" , "too many errors starting profiling session, giving up" , "tries" , sessionErrors , "last_error" , err )
151
+ t .Stop ()
152
+ continue
153
+ }
154
+ level .Error (c .options .Logger ).Log ("msg" , "failed to start profiling session" , "err" , err )
155
+ c .reportUnhealthy (err )
156
+ continue
143
157
}
144
- c .updateDebugInfo ()
158
+ sessionErrors = 0
159
+ defer func () {
160
+ c .session .Stop ()
161
+ level .Info (c .options .Logger ).Log ("msg" , "ebpf profiling session stopped" )
162
+ }()
163
+ sessionStarted = true
164
+ level .Info (c .options .Logger ).Log ("msg" , "ebpf profiling session started" )
145
165
}
166
+
167
+ err := c .collectProfiles (ctx )
168
+ if err != nil {
169
+ level .Error (c .options .Logger ).Log ("msg" , "failed to collect profiles" , "err" , err )
170
+ c .reportUnhealthy (err )
171
+ c .metrics .profilingSessionsFailingTotal .Inc ()
172
+ continue
173
+ }
174
+ c .reportHealthy ()
175
+ c .updateDebugInfo ()
176
+ }
177
+ }
178
+ }
179
+
180
+ func getLatestArgsFromChannel [A any ](ch chan A , current A ) A {
181
+ for {
182
+ select {
183
+ case x := <- ch :
184
+ current = x
185
+ default :
186
+ return current
146
187
}
147
- }, func (error ) {})
148
- return g .Run ()
188
+ }
149
189
}
150
190
151
191
func (c * Component ) Update (args component.Arguments ) error {
@@ -154,13 +194,38 @@ func (c *Component) Update(args component.Arguments) error {
154
194
return nil
155
195
}
156
196
197
+ func (c * Component ) reportUnhealthy (err error ) {
198
+ c .healthMut .Lock ()
199
+ defer c .healthMut .Unlock ()
200
+ c .health = component.Health {
201
+ Health : component .HealthTypeUnhealthy ,
202
+ Message : err .Error (),
203
+ UpdateTime : time .Now (),
204
+ }
205
+ }
206
+
207
+ func (c * Component ) reportHealthy () {
208
+ c .healthMut .Lock ()
209
+ defer c .healthMut .Unlock ()
210
+ c .health = component.Health {
211
+ Health : component .HealthTypeHealthy ,
212
+ UpdateTime : time .Now (),
213
+ }
214
+ }
215
+
216
+ func (c * Component ) CurrentHealth () component.Health {
217
+ c .healthMut .RLock ()
218
+ defer c .healthMut .RUnlock ()
219
+ return c .health
220
+ }
221
+
157
222
func (c * Component ) DebugInfo () interface {} {
158
223
c .debugInfoLock .Lock ()
159
224
defer c .debugInfoLock .Unlock ()
160
225
return c .debugInfo
161
226
}
162
227
163
- func (c * Component ) collectProfiles () error {
228
+ func (c * Component ) collectProfiles (ctx context. Context ) error {
164
229
c .metrics .profilingSessionsTotal .Inc ()
165
230
level .Debug (c .options .Logger ).Log ("msg" , "ebpf collectProfiles" )
166
231
args := c .args
@@ -176,6 +241,11 @@ func (c *Component) collectProfiles() error {
176
241
level .Debug (c .options .Logger ).Log ("msg" , "ebpf collectProfiles done" , "profiles" , len (builders .Builders ))
177
242
bytesSent := 0
178
243
for _ , builder := range builders .Builders {
244
+ // check if the context is done
245
+ if ctx .Err () != nil {
246
+ return ctx .Err ()
247
+ }
248
+
179
249
serviceName := builder .Labels .Get ("service_name" )
180
250
c .metrics .pprofsTotal .WithLabelValues (serviceName ).Inc ()
181
251
c .metrics .pprofSamplesTotal .WithLabelValues (serviceName ).Add (float64 (len (builder .Profile .Sample )))
@@ -192,7 +262,7 @@ func (c *Component) collectProfiles() error {
192
262
c .metrics .pprofBytesTotal .WithLabelValues (serviceName ).Add (float64 (len (rawProfile )))
193
263
194
264
samples := []* pyroscope.RawSample {{RawProfile : rawProfile }}
195
- err = appender .Append (context . Background () , builder .Labels , samples )
265
+ err = appender .Append (ctx , builder .Labels , samples )
196
266
if err != nil {
197
267
level .Error (c .options .Logger ).Log ("msg" , "ebpf pprof write" , "err" , err )
198
268
continue
0 commit comments