Skip to content

Commit a3a7611

Browse files
authored
Improve observability; add sqlite retries to deal with sqlite_busy errors (#352)
* Use the pragma `sqlite_busy` to add automatic retries on sqlite busy error * I was seeing high rates of sqlitebusy errors when deployed on K8s on Azure and this was causing learning to happen much less frequently then it should * Clean up the prometheus metrics to support dashboards for monitoring impact. The key metrics we want are * Number of accepted suggestions * Number of executed cells * Number of learned examples * Cleanup up prometheus metrics so we can better monitor why learning isn't happening; this was used to track down sqlite busy errors * Support monitoring with Datadog * With Datadog structured logging we need to use the field "level" for severity or else it won't automatically parse that field * Udpate the configuration to allow the user to control what fields are used in the encoder
1 parent 48cc643 commit a3a7611

File tree

11 files changed

+251
-89
lines changed

11 files changed

+251
-89
lines changed

app/go.mod

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ require (
5454
gonum.org/v1/gonum v0.15.0
5555
google.golang.org/api v0.189.0
5656
google.golang.org/grpc v1.64.1
57-
google.golang.org/protobuf v1.34.2
57+
google.golang.org/protobuf v1.35.2
5858
gopkg.in/yaml.v3 v3.0.1
5959
k8s.io/apimachinery v0.27.3
6060
k8s.io/client-go v1.5.2
@@ -81,8 +81,9 @@ require (
8181
github.com/ProtonMail/go-crypto v1.0.0 // indirect
8282
github.com/beorn7/perks v1.0.1 // indirect
8383
github.com/bmatcuk/doublestar/v4 v4.6.1 // indirect
84+
github.com/bufbuild/connect-go v1.10.0 // indirect
8485
github.com/bytedance/sonic v1.11.3 // indirect
85-
github.com/cespare/xxhash/v2 v2.2.0 // indirect
86+
github.com/cespare/xxhash/v2 v2.3.0 // indirect
8687
github.com/chenzhuoyu/base64x v0.0.0-20230717121745-296ad89f973d // indirect
8788
github.com/chenzhuoyu/iasm v0.9.1 // indirect
8889
github.com/cli/go-gh v1.2.1 // indirect
@@ -100,6 +101,7 @@ require (
100101
github.com/docker/docker v26.1.2+incompatible // indirect
101102
github.com/docker/docker-credential-helpers v0.7.0 // indirect
102103
github.com/dustin/go-humanize v1.0.1 // indirect
104+
github.com/easyCZ/connect-go-prometheus v0.0.1 // indirect
103105
github.com/emirpasic/gods v1.18.1 // indirect
104106
github.com/felixge/httpsnoop v1.0.4 // indirect
105107
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
@@ -136,7 +138,7 @@ require (
136138
github.com/josharian/intern v1.0.0 // indirect
137139
github.com/json-iterator/go v1.1.12 // indirect
138140
github.com/kevinburke/ssh_config v1.2.0 // indirect
139-
github.com/klauspost/compress v1.17.4 // indirect
141+
github.com/klauspost/compress v1.17.9 // indirect
140142
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
141143
github.com/kr/pretty v0.3.1 // indirect
142144
github.com/kr/text v0.2.0 // indirect
@@ -147,23 +149,24 @@ require (
147149
github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 // indirect
148150
github.com/mattn/go-isatty v0.0.20 // indirect
149151
github.com/mattn/go-runewidth v0.0.15 // indirect
150-
github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369 // indirect
152+
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
151153
github.com/mitchellh/go-homedir v1.1.0 // indirect
152154
github.com/mitchellh/mapstructure v1.5.0 // indirect
153155
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
154156
github.com/modern-go/reflect2 v1.0.2 // indirect
155157
github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect
158+
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
156159
github.com/ncruces/go-strftime v0.1.9 // indirect
157160
github.com/opencontainers/go-digest v1.0.0 // indirect
158161
github.com/opencontainers/image-spec v1.1.0 // indirect
159162
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
160163
github.com/pjbgf/sha1cd v0.3.0 // indirect
161164
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
162165
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
163-
github.com/prometheus/client_golang v1.12.0 // indirect
164-
github.com/prometheus/client_model v0.2.1-0.20210607210712-147c58e9608a // indirect
165-
github.com/prometheus/common v0.32.1 // indirect
166-
github.com/prometheus/procfs v0.7.3 // indirect
166+
github.com/prometheus/client_golang v1.20.5 // indirect
167+
github.com/prometheus/client_model v0.6.1 // indirect
168+
github.com/prometheus/common v0.60.1 // indirect
169+
github.com/prometheus/procfs v0.15.1 // indirect
167170
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
168171
github.com/replicate/replicate-go v0.21.0 // indirect
169172
github.com/rivo/uniseg v0.4.7 // indirect
@@ -207,12 +210,12 @@ require (
207210
go.opentelemetry.io/proto/otlp v1.2.0 // indirect
208211
go.uber.org/multierr v1.11.0 // indirect
209212
golang.org/x/arch v0.7.0 // indirect
210-
golang.org/x/crypto v0.25.0 // indirect
211-
golang.org/x/net v0.27.0 // indirect
212-
golang.org/x/oauth2 v0.21.0 // indirect
213-
golang.org/x/sync v0.7.0 // indirect
214-
golang.org/x/sys v0.22.0 // indirect
215-
golang.org/x/text v0.16.0 // indirect
213+
golang.org/x/crypto v0.27.0 // indirect
214+
golang.org/x/net v0.29.0 // indirect
215+
golang.org/x/oauth2 v0.23.0 // indirect
216+
golang.org/x/sync v0.8.0 // indirect
217+
golang.org/x/sys v0.27.0 // indirect
218+
golang.org/x/text v0.18.0 // indirect
216219
golang.org/x/time v0.5.0 // indirect
217220
google.golang.org/genproto v0.0.0-20240722135656-d784300faade // indirect
218221
google.golang.org/genproto/googleapis/api v0.0.0-20240722135656-d784300faade // indirect

app/go.sum

Lines changed: 37 additions & 0 deletions
Large diffs are not rendered by default.

app/pkg/agent/agent.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ import (
66
"strings"
77
"sync"
88

9+
"github.com/prometheus/client_golang/prometheus"
10+
"github.com/prometheus/client_golang/prometheus/promauto"
11+
912
"github.com/jlewi/foyle/app/pkg/runme/ulid"
1013

1114
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1/v1alpha1connect"
@@ -49,6 +52,19 @@ const (
4952
temperature = 0.9
5053
)
5154

55+
var (
56+
executedCounter = promauto.NewCounterVec(prometheus.CounterOpts{
57+
Name: "cells_executed_total",
58+
Help: "Total number of executed cells broken down by status"},
59+
[]string{"status"},
60+
)
61+
62+
acceptedCounter = promauto.NewCounter(prometheus.CounterOpts{
63+
Name: "cells_accepted_total",
64+
Help: "Total number of suggested cells accepted broken down by type"},
65+
)
66+
)
67+
5268
// Agent is the agent.
5369
type Agent struct {
5470
v1alpha1.UnimplementedGenerateServiceServer
@@ -625,6 +641,14 @@ func (a *Agent) LogEvents(ctx context.Context, req *connect.Request[v1alpha1.Log
625641
func() {
626642
_, span := tp.Start(ctx, "LogEvent", trace.WithAttributes(attribute.String("eventType", event.Type.String()), attribute.String("contextId", event.ContextId), attribute.String("selectedCellId", event.SelectedId)))
627643
defer span.End()
644+
645+
switch event.GetType() {
646+
case v1alpha1.LogEventType_ACCEPTED:
647+
648+
acceptedCounter.Inc()
649+
case v1alpha1.LogEventType_EXECUTE:
650+
executedCounter.WithLabelValues(event.GetExecuteStatus().String()).Inc()
651+
}
628652
// N.B we can't use zap.Object to log the event because it contains runme protos which don't have the zap marshaler bindings.
629653
log.Info("LogEvent", "eventId", event.GetEventId(), "eventType", event.Type, "contextId", event.ContextId, "selectedCellId", event.SelectedId, logs.ZapProto("event", event))
630654
}()

app/pkg/analyze/fsql/schema.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,4 @@ CREATE TABLE IF NOT EXISTS results (
3333

3434
-- The JSON serialization of the proto.
3535
proto_json TEXT NOT NULL
36-
);
36+
);

app/pkg/analyze/session_manager.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ import (
88
"os"
99
"path/filepath"
1010

11+
"github.com/go-logr/zapr"
12+
"go.uber.org/zap"
13+
1114
"github.com/prometheus/client_golang/prometheus"
1215
"github.com/prometheus/client_golang/prometheus/promauto"
1316

@@ -69,6 +72,15 @@ func NewSessionsManager(db *sql.DB) (*SessionsManager, error) {
6972
return nil, err
7073
}
7174

75+
// Set busy_timeout using PRAGMA. This is to deal with frequent sqlite busy errors when deployed on
76+
// Azure.
77+
// This is in milliseconds
78+
if _, err := db.Exec("PRAGMA busy_timeout = 10000;"); err != nil {
79+
return nil, errors.Wrapf(err, "Failed to set busy timeout for the database")
80+
}
81+
log := zapr.NewLogger(zap.L())
82+
log.Info("sqlite busy_timeout set", "timeout", 5000)
83+
7284
// Create the dbtx from the actual database
7385
queries := fsql.New(db)
7486

@@ -115,7 +127,6 @@ func (db *SessionsManager) Update(ctx context.Context, contextID string, updateF
115127

116128
tx, err := db.db.BeginTx(ctx, &sql.TxOptions{})
117129
if err != nil {
118-
// DO NOT COMMIT
119130
sessCounter.WithLabelValues("failedstart").Inc()
120131
return errors.Wrapf(err, "Failed to start transaction")
121132
}
@@ -132,7 +143,6 @@ func (db *SessionsManager) Update(ctx context.Context, contextID string, updateF
132143
if err != nil {
133144
logDBErrors(ctx, err)
134145
if err != sql.ErrNoRows {
135-
// DO NOT COMMIT
136146
sessCounter.WithLabelValues("failedget").Inc()
137147
return errors.Wrapf(err, "Failed to get session with id %v", contextID)
138148
}
@@ -144,7 +154,6 @@ func (db *SessionsManager) Update(ctx context.Context, contextID string, updateF
144154
}
145155
}
146156

147-
// DO NOT COMMIT
148157
sessCounter.WithLabelValues("callupdatefunc").Inc()
149158

150159
if err := updateFunc(session); err != nil {
@@ -172,7 +181,6 @@ func (db *SessionsManager) Update(ctx context.Context, contextID string, updateF
172181
NumGenerateTraces: newRow.NumGenerateTraces,
173182
}
174183

175-
// DO NOT COMMIT
176184
sessCounter.WithLabelValues("callupdatesession").Inc()
177185
if err := queries.UpdateSession(ctx, update); err != nil {
178186
logDBErrors(ctx, err)
@@ -199,7 +207,6 @@ func (db *SessionsManager) Update(ctx context.Context, contextID string, updateF
199207
return err
200208
}
201209

202-
// DO NOT COMMIT
203210
sessCounter.WithLabelValues("done").Inc()
204211
return nil
205212
}

app/pkg/application/app.go

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -280,9 +280,27 @@ func (a *App) createCoreForConsole(paths []string) (zapcore.Core, error) {
280280

281281
// Use the keys used by cloud logging
282282
// https://cloud.google.com/logging/docs/structured-logging
283-
c.LevelKey = "severity"
284-
c.TimeKey = "time"
285-
c.MessageKey = "message"
283+
logFields := a.Config.Logging.LogFields
284+
if logFields == nil {
285+
logFields = &config.LogFields{}
286+
}
287+
if logFields.Level != "" {
288+
c.LevelKey = logFields.Level
289+
} else {
290+
c.LevelKey = "severity"
291+
}
292+
293+
if logFields.Time != "" {
294+
c.TimeKey = logFields.Time
295+
} else {
296+
c.TimeKey = "time"
297+
}
298+
299+
if logFields.Message != "" {
300+
c.MessageKey = logFields.Message
301+
} else {
302+
c.MessageKey = "message"
303+
}
286304

287305
lvl := a.Config.GetLogLevel()
288306
zapLvl := zap.NewAtomicLevel()
@@ -333,10 +351,32 @@ func (a *App) createJSONCoreLogger(paths []string) (zapcore.Core, error) {
333351
c := zap.NewProductionEncoderConfig()
334352
// Use the keys used by cloud logging
335353
// https://cloud.google.com/logging/docs/structured-logging
336-
c.LevelKey = "severity"
337-
c.TimeKey = "time"
338-
c.MessageKey = "message"
354+
logFields := a.Config.Logging.LogFields
355+
if logFields == nil {
356+
logFields = &config.LogFields{}
357+
}
358+
if logFields.Level != "" {
359+
c.LevelKey = logFields.Level
360+
} else {
361+
c.LevelKey = "severity"
362+
}
363+
364+
if logFields.Time != "" {
365+
c.TimeKey = logFields.Time
366+
} else {
367+
c.TimeKey = "time"
368+
}
369+
370+
if logFields.Message != "" {
371+
c.MessageKey = logFields.Message
372+
} else {
373+
c.MessageKey = "message"
374+
}
375+
339376
// We attach the function key to the logs because that is useful for identifying the function that generated the log.
377+
// N.B are logs processing depends on this field being present in the logs. This is one reason
378+
// why we don't allow it to be customized to match the field expected by a logging backend like Datadog
379+
// or Cloud Logging
340380
c.FunctionKey = "function"
341381

342382
jsonEncoder := zapcore.NewJSONEncoder(c)

app/pkg/config/config.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,15 @@ type Logging struct {
201201

202202
// MaxDelaySeconds is the maximum delay in seconds to wait before processing the logs.
203203
MaxDelaySeconds int `json:"maxDelaySeconds,omitempty" yaml:"maxDelaySeconds,omitempty"`
204+
205+
LogFields *LogFields `json:"logFields,omitempty" yaml:"logFields,omitempty"`
206+
}
207+
208+
// LogFields is the fields to use when logging to structured logging
209+
type LogFields struct {
210+
Level string `json:"level,omitempty" yaml:"level,omitempty"`
211+
Time string `json:"time,omitempty" yaml:"time,omitempty"`
212+
Message string `json:"message,omitempty" yaml:"message,omitempty"`
204213
}
205214

206215
type LogSink struct {

app/pkg/learn/learner.go

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,32 @@ const (
4141
var (
4242
enqueuedCounter = promauto.NewCounter(prometheus.CounterOpts{
4343
Name: "learner_enqueued_total",
44-
Help: "Total number of enqueued blocks",
44+
Help: "Total number of enqueued sessions for learning",
4545
})
4646

47+
sessFiltered = promauto.NewCounterVec(
48+
prometheus.CounterOpts{
49+
Name: "learner_not_learnable",
50+
Help: "Number of sessions that aren't learnable",
51+
},
52+
[]string{"status"},
53+
)
54+
4755
sessProcessed = promauto.NewCounterVec(
4856
prometheus.CounterOpts{
4957
Name: "learner_sessions_processed",
5058
Help: "Number of sessions processed by the learner",
5159
},
5260
[]string{"status"},
5361
)
62+
63+
learnedCounter = promauto.NewCounterVec(
64+
prometheus.CounterOpts{
65+
Name: "learned_examples_total",
66+
Help: "Number of examples learned",
67+
},
68+
[]string{"status"},
69+
)
5470
)
5571

5672
// Learner handles the learn loop to learn from past mistakes.
@@ -298,7 +314,10 @@ func (l *Learner) Reconcile(ctx context.Context, id string) error {
298314

299315
if len(writeErrors.Causes) > 0 {
300316
writeErrors.Final = errors.New("Not all examples could be successfully reconciled")
317+
learnedCounter.WithLabelValues("error").Inc()
301318
return writeErrors
319+
} else {
320+
learnedCounter.WithLabelValues("success").Inc()
302321
}
303322
return nil
304323
}
@@ -381,26 +400,26 @@ func isLearnable(session *logspb.Session) bool {
381400

382401
if execEvent == nil {
383402
// Since the cell wasn't successfully executed we don't learn from it
384-
sessProcessed.WithLabelValues("noexec").Inc()
403+
sessFiltered.WithLabelValues("noexec").Inc()
385404
return false
386405
}
387406

388407
log := zapr.NewLogger(zap.L())
389408
if session.GetFullContext() == nil {
390-
sessProcessed.WithLabelValues("nocontext").Inc()
409+
sessFiltered.WithLabelValues("nocontext").Inc()
391410
log.Error(errors.New("Session missing fullcontext"), "contextId", session.GetContextId())
392411
return false
393412
}
394413

395414
if session.GetFullContext().GetNotebook() == nil {
396-
sessProcessed.WithLabelValues("nonotebook").Inc()
415+
sessFiltered.WithLabelValues("nonotebook").Inc()
397416
log.Error(errors.New("Session missing notebook"), "contextId", session.GetContextId())
398417
return false
399418
}
400419

401420
if session.GetFullContext().GetSelected() == 0 {
402421
// If its the first cell we can't learn from it because what would we use as context to predict it?
403-
sessProcessed.WithLabelValues("firstcell").Inc()
422+
sessFiltered.WithLabelValues("firstcell").Inc()
404423
return false
405424
}
406425
return true

0 commit comments

Comments
 (0)