@@ -7,6 +7,10 @@ import (
7
7
"fmt"
8
8
"os"
9
9
"path/filepath"
10
+ "sync/atomic"
11
+ "time"
12
+
13
+ "github.com/cenkalti/backoff/v4"
10
14
11
15
"github.com/go-logr/zapr"
12
16
"go.uber.org/zap"
47
51
Name : "sqlite_busy" ,
48
52
Help : "Number of operations that failed because sqlite was busy" ,
49
53
})
54
+
55
+ activeUpdatesGauge = promauto .NewGauge (prometheus.GaugeOpts {
56
+ Name : "session_manager_active_updates" , // Metric name
57
+ Help : "Number of active updates" , // Metric description
58
+ })
50
59
)
51
60
52
61
// GetDDL return the DDL for the database.
@@ -63,6 +72,9 @@ type SessionUpdater func(session *logspb.Session) error
63
72
type SessionsManager struct {
64
73
queries * fsql.Queries
65
74
db * sql.DB
75
+ // Keep track of the number of concurrent calls to Update.
76
+ // This is intended to try to track down why SQLITE_BUSY errors is so frequent.
77
+ activeUpdates atomic.Int32
66
78
}
67
79
68
80
func NewSessionsManager (db * sql.DB ) (* SessionsManager , error ) {
@@ -81,6 +93,20 @@ func NewSessionsManager(db *sql.DB) (*SessionsManager, error) {
81
93
log := zapr .NewLogger (zap .L ())
82
94
log .Info ("sqlite busy_timeout set" , "timeout" , 5000 )
83
95
96
+ if _ , err := db .Exec ("PRAGMA busy_timeout = 10000;" ); err != nil {
97
+ return nil , errors .Wrapf (err , "Failed to set busy timeout for the database" )
98
+ }
99
+
100
+ // Activate WAL mode. This hopefully helps with SQLITE_BUSY errors and contention by using a separate file
101
+ // to log writes.
102
+ // https://www.sqlite.org/wal.html#:~:text=One%20has%20merely%20to%20run,set%20on%20any%20one%20connection.
103
+ // This mode is supposedly persistent: the next time the application opens the database it should still be in WAL mode.
104
+ output , err := db .Exec ("PRAGMA journal_mode=WAL;" )
105
+ log .Info ("Set journal mode to WAL" , "output" , output )
106
+ if err != nil {
107
+ return nil , errors .Wrapf (err , "Failed to set journal mode to WAL" )
108
+ }
109
+
84
110
// Create the dbtx from the actual database
85
111
queries := fsql .New (db )
86
112
@@ -117,98 +143,127 @@ func (db *SessionsManager) Get(ctx context.Context, contextID string) (*logspb.S
117
143
// inserted if the updateFunc returns nil. If the session already exists then the session is passed to updateFunc
118
144
// and the updated value is then written to the database
119
145
func (db * SessionsManager ) Update (ctx context.Context , contextID string , updateFunc SessionUpdater ) error {
146
+ // Increment the counter when entering the function
147
+ numActive := db .activeUpdates .Add (1 )
148
+ defer func () {
149
+ // Decrement the counter when leaving the function
150
+ value := db .activeUpdates .Add (- 1 )
151
+ activeUpdatesGauge .Set (float64 (value ))
152
+ }()
153
+
120
154
log := logs .FromContext (ctx )
121
155
if contextID == "" {
122
156
return errors .WithStack (errors .New ("contextID must be non-empty" ))
123
157
}
124
158
log = log .WithValues ("contextId" , contextID )
125
159
126
- sessCounter .WithLabelValues ("start" ).Inc ()
127
-
128
- tx , err := db .db .BeginTx (ctx , & sql.TxOptions {})
129
- if err != nil {
130
- sessCounter .WithLabelValues ("failedstart" ).Inc ()
131
- return errors .Wrapf (err , "Failed to start transaction" )
160
+ // Intended to track whether SQLITE_BUSY errors are correlated with the number of concurrent calls to Update.
161
+ if numActive > 1 {
162
+ log .Info ("Concurrent Session Updates" , "numActive" , numActive )
132
163
}
133
164
134
- err = func () error {
135
- queries := db .queries .WithTx (tx )
136
- // Read the record
137
- sessRow , err := queries .GetSession (ctx , contextID )
165
+ activeUpdatesGauge .Set (float64 (numActive ))
166
+ sessCounter .WithLabelValues ("start" ).Inc ()
138
167
139
- // If the session doesn't exist then we do nothing because session is initializeed to empty session
140
- session := & logspb.Session {
141
- ContextId : contextID ,
142
- }
143
- if err != nil {
144
- logDBErrors (ctx , err )
145
- if err != sql .ErrNoRows {
146
- sessCounter .WithLabelValues ("failedget" ).Inc ()
147
- return errors .Wrapf (err , "Failed to get session with id %v" , contextID )
168
+ // Wrap the updates in a retry loop. This is intended to deal with SQLITE_BUSY errors and other possible sources
169
+ // of contention
170
+ b := backoff .NewExponentialBackOff (backoff .WithMaxElapsedTime (5 * time .Minute ), backoff .WithMaxInterval (30 * time .Second ))
171
+ for {
172
+ err := func () error {
173
+ tx , err := db .db .BeginTx (ctx , & sql.TxOptions {})
174
+ if err != nil {
175
+ // See https://go.dev/doc/database/execute-transactions We do not to issue a rollback if BeginTx fails
176
+ sessCounter .WithLabelValues ("failedstart" ).Inc ()
177
+ return errors .Wrapf (err , "Failed to start transaction" )
148
178
}
149
- // ErrNoRows means the session doesn't exist so we just continue with the empty session
150
- } else {
151
- // Deserialize the proto
152
- if err := proto .Unmarshal (sessRow .Proto , session ); err != nil {
153
- return errors .Wrapf (err , "Failed to deserialize session" )
179
+
180
+ // Ensure Rollback gets called.
181
+ // This is a null op if the transaction has already been committed or rolled back.
182
+ defer func () {
183
+ if err := tx .Rollback (); err != nil {
184
+ log .Error (err , "Failed to rollback transaction" )
185
+ }
186
+ }()
187
+
188
+ queries := db .queries .WithTx (tx )
189
+ // Read the record
190
+ sessRow , err := queries .GetSession (ctx , contextID )
191
+
192
+ // If the session doesn't exist then we do nothing because session is initializeed to empty session
193
+ session := & logspb.Session {
194
+ ContextId : contextID ,
195
+ }
196
+ if err != nil {
197
+ logDBErrors (ctx , err )
198
+ if err != sql .ErrNoRows {
199
+ sessCounter .WithLabelValues ("failedget" ).Inc ()
200
+ return errors .Wrapf (err , "Failed to get session with id %v" , contextID )
201
+ }
202
+ // ErrNoRows means the session doesn't exist so we just continue with the empty session
203
+ } else {
204
+ // Deserialize the proto
205
+ if err := proto .Unmarshal (sessRow .Proto , session ); err != nil {
206
+ return errors .Wrapf (err , "Failed to deserialize session" )
207
+ }
154
208
}
155
- }
156
209
157
- sessCounter .WithLabelValues ("callupdatefunc" ).Inc ()
210
+ sessCounter .WithLabelValues ("callupdatefunc" ).Inc ()
158
211
159
- if err := updateFunc (session ); err != nil {
160
- return errors .Wrapf (err , "Failed to update session" )
161
- }
212
+ if err := updateFunc (session ); err != nil {
213
+ return errors .Wrapf (err , "Failed to update session" )
214
+ }
162
215
163
- newRow , err := protoToRow (session )
164
- if err != nil {
165
- return errors .Wrapf (err , "Failed to convert session proto to table row" )
166
- }
216
+ newRow , err := protoToRow (session )
217
+ if err != nil {
218
+ return errors .Wrapf (err , "Failed to convert session proto to table row" )
219
+ }
167
220
168
- if newRow .Contextid != contextID {
169
- return errors .WithStack (errors .Errorf ("contextID in session doesn't match contextID. Update was called with contextID: %v but session has contextID: %v" , contextID , newRow .Contextid ))
170
- }
221
+ if newRow .Contextid != contextID {
222
+ return errors .WithStack (errors .Errorf ("contextID in session doesn't match contextID. Update was called with contextID: %v but session has contextID: %v" , contextID , newRow .Contextid ))
223
+ }
171
224
172
- update := fsql.UpdateSessionParams {
173
- Contextid : contextID ,
174
- Proto : newRow .Proto ,
175
- Starttime : newRow .Starttime ,
176
- Endtime : newRow .Endtime ,
177
- Selectedid : newRow .Selectedid ,
178
- Selectedkind : newRow .Selectedkind ,
179
- TotalInputTokens : newRow .TotalInputTokens ,
180
- TotalOutputTokens : newRow .TotalOutputTokens ,
181
- NumGenerateTraces : newRow .NumGenerateTraces ,
182
- }
225
+ update := fsql.UpdateSessionParams {
226
+ Contextid : contextID ,
227
+ Proto : newRow .Proto ,
228
+ Starttime : newRow .Starttime ,
229
+ Endtime : newRow .Endtime ,
230
+ Selectedid : newRow .Selectedid ,
231
+ Selectedkind : newRow .Selectedkind ,
232
+ TotalInputTokens : newRow .TotalInputTokens ,
233
+ TotalOutputTokens : newRow .TotalOutputTokens ,
234
+ NumGenerateTraces : newRow .NumGenerateTraces ,
235
+ }
183
236
184
- sessCounter .WithLabelValues ("callupdatesession" ).Inc ()
185
- if err := queries .UpdateSession (ctx , update ); err != nil {
186
- logDBErrors (ctx , err )
187
- return errors .Wrapf (err , "Failed to update session" )
188
- }
189
- return nil
190
- }()
237
+ sessCounter .WithLabelValues ("callupdatesession" ).Inc ()
238
+ if err := queries .UpdateSession (ctx , update ); err != nil {
239
+ logDBErrors (ctx , err )
240
+ return errors .Wrapf (err , "Failed to update session" )
241
+ }
191
242
192
- if err == nil {
193
- if err := tx .Commit (); err != nil {
194
- logDBErrors (ctx , err )
195
- log .Error (err , "Failed to commit transaction" )
196
- sessCounter .WithLabelValues ("commitfail" ).Inc ()
197
- return errors .Wrapf (err , "Failed to commit transaction" )
243
+ if err := tx .Commit (); err != nil {
244
+ logDBErrors (ctx , err )
245
+ log .Error (err , "Failed to commit transaction" )
246
+ sessCounter .WithLabelValues ("commitfail" ).Inc ()
247
+ return errors .Wrapf (err , "Failed to commit transaction" )
248
+ }
249
+ sessCounter .WithLabelValues ("success" ).Inc ()
250
+ return nil
251
+ }()
252
+
253
+ if err == nil {
254
+ sessCounter .WithLabelValues ("done" ).Inc ()
255
+ return nil
198
256
}
199
- sessCounter . WithLabelValues ( "success" ). Inc ()
200
- } else {
201
- logDBErrors ( ctx , err )
202
- sessCounter .WithLabelValues ("fail " ).Inc ()
203
- log . Error ( err , "Failed to update session" )
204
- if txErr := tx . Rollback (); txErr != nil {
205
- log . Error ( txErr , "Failed to rollback transaction" )
257
+
258
+ wait := b . NextBackOff ()
259
+ if wait == backoff . Stop {
260
+ sessCounter .WithLabelValues ("done " ).Inc ()
261
+ err := errors . Errorf ( "Failed to update session for contextId %s" , contextID )
262
+ log . Error ( err , "Failed to update session" )
263
+ return err
206
264
}
207
- return err
265
+ time . Sleep ( wait )
208
266
}
209
-
210
- sessCounter .WithLabelValues ("done" ).Inc ()
211
- return nil
212
267
}
213
268
214
269
func (m * SessionsManager ) GetSession (ctx context.Context , request * connect.Request [logspb.GetSessionRequest ]) (* connect.Response [logspb.GetSessionResponse ], error ) {
0 commit comments