Skip to content

Commit d9d54c7

Browse files
midwell, Edvin Lindqvist, gab-arrobo
authored
Fix DRSM nil pointer crashes in distributed deployments (#197)
* drsm: fix nil pointer crashes in change stream handler

  Add defensive nil checks to prevent crashes when processing MongoDB change stream events in distributed deployments. Fixes three crash scenarios:

  1. Empty owner field - skip to prevent resource leaks
  2. Nil chunk pointer - chunk not yet in global table (out-of-order events)
  3. Nil pod pointer - pod not yet registered locally

  Skip invalid updates with warning logs rather than crashing. Eventual consistency is maintained by the periodic checkAllChunks() resync (3 seconds). Tested with multiple instances during pod failures and network partitions.

  Signed-off-by: Edvin Lindqvist <[email protected]>

* simplified owner check and centralized podChunks verification/initialization

  Co-authored-by: Arrobo, Gabriel <[email protected]>
  Signed-off-by: Edvin Lindqvist <[email protected]>

* Stepped up version

  Signed-off-by: Edvin Lindqvist <[email protected]>

---------

Signed-off-by: Edvin Lindqvist <[email protected]>
Co-authored-by: Edvin Lindqvist <[email protected]>
Co-authored-by: Arrobo, Gabriel <[email protected]>
1 parent 0960089 commit d9d54c7

File tree

2 files changed: 26 additions and 3 deletions (+26 -3 lines changed)

VERSION

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
1.5.7-dev
1+
1.5.7

drsm/updates.go

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,12 @@ func (d *Drsm) handleDbUpdates() {
110110
}
111111
}
112112

113+
func (d *Drsm) ensurePodChunksInitialized(podD *podData) {
114+
if podD.podChunks == nil {
115+
podD.podChunks = make(map[int32]*chunk)
116+
}
117+
}
118+
113119
func iterateChangeStream(d *Drsm, routineCtx context.Context, stream *mongo.ChangeStream) {
114120
logger.DrsmLog.Debugf("iterate change stream for podData: %v", d)
115121

@@ -154,15 +160,32 @@ func iterateChangeStream(d *Drsm, routineCtx context.Context, stream *mongo.Chan
154160
// update on chunkId..
155161
// looks like chunk owner getting change
156162
owner := s.Update.UpdFields.PodId
163+
if owner == "" {
164+
logger.DrsmLog.Warnf("stream(Update): missing owner in update for doc %s, operation: %+v", s.DId.Id, s.Update)
165+
continue
166+
}
157167
c := getChunkIdFromDocId(s.DId.Id)
158168
d.globalChunkTblMutex.Lock()
159169
cp := d.globalChunkTbl[c]
160170
d.globalChunkTblMutex.Unlock()
171+
if cp == nil {
172+
logger.DrsmLog.Warnf("stream(Update): chunk %d not found in global table for owner %s - will be corrected by periodic resync", c, owner)
173+
// Without a chunk reference there is nothing to update; skip to avoid panic.
174+
// The periodic checkAllChunks() will resync state from MongoDB.
175+
continue
176+
}
161177
// TODO update IP address as well.
162178
cp.Owner.PodName = owner
163179
cp.Owner.PodIp = s.Update.UpdFields.PodIp
164180
cp.Owner.PodInstance = s.Update.UpdFields.PodInstance
165-
podD := d.podMap[owner]
181+
podD, found := d.podMap[owner]
182+
if !found {
183+
logger.DrsmLog.Warnf("stream(Update): pod %s not in local map for chunk %d update - will be corrected when keepalive arrives or during periodic resync", owner, c)
184+
// Wait for proper pod initialization via keepalive. Eventual consistency will be maintained by periodic resync and proper keepalive events.
185+
continue
186+
}
187+
// Defensive: should never happen if addPod() was called, but prevents panic
188+
d.ensurePodChunksInitialized(podD)
166189
podD.podChunks[c] = cp // add chunk to pod
167190
logger.DrsmLog.Infof("stream(Update): pod to chunk map %v", podD.podChunks)
168191
}
@@ -270,7 +293,7 @@ func (d *Drsm) addChunk(full *FullStream) {
270293
func (d *Drsm) addPod(full *FullStream) *podData {
271294
podI := PodId{PodName: full.PodId, PodInstance: full.PodInstance, PodIp: full.PodIp}
272295
pod := &podData{PodId: podI}
273-
pod.podChunks = make(map[int32]*chunk)
296+
d.ensurePodChunksInitialized(pod)
274297
d.podMap[full.PodId] = pod
275298
logger.DrsmLog.Infof("keepalive insert d.podMaps %v", d.podMap)
276299
return pod

0 commit comments

Comments
 (0)