@@ -25,14 +25,18 @@ import (
2525
2626// EventProcessorConfig holds configuration for the unified event processor
2727type EventProcessorConfig struct {
28- // MaxRetries for event processing (0 = no retries, -1 = infinite retries)
28+ // MaxRetries for event processing (used by QueueEventProcessor for retry limit)
29+ // DefaultEventProcessor ignores this and does not retry
30+ // Set to 0 for no retries, -1 for unlimited retries
2931 MaxRetries int
30- // RetryDelay between attempts
31- RetryDelay time.Duration
3232 // EnableMetrics controls whether to track processing metrics
3333 EnableMetrics bool
3434 // MetricsLabels for module-specific metrics categorization
3535 MetricsLabels map [string ]string
36+ // MarkProcessedOnError determines whether to mark events as processed even if handler returns error
37+ // Set to false (default) to preserve event for retry on next restart
38+ // Set to true to skip failed events and continue (use with caution - may lose events)
39+ MarkProcessedOnError bool
3640}
3741
3842// EventProcessor provides a unified interface for processing change stream events
@@ -71,14 +75,6 @@ type DefaultEventProcessor struct {
7175func NewEventProcessor (
7276 watcher ChangeStreamWatcher , dbClient DatabaseClient , config EventProcessorConfig ,
7377) EventProcessor {
74- if config .MaxRetries == 0 {
75- config .MaxRetries = 3 // Default retry attempts
76- }
77-
78- if config .RetryDelay == 0 {
79- config .RetryDelay = time .Second * 2 // Default retry delay
80- }
81-
8278 return & DefaultEventProcessor {
8379 changeStreamWatcher : watcher ,
8480 databaseClient : dbClient ,
@@ -143,73 +139,63 @@ func (p *DefaultEventProcessor) processEvents(ctx context.Context) error {
143139 }
144140}
145141
146- // handleSingleEvent processes a single event with retry logic
142+ // handleSingleEvent processes a single event
143+ // IMPORTANT: Does NOT retry internally - handler is responsible for its own retries if needed
144+ // This prevents retry-induced blocking of the event stream
147145func (p * DefaultEventProcessor ) handleSingleEvent (ctx context.Context , event Event ) error {
148146 startTime := time .Now ()
149147
150148 // Unmarshal the event
151149 var healthEventWithStatus model.HealthEventWithStatus
152150 if err := event .UnmarshalDocument (& healthEventWithStatus ); err != nil {
153151 p .updateMetrics ("unmarshal_error" , "" , time .Since (startTime ), false )
152+ // Unmarshal errors are non-recoverable, mark as processed to skip bad data
153+ if markErr := p .changeStreamWatcher .MarkProcessed (ctx , []byte {}); markErr != nil {
154+ slog .Error ("Failed to mark processed after unmarshal error" , "error" , markErr )
155+ }
154156 return fmt .Errorf ("failed to unmarshal event: %w" , err )
155157 }
156158
157159 eventID , err := event .GetDocumentID ()
158160 if err != nil {
159161 p .updateMetrics ("document_id_error" , "" , time .Since (startTime ), false )
162+ // Document ID errors are non-recoverable, mark as processed
163+ if markErr := p .changeStreamWatcher .MarkProcessed (ctx , []byte {}); markErr != nil {
164+ slog .Error ("Failed to mark processed after document ID error" , "error" , markErr )
165+ }
160166 return fmt .Errorf ("failed to get document ID: %w" , err )
161167 }
162168
163- // DEBUG: Log RAW event immediately after unmarshal, before any business logic
164- if healthEventWithStatus .HealthEvent != nil {
165- slog .Info ("EventProcessor - RAW event received from change stream" ,
166- "eventID" , eventID ,
167- "agent" , healthEventWithStatus .HealthEvent .Agent ,
168- "checkName" , healthEventWithStatus .HealthEvent .CheckName ,
169- "isFatal" , healthEventWithStatus .HealthEvent .IsFatal ,
170- "isHealthy" , healthEventWithStatus .HealthEvent .IsHealthy ,
171- "recommendedAction" , healthEventWithStatus .HealthEvent .RecommendedAction ,
172- "errorCode" , healthEventWithStatus .HealthEvent .ErrorCode ,
173- "nodeName" , healthEventWithStatus .HealthEvent .NodeName ,
174- "message" , healthEventWithStatus .HealthEvent .Message ,
175- "createdAt" , healthEventWithStatus .CreatedAt )
176- }
177-
178169 slog .Debug ("Processing event" , "eventID" , eventID , "event" , healthEventWithStatus )
179170
180- // Process with retry logic
181- var processErr error
182-
183- maxRetries := p .config .MaxRetries
184- if maxRetries < 0 {
185- maxRetries = 1000000 // Treat -1 as infinite retries (large number)
186- }
187-
188- for attempt := 1 ; attempt <= maxRetries ; attempt ++ {
189- slog .Debug ("Processing event" , "attempt" , attempt , "eventID" , eventID , "maxRetries" , p .config .MaxRetries )
190-
191- processErr = p .eventHandler .ProcessEvent (ctx , & healthEventWithStatus )
192- if processErr == nil {
193- // Success
194- p .updateMetrics ("processing_success" , eventID , time .Since (startTime ), true )
195- break
196- }
197-
198- slog .Error ("Event processing failed" , "attempt" , attempt , "eventID" , eventID , "error" , processErr )
199-
200- // Don't sleep on the last attempt
201- if attempt < maxRetries {
202- time .Sleep (p .config .RetryDelay )
203- }
204- }
171+ // Process event - NO internal retries
172+ // Handler is responsible for any retries it needs
173+ processErr := p .eventHandler .ProcessEvent (ctx , & healthEventWithStatus )
205174
206175 if processErr != nil {
207176 p .updateMetrics ("processing_failed" , eventID , time .Since (startTime ), false )
208- slog .Error ("Max retries reached, event processing failed" , "eventID" , eventID , "error" , processErr )
177+ slog .Error ("Event processing failed" , "eventID" , eventID , "error" , processErr )
178+
179+ // CRITICAL FIX: Only mark as processed if configured to do so OR if handler succeeded
180+ if p .config .MarkProcessedOnError {
181+ slog .Warn ("Marking failed event as processed due to MarkProcessedOnError=true" ,
182+ "eventID" , eventID , "error" , processErr )
183+ if markErr := p .changeStreamWatcher .MarkProcessed (ctx , []byte {}); markErr != nil {
184+ slog .Error ("Failed to mark processed after error" , "error" , markErr )
185+ return fmt .Errorf ("failed to mark event as processed: %w" , markErr )
186+ }
187+ } else {
188+ // Do NOT mark as processed - event will be retried on next restart
189+ slog .Error ("Event processing failed, NOT marking as processed - will retry on restart" ,
190+ "eventID" , eventID , "error" , processErr )
191+ return processErr
192+ }
193+
194+ return processErr
209195 }
210196
211- // Always mark as processed to advance the resume token
212- // Note: Passing empty token allows MongoDB watcher to use its internal cursor state
197+ // Success - mark as processed
198+ p . updateMetrics ( "processing_success" , eventID , time . Since ( startTime ), true )
213199 if err := p .changeStreamWatcher .MarkProcessed (ctx , []byte {}); err != nil {
214200 p .updateMetrics ("mark_processed_error" , eventID , time .Since (startTime ), false )
215201 return fmt .Errorf ("failed to mark event as processed: %w" , err )
0 commit comments