@@ -295,10 +295,117 @@ func TestInitializeDatabaseStoreConnector(t *testing.T) {
295295 }()
296296
297297 ringBuffer := ringbuffer .NewRingBuffer ("test" , context .Background ())
298- connector , err := InitializeDatabaseStoreConnector (context .Background (), ringBuffer , "" )
298+ connector , err := InitializeDatabaseStoreConnector (context .Background (), ringBuffer , "" , 3 )
299299
300300 require .Error (t , err )
301301 require .Nil (t , connector )
302302 require .Contains (t , err .Error (), "NODE_NAME is not set" )
303303 })
304304}
305+
306+ // TestMessageRetriedOnMongoDBFailure verifies that
307+ // messages are retried with exponential backoff when MongoDB write fails.
308+ func TestMessageRetriedOnMongoDBFailure (t * testing.T ) {
309+ ctx , cancel := context .WithCancel (context .Background ())
310+ defer cancel ()
311+
312+ ringBuffer := ringbuffer .NewRingBuffer ("testRetryBehavior" , ctx ,
313+ ringbuffer .WithRetryConfig (10 * time .Millisecond , 50 * time .Millisecond ))
314+ nodeName := "testNode"
315+ mockClient := & mockDatabaseClient {}
316+
317+ // First 2 calls fail, 3rd call succeeds
318+ mockClient .On ("InsertMany" , mock .Anything , mock .Anything ).
319+ Return (nil , errors .New ("MongoDB temporarily unavailable" )).Times (2 )
320+ mockClient .On ("InsertMany" , mock .Anything , mock .Anything ).
321+ Return (& client.InsertManyResult {InsertedIDs : []interface {}{"id1" }}, nil ).Once ()
322+
323+ connector := & DatabaseStoreConnector {
324+ databaseClient : mockClient ,
325+ ringBuffer : ringBuffer ,
326+ nodeName : nodeName ,
327+ maxRetries : 3 ,
328+ }
329+
330+ healthEvent := & protos.HealthEvent {
331+ NodeName : "gpu-node-1" ,
332+ GeneratedTimestamp : timestamppb .New (time .Now ()),
333+ CheckName : "GpuXidError" ,
334+ ErrorCode : []string {"79" }, // GPU fell off the bus
335+ IsFatal : true ,
336+ IsHealthy : false ,
337+ }
338+
339+ healthEvents := & protos.HealthEvents {
340+ Events : []* protos.HealthEvent {healthEvent },
341+ }
342+
343+ ringBuffer .Enqueue (healthEvents )
344+ require .Equal (t , 1 , ringBuffer .CurrentLength (), "Event should be in queue" )
345+ go connector .FetchAndProcessHealthMetric (ctx )
346+
347+ require .Eventually (t , func () bool {
348+ return ringBuffer .CurrentLength () == 0
349+ }, 500 * time .Millisecond , 10 * time .Millisecond , "Queue should be empty after successful retry" )
350+
351+ // Give a bit more time for all async operations to complete
352+ time .Sleep (100 * time .Millisecond )
353+
354+ // Verify correct number of retry attempts
355+ mockClient .AssertNumberOfCalls (t , "InsertMany" , 3 )
356+ cancel ()
357+ }
358+
359+ // TestMessageDroppedAfterMaxRetries verifies that messages are eventually dropped
360+ // after exceeding the maximum retry count to prevent unbounded memory growth.
361+ func TestMessageDroppedAfterMaxRetries (t * testing.T ) {
362+ ctx , cancel := context .WithCancel (context .Background ())
363+ defer cancel ()
364+
365+ ringBuffer := ringbuffer .NewRingBuffer ("testMaxRetries" , ctx ,
366+ ringbuffer .WithRetryConfig (10 * time .Millisecond , 50 * time .Millisecond ))
367+ nodeName := "testNode"
368+ mockClient := & mockDatabaseClient {}
369+
370+ // Always fail to simulate persistent MongoDB outage
371+ mockClient .On ("InsertMany" , mock .Anything , mock .Anything ).Return (
372+ (* client .InsertManyResult )(nil ),
373+ errors .New ("MongoDB permanently down" ),
374+ )
375+
376+ connector := & DatabaseStoreConnector {
377+ databaseClient : mockClient ,
378+ ringBuffer : ringBuffer ,
379+ nodeName : nodeName ,
380+ maxRetries : 3 ,
381+ }
382+
383+ healthEvent := & protos.HealthEvent {
384+ NodeName : "gpu-node-1" ,
385+ GeneratedTimestamp : timestamppb .New (time .Now ()),
386+ CheckName : "GpuXidError" ,
387+ ErrorCode : []string {"79" },
388+ IsFatal : true ,
389+ IsHealthy : false ,
390+ }
391+
392+ healthEvents := & protos.HealthEvents {
393+ Events : []* protos.HealthEvent {healthEvent },
394+ }
395+
396+ ringBuffer .Enqueue (healthEvents )
397+ require .Equal (t , 1 , ringBuffer .CurrentLength ())
398+
399+ go connector .FetchAndProcessHealthMetric (ctx )
400+
401+ require .Eventually (t , func () bool {
402+ return ringBuffer .CurrentLength () == 0
403+ }, 500 * time .Millisecond , 10 * time .Millisecond , "Event should be dropped after max retries" )
404+
405+ // Give enough time for the final retry attempt to complete
406+ time .Sleep (100 * time .Millisecond )
407+
408+ // Verify we attempted initial call plus 3 retries (4 total)
409+ mockClient .AssertNumberOfCalls (t , "InsertMany" , 4 )
410+ cancel ()
411+ }
0 commit comments