@@ -44,6 +44,7 @@ func NewWatchDog(pm ProcessManager, timeoutBusy, timeoutIdle time.Duration, busy
4444 busyCheck : busy ,
4545 idleCheck : idle ,
4646 addressModelMap : make (map [string ]string ),
47+ stop : make (chan bool , 1 ),
4748 }
4849}
4950
@@ -104,18 +105,18 @@ func (wd *WatchDog) Run() {
104105
105106func (wd * WatchDog ) checkIdle () {
106107 wd .Lock ()
107- defer wd .Unlock ()
108108 log .Debug ().Msg ("[WatchDog] Watchdog checks for idle connections" )
109+
110+ // Collect models to shut down while holding the lock
111+ var modelsToShutdown []string
109112 for address , t := range wd .idleTime {
110113 log .Debug ().Msgf ("[WatchDog] %s: idle connection" , address )
111114 if time .Since (t ) > wd .idletimeout {
112115 log .Warn ().Msgf ("[WatchDog] Address %s is idle for too long, killing it" , address )
113116 model , ok := wd .addressModelMap [address ]
114117 if ok {
115- if err := wd .pm .ShutdownModel (model ); err != nil {
116- log .Error ().Err (err ).Str ("model" , model ).Msg ("[watchdog] error shutting down model" )
117- }
118- log .Debug ().Msgf ("[WatchDog] model shut down: %s" , address )
118+ modelsToShutdown = append (modelsToShutdown , model )
119+ // Clean up the maps while we have the lock
119120 delete (wd .idleTime , address )
120121 delete (wd .addressModelMap , address )
121122 delete (wd .addressMap , address )
@@ -125,25 +126,32 @@ func (wd *WatchDog) checkIdle() {
125126 }
126127 }
127128 }
129+ wd .Unlock ()
130+
131+ // Now shut down models without holding the watchdog lock to prevent deadlock
132+ for _ , model := range modelsToShutdown {
133+ if err := wd .pm .ShutdownModel (model ); err != nil {
134+ log .Error ().Err (err ).Str ("model" , model ).Msg ("[watchdog] error shutting down model" )
135+ }
136+ log .Debug ().Msgf ("[WatchDog] model shut down: %s" , model )
137+ }
128138}
129139
130140func (wd * WatchDog ) checkBusy () {
131141 wd .Lock ()
132- defer wd .Unlock ()
133142 log .Debug ().Msg ("[WatchDog] Watchdog checks for busy connections" )
134143
144+ // Collect models to shut down while holding the lock
145+ var modelsToShutdown []string
135146 for address , t := range wd .timetable {
136147 log .Debug ().Msgf ("[WatchDog] %s: active connection" , address )
137148
138149 if time .Since (t ) > wd .timeout {
139-
140150 model , ok := wd .addressModelMap [address ]
141151 if ok {
142152 log .Warn ().Msgf ("[WatchDog] Model %s is busy for too long, killing it" , model )
143- if err := wd .pm .ShutdownModel (model ); err != nil {
144- log .Error ().Err (err ).Str ("model" , model ).Msg ("[watchdog] error shutting down model" )
145- }
146- log .Debug ().Msgf ("[WatchDog] model shut down: %s" , address )
153+ modelsToShutdown = append (modelsToShutdown , model )
154+ // Clean up the maps while we have the lock
147155 delete (wd .timetable , address )
148156 delete (wd .addressModelMap , address )
149157 delete (wd .addressMap , address )
@@ -153,4 +161,13 @@ func (wd *WatchDog) checkBusy() {
153161 }
154162 }
155163 }
164+ wd .Unlock ()
165+
166+ // Now shut down models without holding the watchdog lock to prevent deadlock
167+ for _ , model := range modelsToShutdown {
168+ if err := wd .pm .ShutdownModel (model ); err != nil {
169+ log .Error ().Err (err ).Str ("model" , model ).Msg ("[watchdog] error shutting down model" )
170+ }
171+ log .Debug ().Msgf ("[WatchDog] model shut down: %s" , model )
172+ }
156173}
0 commit comments