@@ -56,10 +56,11 @@ type LocalCluster struct {
5656	customTokenizers  string 
5757
5858	// resources 
59- 	dcli    * docker.Client 
60- 	net     cnet 
61- 	zeros   []* zero 
62- 	alphas  []* alpha 
59+ 	dcli      * docker.Client 
60+ 	net       cnet 
61+ 	netMutex  sync.Mutex  // protects network recreation 
62+ 	zeros     []* zero 
63+ 	alphas    []* alpha 
6364}
6465
6566// UpgradeStrategy is an Enum that defines various upgrade strategies 
@@ -167,18 +168,42 @@ func (c *LocalCluster) init() error {
167168
168169func  (c  * LocalCluster ) createNetwork () error  {
169170	c .net .name  =  c .conf .prefix  +  "-net" 
171+ 
172+ 	ctx , cancel  :=  context .WithTimeout (context .Background (), requestTimeout )
173+ 	defer  cancel ()
174+ 
175+ 	// Check if network already exists 
176+ 	existingNet , err  :=  c .dcli .NetworkInspect (ctx , c .net .name , network.InspectOptions {})
177+ 	if  err  ==  nil  {
178+ 		// Network exists, reuse it 
179+ 		log .Printf ("[INFO] reusing existing network %s (ID: %s)" , c .net .name , existingNet .ID )
180+ 		c .net .id  =  existingNet .ID 
181+ 		return  nil 
182+ 	}
183+ 
184+ 	// Network doesn't exist, create it 
170185	opts  :=  network.CreateOptions {
171186		Driver : "bridge" ,
172187		IPAM :   & network.IPAM {Driver : "default" },
173188	}
174189
175- 	ctx , cancel  :=  context .WithTimeout (context .Background (), requestTimeout )
176- 	defer  cancel ()
177- 	network , err  :=  c .dcli .NetworkCreate (ctx , c .net .name , opts )
190+ 	networkResp , err  :=  c .dcli .NetworkCreate (ctx , c .net .name , opts )
178191	if  err  !=  nil  {
192+ 		// If network already exists (race condition), try to inspect and reuse it 
193+ 		if  strings .Contains (err .Error (), "already exists" ) {
194+ 			log .Printf ("[INFO] network %s already exists (race condition), inspecting" , c .net .name )
195+ 			existingNet , inspectErr  :=  c .dcli .NetworkInspect (ctx , c .net .name , network.InspectOptions {})
196+ 			if  inspectErr  ==  nil  {
197+ 				log .Printf ("[INFO] reusing existing network %s (ID: %s)" , c .net .name , existingNet .ID )
198+ 				c .net .id  =  existingNet .ID 
199+ 				return  nil 
200+ 			}
201+ 			// If inspect also fails, return original create error 
202+ 			log .Printf ("[WARNING] failed to inspect network after creation conflict: %v" , inspectErr )
203+ 		}
179204		return  errors .Wrap (err , "error creating network" )
180205	}
181- 	c .net .id  =  network .ID 
206+ 	c .net .id  =  networkResp .ID 
182207
183208	return  nil 
184209}
@@ -256,6 +281,27 @@ func (c *LocalCluster) createContainer(dc dnode) (string, error) {
256281		return  "" , err 
257282	}
258283
284+ 	// Verify the network still exists before creating container 
285+ 	ctx , cancel  :=  context .WithTimeout (context .Background (), requestTimeout )
286+ 	defer  cancel ()
287+ 	if  c .net .id  !=  ""  {
288+ 		_ , err  :=  c .dcli .NetworkInspect (ctx , c .net .id , network.InspectOptions {})
289+ 		if  err  !=  nil  {
290+ 			// Use mutex to prevent multiple goroutines from recreating network simultaneously 
291+ 			c .netMutex .Lock ()
292+ 			// Double-check after acquiring lock - another goroutine may have recreated it 
293+ 			_ , recheckErr  :=  c .dcli .NetworkInspect (ctx , c .net .id , network.InspectOptions {})
294+ 			if  recheckErr  !=  nil  {
295+ 				log .Printf ("[WARNING] network %s (ID: %s) not found, recreating" , c .net .name , c .net .id )
296+ 				if  err  :=  c .createNetwork (); err  !=  nil  {
297+ 					c .netMutex .Unlock ()
298+ 					return  "" , errors .Wrap (err , "error recreating network" )
299+ 				}
300+ 			}
301+ 			c .netMutex .Unlock ()
302+ 		}
303+ 	}
304+ 
259305	cconf  :=  & container.Config {Cmd : cmd , Image : image , WorkingDir : dc .workingDir (), ExposedPorts : dc .ports ()}
260306	hconf  :=  & container.HostConfig {Mounts : mts , PublishAllPorts : true , PortBindings : dc .bindings (c .conf .portOffset )}
261307	networkConfig  :=  & network.NetworkingConfig {
@@ -267,8 +313,6 @@ func (c *LocalCluster) createContainer(dc dnode) (string, error) {
267313		},
268314	}
269315
270- 	ctx , cancel  :=  context .WithTimeout (context .Background (), requestTimeout )
271- 	defer  cancel ()
272316	resp , err  :=  c .dcli .ContainerCreate (ctx , cconf , hconf , networkConfig , nil , dc .cname ())
273317	if  err  !=  nil  {
274318		return  "" , errors .Wrapf (err , "error creating container %v" , dc .cname ())
@@ -394,16 +438,28 @@ func (c *LocalCluster) cleanupDocker() error {
394438	// Prune containers 
395439	contsReport , err  :=  c .dcli .ContainersPrune (ctx , filters.Args {})
396440	if  err  !=  nil  {
397- 		log .Fatalf ("[ERROR] Error pruning containers: %v" , err )
441+ 		// Don't fail if prune is already running - just skip it 
442+ 		if  strings .Contains (err .Error (), "already running" ) {
443+ 			log .Printf ("[WARNING] Skipping container prune - operation already running" )
444+ 		} else  {
445+ 			log .Printf ("[WARNING] Error pruning containers: %v" , err )
446+ 		}
447+ 	} else  {
448+ 		log .Printf ("[INFO] Pruned containers: %+v\n " , contsReport )
398449	}
399- 	log .Printf ("[INFO] Pruned containers: %+v\n " , contsReport )
400450
401451	// Prune networks 
402452	netsReport , err  :=  c .dcli .NetworksPrune (ctx , filters.Args {})
403453	if  err  !=  nil  {
404- 		log .Fatalf ("[ERROR] Error pruning networks: %v" , err )
454+ 		// Don't fail if prune is already running - just skip it 
455+ 		if  strings .Contains (err .Error (), "already running" ) {
456+ 			log .Printf ("[WARNING] Skipping network prune - operation already running" )
457+ 		} else  {
458+ 			log .Printf ("[WARNING] Error pruning networks: %v" , err )
459+ 		}
460+ 	} else  {
461+ 		log .Printf ("[INFO] Pruned networks: %+v\n " , netsReport )
405462	}
406- 	log .Printf ("[INFO] Pruned networks: %+v\n " , netsReport )
407463
408464	return  nil 
409465}
@@ -493,6 +549,22 @@ func (c *LocalCluster) StartAlpha(id int) error {
493549func  (c  * LocalCluster ) startContainer (dc  dnode ) error  {
494550	ctx , cancel  :=  context .WithTimeout (context .Background (), requestTimeout )
495551	defer  cancel ()
552+ 
553+ 	// verify the container still exists 
554+ 	_ , err  :=  c .dcli .ContainerInspect (ctx , dc .cid ())
555+ 	if  err  !=  nil  {
556+ 		log .Printf ("[WARNING] container %s (ID: %s) not found, attempting to recreate" , dc .cname (), dc .cid ())
557+ 		newCID , createErr  :=  c .createContainer (dc )
558+ 		if  createErr  !=  nil  {
559+ 			return  errors .Wrapf (createErr , "error recreating missing container [%v]" , dc .cname ())
560+ 		}
561+ 		switch  node  :=  dc .(type ) {
562+ 		case  * alpha , * zero :
563+ 			node .setContainerID (newCID )
564+ 		}
565+ 		log .Printf ("[INFO] successfully recreated container %s with new ID: %s" , dc .cname (), newCID )
566+ 	}
567+ 
496568	if  err  :=  c .dcli .ContainerStart (ctx , dc .cid (), container.StartOptions {}); err  !=  nil  {
497569		return  errors .Wrapf (err , "error starting container [%v]" , dc .cname ())
498570	}
@@ -634,15 +706,15 @@ func (c *LocalCluster) containerHealthCheck(url func(c *LocalCluster) (string, e
634706
635707		req , err  :=  http .NewRequest (http .MethodGet , endpoint , nil )
636708		if  err  !=  nil  {
637- 			if  attempt  >  10  {
638- 				log .Printf ("[WARNING] error  building req for endpoint [%v], err: [%v]" , endpoint , err )
709+ 			if  attempt  >  50  {
710+ 				log .Printf ("[WARNING] problem  building req for endpoint [%v], err: [%v]" , endpoint , err )
639711			}
640712			continue 
641713		}
642714		body , err  :=  dgraphapi .DoReq (req )
643715		if  err  !=  nil  {
644- 			if  attempt  >  10  {
645- 				log .Printf ("[WARNING] error  hitting health endpoint [%v], err: [%v]" , endpoint , err )
716+ 			if  attempt  >  50  {
717+ 				log .Printf ("[WARNING] problem  hitting health endpoint [%v], err: [%v]" , endpoint , err )
646718			}
647719			continue 
648720		}
@@ -691,8 +763,8 @@ func (c *LocalCluster) waitUntilLogin() error {
691763			log .Printf ("[INFO] login succeeded" )
692764			return  nil 
693765		}
694- 		if  attempt  >  10  {
695- 			log .Printf ("[WARNING] error  trying to login: %v" , err )
766+ 		if  attempt  >  5  {
767+ 			log .Printf ("[WARNING] problem  trying to login: %v" , err )
696768		}
697769		time .Sleep (waitDurBeforeRetry )
698770	}
@@ -876,7 +948,7 @@ func (c *LocalCluster) Client() (*dgraphapi.GrpcClient, func(), error) {
876948	cleanup  :=  func () {
877949		for  _ , conn  :=  range  conns  {
878950			if  err  :=  conn .Close (); err  !=  nil  {
879- 				log .Printf ("[WARNING] error  closing connection: %v" , err )
951+ 				log .Printf ("[WARNING] problem  closing connection: %v" , err )
880952			}
881953		}
882954	}
@@ -897,7 +969,7 @@ func (c *LocalCluster) AlphaClient(id int) (*dgraphapi.GrpcClient, func(), error
897969	client  :=  dgo .NewDgraphClient (api .NewDgraphClient (conn ))
898970	cleanup  :=  func () {
899971		if  err  :=  conn .Close (); err  !=  nil  {
900- 			log .Printf ("[WARNING] error  closing connection: %v" , err )
972+ 			log .Printf ("[WARNING] problem  closing connection: %v" , err )
901973		}
902974	}
903975	return  & dgraphapi.GrpcClient {Dgraph : client }, cleanup , nil 
0 commit comments