@@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
	// Loads in allocatedDevices information from disk.
	err := m.readCheckpoint()
	if err != nil {
-		klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
+		klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
	}

	return m.server.Start()
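
The hunk above replaces klog.InfoS carrying an "err" key with klog.ErrorS, which takes the error as its first argument. A minimal standalone sketch of the difference, assuming only k8s.io/klog/v2 (illustration, not kubelet code):

package main

import (
	"errors"
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	err := errors.New("checkpoint file corrupted") // hypothetical error

	// Before: an error-level condition emitted at info severity, with the
	// error buried in a generic "err" key.
	klog.InfoS("Continue after failing to read checkpoint file", "err", err)

	// After: ErrorS logs at error severity and renders the error under a
	// dedicated err key, so severity-based filtering and structured log
	// backends can pick it up without parsing the message.
	klog.ErrorS(err, "Continue after failing to read checkpoint file")
}
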
@@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
			// should always be consistent. Otherwise, we run with the risk
			// of failing to garbage collect non-existing resources or devices.
			if !ok {
-				klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.healthyDevices, resourceName)
@@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
		eI, ok := m.endpoints[resourceName]
		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
			if !ok {
-				klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
			}
			delete(m.endpoints, resourceName)
			delete(m.unhealthyDevices, resourceName)
@@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
	m.mutex.Unlock()
	if needsUpdateCheckpoint {
		if err := m.writeCheckpoint(); err != nil {
-			klog.ErrorS(err, "Error on writing checkpoint")
+			klog.ErrorS(err, "Failed to write checkpoint file")
		}
	}
	return capacity, allocatable, deletedResources.UnsortedList()
@@ -503,9 +503,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
	err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
	if err != nil {
		err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
-		klog.InfoS("Failed to write checkpoint file", "err", err)
+		klog.ErrorS(err, "Failed to write checkpoint file")
		return err2
	}
+	klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
	return nil
}

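
The new success log is gated at verbosity 4, so it only appears when the kubelet runs with -v=4 or higher. A sketch of that gating, using a hypothetical helper and checkpoint name (assuming k8s.io/klog/v2):

package main

import (
	"flag"

	"k8s.io/klog/v2"
)

// logCheckpointWritten is a hypothetical helper, shown only to illustrate
// the verbosity gating used by the new V(4) success log.
func logCheckpointWritten(name string) {
	// V(4).InfoS is a no-op unless the process runs with -v=4 or higher,
	// keeping the steady-state log quiet by default.
	klog.V(4).InfoS("Checkpoint file written", "checkpoint", name)

	// If building the key/value pairs were expensive, the guard could be
	// made explicit instead:
	if klogV := klog.V(4); klogV.Enabled() {
		klogV.InfoS("Checkpoint file written", "checkpoint", name)
	}
}

func main() {
	klog.InitFlags(nil) // registers -v among other flags
	flag.Parse()
	defer klog.Flush()

	logCheckpointWritten("example_checkpoint")
}
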
@@ -516,7 +517,7 @@ func (m *ManagerImpl) readCheckpoint() error {
	if err != nil {
		if err == errors.ErrCheckpointNotFound {
			// no point in trying anything else
-			klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
+			klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
			return nil
		}
		return err
@@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error {
		m.unhealthyDevices[resource] = sets.New[string]()
		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
	}
+
+	klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
	return nil
}

@@ -596,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
	// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
	// because it already has access to all the required devices, so we got nothing to do and we can bail out.
	if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
-		klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
		return nil, nil
	}

@@ -627,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
	// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
	// We cover container restart on kubelet steady state with the same flow.
	if needed == 0 {
-		klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
		// No change, no work.
		return nil, nil
	}
@@ -836,7 +839,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
	for k, v := range container.Resources.Limits {
		resource := string(k)
		needed := int(v.Value())
-		klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
+		klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
		if !m.isDevicePluginResource(resource) {
			continue
		}
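
The enriched log line threads the pod through klog.KObj, which renders a compact namespace/name reference instead of dumping the whole object. A self-contained sketch with a made-up pod, resource, and container (assuming k8s.io/api, k8s.io/apimachinery, and k8s.io/klog/v2):

package main

import (
	"flag"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	// Hypothetical pod, just to show the rendered reference.
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "gpu-pod", Namespace: "default"}}

	// klog.KObj(pod) is rendered as "default/gpu-pod", keeping the log
	// line short and greppable across components.
	klog.V(3).InfoS("Looking for needed resources",
		"resourceName", "example.com/gpu", // hypothetical extended resource
		"pod", klog.KObj(pod),
		"containerName", "cuda", // hypothetical container
		"needed", 2)
}
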
@@ -882,7 +885,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
		devs := allocDevices.UnsortedList()
		// TODO: refactor this part of code to just append a ContainerAllocationRequest
		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
-		klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
+		klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
		resp, err := eI.e.allocate(devs)
		metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
		if err != nil {
@@ -952,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
		}

		if !m.checkPodActive(pod) {
-			klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
+			klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
			continue
		}

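
This hunk downgrades an expected teardown race (the pod vanished from activePods) from ErrorS with a nil error to a verbosity-5 info line. A sketch contrasting the two, with a hypothetical pod UID (assuming k8s.io/klog/v2):

package main

import (
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Parse()
	defer klog.Flush()

	podUID := "00000000-0000-0000-0000-000000000000" // hypothetical UID

	// Before: ErrorS with a nil error still emits at error severity,
	// alarming operators about what is a normal race during pod deletion.
	klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)

	// After: the same event is a debug-level detail, visible only when
	// troubleshooting with -v=5 or higher.
	klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "podUID", podUID)
}
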
@@ -984,7 +987,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s

	if eI.opts == nil || !eI.opts.PreStartRequired {
		m.mutex.Unlock()
-		klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
		return nil
	}

@@ -1014,12 +1017,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
	}

	if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
-		klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
		return nil, nil
	}

	m.mutex.Unlock()
-	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
+	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
	resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
	m.mutex.Lock()
	if err != nil {
@@ -1167,19 +1170,19 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
	cntID, err := m.containerMap.GetContainerID(podUID, cntName)
	if err != nil {
-		klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
+		klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
		return false
	}

	// note that if container runtime is down when kubelet restarts, this set will be empty,
	// so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
	// This scenario should however be rare enough.
	if !m.containerRunningSet.Has(cntID) {
-		klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+		klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
		return false
	}

	// Once we make it here we know we have a running container.
-	klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+	klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
	return true
}