Commit 39f1c90

Merge pull request kubernetes#129735 from swatisehgal/device-mgr-logs-improvements
Device Manager logging improvements
2 parents 994a469 + f449697
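The changes apply the Kubernetes structured-logging conventions consistently across the Device Manager: errors move from klog.InfoS with an "err" key to klog.ErrorS with the error as a dedicated first argument, expected conditions previously logged via klog.ErrorS(nil, ...) are downgraded to klog.InfoS, messages are capitalized, chatty allocation-path messages move to higher verbosity levels, and pod/container context keys are added. A minimal, self-contained sketch of the klog/v2 calls involved (the resource and checkpoint names below are placeholders, not taken from this commit):

```go
package main

import (
	"errors"
	"flag"

	"k8s.io/klog/v2"
)

func main() {
	// klog registers -v and related flags on the standard flag set.
	klog.InitFlags(nil)
	flag.Set("v", "4")
	flag.Parse()
	defer klog.Flush()

	// Structured info log: message followed by key/value pairs.
	klog.InfoS("Device plugin disconnected", "resource", "vendor.example.com/gpu")

	// Error log: the error is a dedicated first argument rather than an
	// "err" key, so every backend renders it uniformly at error severity.
	err := errors.New("checkpoint write failed")
	klog.ErrorS(err, "Failed to write checkpoint file")

	// Verbosity-gated log: emitted only when running with -v=4 or higher.
	klog.V(4).InfoS("Checkpoint file written", "checkpoint", "placeholder_checkpoint")
}
```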

File tree

5 files changed: +39 −31 lines
pkg/kubelet/cm/devicemanager/manager.go

Lines changed: 20 additions & 17 deletions
@@ -351,7 +351,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
 	// Loads in allocatedDevices information from disk.
 	err := m.readCheckpoint()
 	if err != nil {
-		klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
+		klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
 	}
 
 	return m.server.Start()
@@ -453,7 +453,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 		// should always be consistent. Otherwise, we run with the risk
 		// of failing to garbage collect non-existing resources or devices.
 		if !ok {
-			klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
+			klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
 		}
 		delete(m.endpoints, resourceName)
 		delete(m.healthyDevices, resourceName)
@@ -468,7 +468,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 		eI, ok := m.endpoints[resourceName]
 		if (ok && eI.e.stopGracePeriodExpired()) || !ok {
 			if !ok {
-				klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
+				klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
 			}
 			delete(m.endpoints, resourceName)
 			delete(m.unhealthyDevices, resourceName)
@@ -484,7 +484,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
 	m.mutex.Unlock()
 	if needsUpdateCheckpoint {
 		if err := m.writeCheckpoint(); err != nil {
-			klog.ErrorS(err, "Error on writing checkpoint")
+			klog.ErrorS(err, "Failed to write checkpoint file")
 		}
 	}
 	return capacity, allocatable, deletedResources.UnsortedList()
@@ -503,9 +503,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
 	err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
 	if err != nil {
 		err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
-		klog.InfoS("Failed to write checkpoint file", "err", err)
+		klog.ErrorS(err, "Failed to write checkpoint file")
 		return err2
 	}
+	klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
 
@@ -516,7 +517,7 @@ func (m *ManagerImpl) readCheckpoint() error {
 	if err != nil {
 		if err == errors.ErrCheckpointNotFound {
 			// no point in trying anything else
-			klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
+			klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
 			return nil
 		}
 		return err
@@ -534,6 +535,8 @@ func (m *ManagerImpl) readCheckpoint() error {
 		m.unhealthyDevices[resource] = sets.New[string]()
 		m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
 	}
+
+	klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
 	return nil
 }
 
@@ -596,7 +599,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
 	// because it already has access to all the required devices, so we got nothing to do and we can bail out.
 	if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
-		klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		return nil, nil
 	}
 
@@ -627,7 +630,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
 	// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
 	// We cover container restart on kubelet steady state with the same flow.
 	if needed == 0 {
-		klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
+		klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
 		// No change, no work.
 		return nil, nil
 	}
@@ -836,7 +839,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 	for k, v := range container.Resources.Limits {
 		resource := string(k)
 		needed := int(v.Value())
-		klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
+		klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
 		if !m.isDevicePluginResource(resource) {
 			continue
 		}
@@ -882,7 +885,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
 		devs := allocDevices.UnsortedList()
 		// TODO: refactor this part of code to just append a ContainerAllocationRequest
 		// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
-		klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
+		klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
 		resp, err := eI.e.allocate(devs)
 		metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
 		if err != nil {
@@ -952,7 +955,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
 		}
 
 		if !m.checkPodActive(pod) {
-			klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
+			klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
			continue
 		}
 
@@ -984,7 +987,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
 
 	if eI.opts == nil || !eI.opts.PreStartRequired {
 		m.mutex.Unlock()
-		klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
 		return nil
 	}
 
@@ -1014,12 +1017,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
 	}
 
 	if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
-		klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
+		klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
 		return nil, nil
 	}
 
 	m.mutex.Unlock()
-	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
+	klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
 	resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
 	m.mutex.Lock()
 	if err != nil {
@@ -1167,19 +1170,19 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
 func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
 	cntID, err := m.containerMap.GetContainerID(podUID, cntName)
 	if err != nil {
-		klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
+		klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
 		return false
 	}
 
 	// note that if container runtime is down when kubelet restarts, this set will be empty,
 	// so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
 	// This scenario should however be rare enough.
 	if !m.containerRunningSet.Has(cntID) {
-		klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+		klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
 		return false
 	}
 
 	// Once we make it here we know we have a running container.
-	klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
+	klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
 	return true
 }
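Several of the rewritten manager.go log lines attach the pod via klog.KObj, which formats an API object as namespace/name only when the line is actually emitted. A small sketch under assumed fixtures (the pod, resource, and container names here are invented for illustration):

```go
package main

import (
	"flag"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
)

func main() {
	klog.InitFlags(nil)
	flag.Set("v", "3")
	flag.Parse()
	defer klog.Flush()

	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "gpu-pod"}}

	// klog.KObj defers formatting: the pod is rendered as "default/gpu-pod"
	// only if the V(3) check passes, keeping filtered-out lines cheap.
	klog.V(3).InfoS("Looking for needed resources",
		"resourceName", "vendor.example.com/gpu",
		"pod", klog.KObj(pod),
		"containerName", "main",
		"needed", 1)
}
```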

pkg/kubelet/cm/devicemanager/plugin/v1beta1/client.go

Lines changed: 2 additions & 0 deletions
@@ -106,6 +106,8 @@ func (c *client) Disconnect() error {
 	}
 	c.mutex.Unlock()
 	c.handler.PluginDisconnected(c.resource)
+
+	klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource)
 	return nil
 }

pkg/kubelet/cm/devicemanager/plugin/v1beta1/handler.go

Lines changed: 3 additions & 1 deletion
@@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s
 		return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
 	}
 
+	klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
 	return nil
 }
 
@@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error {
 		return err
 	}
 
+	klog.V(2).InfoS("Connected to new client", "resource", name)
 	go func() {
 		s.runClient(name, c)
 	}()
@@ -112,7 +114,7 @@ func (s *server) runClient(name string, c Client) {
 	}
 
 	if err := s.disconnectClient(name, c); err != nil {
-		klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
+		klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c)
 	}
 }

pkg/kubelet/cm/devicemanager/plugin/v1beta1/server.go

Lines changed: 6 additions & 5 deletions
@@ -91,7 +91,7 @@ func (s *server) Start() error {
 
 	if selinux.GetEnabled() {
 		if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
-			klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
+			klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir)
 		}
 	}
 
@@ -128,7 +128,7 @@ func (s *server) Start() error {
 func (s *server) Stop() error {
 	s.visitClients(func(r string, c Client) {
 		if err := s.disconnectClient(r, c); err != nil {
-			klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
+			klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r)
 		}
 	})
 
@@ -145,6 +145,7 @@ func (s *server) Stop() error {
 	// During kubelet termination, we do not need the registration server,
 	// and we consider the kubelet to be healthy even when it is down.
 	s.setHealthy()
+	klog.V(2).InfoS("Stopping device plugin registration server")
 
 	return nil
 }
@@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp
 
 	if !s.isVersionCompatibleWithPlugin(r.Version) {
 		err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
-		klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
 		return &api.Empty{}, err
 	}
 
 	if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
 		err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
-		klog.InfoS("Bad registration request from device plugin", "err", err)
+		klog.ErrorS(err, "Bad registration request from device plugin")
 		return &api.Empty{}, err
 	}
 
 	if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
-		klog.InfoS("Error connecting to device plugin client", "err", err)
+		klog.ErrorS(err, "Error connecting to device plugin client")
 		return &api.Empty{}, err
 	}
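Register keeps a construct-log-return shape throughout: the error is built once with fmt.Errorf, logged through klog.ErrorS, and the same value is returned to the caller. A standalone sketch of that pattern (the version list and resource name are stand-ins for api.SupportedVersions and the plugin's registration request):

```go
package main

import (
	"fmt"

	"k8s.io/klog/v2"
)

// supportedVersions stands in for api.SupportedVersions in the kubelet.
var supportedVersions = []string{"v1beta1"}

func isVersionCompatible(v string) bool {
	for _, s := range supportedVersions {
		if s == v {
			return true
		}
	}
	return false
}

// register mirrors the shape of server.Register: build the error once,
// log it at error severity with context, and return it unchanged.
func register(version, resourceName string) error {
	if !isVersionCompatible(version) {
		err := fmt.Errorf("unsupported API version: %q (supported: %v)", version, supportedVersions)
		klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", resourceName)
		return err
	}
	return nil
}

func main() {
	_ = register("v1alpha1", "vendor.example.com/gpu")
	klog.Flush()
}
```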

pkg/kubelet/cm/devicemanager/topology_hints.go

Lines changed: 8 additions & 8 deletions
@@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
@@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
 		available := m.getAvailableDevices(resource)
 		reusable := m.devicesToReuse[string(pod.UID)][resource]
 		if available.Union(reusable).Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}
@@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 	for resource, requested := range accumulatedResourceRequests {
 		// Only consider devices that actually contain topology information.
 		if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
-			klog.InfoS("Resource does not have a topology preference", "resource", resource)
+			klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
 			deviceHints[resource] = nil
 			continue
 		}
@@ -105,19 +105,19 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
 		allocated := m.podDevices.podDevices(string(pod.UID), resource)
 		if allocated.Len() > 0 {
 			if allocated.Len() != requested {
-				klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
+				klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
 				deviceHints[resource] = []topologymanager.TopologyHint{}
 				continue
 			}
-			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
+			klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
 			deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
 			continue
 		}
 
 		// Get the list of available devices, for which TopologyHints should be generated.
 		available := m.getAvailableDevices(resource)
 		if available.Len() < requested {
-			klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
+			klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
 			deviceHints[resource] = []topologymanager.TopologyHint{}
 			continue
 		}
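The topology-hints changes replace klog.ErrorS(nil, ...) with klog.InfoS for conditions the kubelet handles gracefully, and normalize the key from "resource" to "resourceName". ErrorS with a nil error still emits an error-severity line (an E line in klog's text format), which is misleading for an expected state. A short sketch of the difference (the resource name and counts are illustrative):

```go
package main

import "k8s.io/klog/v2"

func main() {
	defer klog.Flush()

	// Old style: error severity even though there is no actual error value.
	klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable",
		"resourceName", "vendor.example.com/gpu", "request", 2, "available", 1)

	// New style: plain info severity for an expected, handled condition.
	klog.InfoS("Unable to generate topology hints: requested number of devices unavailable",
		"resourceName", "vendor.example.com/gpu", "request", 2, "available", 1)
}
```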
