rebase: update replaced k8s.io modules to v0.33.0

Signed-off-by: Niels de Vos <ndevos@ibm.com>
Niels de Vos
2025-05-07 13:13:33 +02:00
committed by mergify[bot]
parent dd77e72800
commit 107407b44b
1723 changed files with 65035 additions and 175239 deletions

View File

@ -6,6 +6,7 @@ approvers:
- derekwaynecarr
- yujuhong
- klueska
- ffromani
reviewers:
- sig-node-reviewers
emeritus_approvers:

View File

@ -25,11 +25,10 @@ import (
"sync"
"time"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/runc/libcontainer/cgroups/manager"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
libcontainercgroups "github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/cgroups/manager"
cgroupsystemd "github.com/opencontainers/cgroups/systemd"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
@ -195,14 +194,14 @@ func (m *cgroupCommon) buildCgroupPaths(name CgroupName) map[string]string {
}
// libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config.
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup {
config := &libcontainerconfigs.Cgroup{
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainercgroups.Cgroup {
config := &libcontainercgroups.Cgroup{
Systemd: m.useSystemd,
}
if needResources {
config.Resources = m.toResources(in.ResourceParameters)
} else {
config.Resources = &libcontainerconfigs.Resources{}
config.Resources = &libcontainercgroups.Resources{}
}
if !config.Systemd {
@ -279,8 +278,8 @@ var (
availableRootControllers sets.Set[string]
)
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
resources := &libcontainerconfigs.Resources{
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainercgroups.Resources {
resources := &libcontainercgroups.Resources{
SkipDevices: true,
SkipFreezeOnSet: true,
}
@ -324,7 +323,7 @@ func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainer
return resources
}
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) {
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainercgroups.Resources) {
// Check if hugetlb is supported.
if libcontainercgroups.IsCgroup2UnifiedMode() {
if !getSupportedUnifiedControllers().Has("hugetlb") {
@ -344,7 +343,7 @@ func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources
klog.InfoS("Invalid pageSize", "err", err)
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainercgroups.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
@ -355,7 +354,7 @@ func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainercgroups.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
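
As a hedged aside (not part of the diff): the hunks above boil down to a module move, where the cgroup types formerly split across runc's libcontainer/cgroups and libcontainer/configs packages are now consumed from the standalone github.com/opencontainers/cgroups module. A minimal sketch of what a caller looks like after the rename; the paths and values are illustrative.

package main

import (
    "fmt"

    "github.com/opencontainers/cgroups"
    "github.com/opencontainers/cgroups/fscommon"
)

func main() {
    // Cgroup, Resources and HugepageLimit now come from the root cgroups
    // package instead of libcontainer/configs.
    cg := &cgroups.Cgroup{
        Parent: "/",
        Name:   "example",
        Resources: &cgroups.Resources{
            SkipDevices: true,
        },
    }
    fmt.Println(cg.Name, cgroups.IsCgroup2UnifiedMode())

    // The fscommon helpers keep the same call shape under the new import path.
    if val, err := fscommon.GetCgroupParamString("/sys/fs/cgroup", "cgroup.controllers"); err == nil {
        fmt.Println(val)
    }
}
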

View File

@ -22,8 +22,8 @@ import (
"strconv"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroups "github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
)

View File

@ -24,13 +24,17 @@ import (
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const cgroupv2MemLimitFile string = "memory.max"
const (
cgroupv2MemLimitFile = "memory.max"
cgroupv2CpuMaxFile = "cpu.max"
cgroupv2CpuWeightFile = "cpu.weight"
)
// cgroupV2impl implements the CgroupManager interface
// for cgroup v2.
@ -100,14 +104,14 @@ func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName
func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
var cpuLimitStr, cpuPeriodStr string
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, cgroupv2CpuMaxFile)
if err != nil {
return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err)
return nil, fmt.Errorf("failed to read %s file for cgroup %v: %w", cgroupv2CpuMaxFile, cgroupPath, err)
}
numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr)
if errScan != nil || numItems != 2 {
return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w",
cpuLimitAndPeriod, cgroupPath, errScan)
return nil, fmt.Errorf("failed to correctly parse content of %s file ('%s') for cgroup %v: %w",
cgroupv2CpuMaxFile, cpuLimitAndPeriod, cgroupPath, errScan)
}
cpuLimit := int64(-1)
if cpuLimitStr != Cgroup2MaxCpuLimit {
@ -120,7 +124,7 @@ func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, e
if errPeriod != nil {
return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, errPeriod)
}
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, cgroupv2CpuWeightFile)
if errWeight != nil {
return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, errWeight)
}
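
For reference, a small self-contained sketch of the cpu.max format the code above reads: the file holds "<quota> <period>", where quota is either an integer in microseconds or the literal "max" for unlimited. The helper name below is hypothetical.

package main

import (
    "fmt"
    "strconv"
)

// parseCPUMax is a hypothetical helper mirroring the parsing done above.
func parseCPUMax(content string) (limit int64, period uint64, err error) {
    var limitStr, periodStr string
    if n, err := fmt.Sscanf(content, "%s %s", &limitStr, &periodStr); err != nil || n != 2 {
        return 0, 0, fmt.Errorf("unexpected cpu.max content %q: %w", content, err)
    }
    limit = -1 // mirrors "no limit" when the quota field is "max"
    if limitStr != "max" {
        if limit, err = strconv.ParseInt(limitStr, 10, 64); err != nil {
            return 0, 0, err
        }
    }
    if period, err = strconv.ParseUint(periodStr, 10, 64); err != nil {
        return 0, 0, err
    }
    return limit, period, nil
}

func main() {
    fmt.Println(parseCPUMax("max 100000"))   // -1 100000 <nil>
    fmt.Println(parseCPUMax("50000 100000")) // 50000 100000 <nil>
}
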

View File

@ -31,6 +31,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
"k8s.io/klog/v2"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
@ -154,6 +155,13 @@ type ContainerManager interface {
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
// PodHasExclusiveCPUs returns true if the provided pod has containers with exclusive CPUs.
// This means that at least one sidecar container or one app container has exclusive CPUs allocated.
PodHasExclusiveCPUs(pod *v1.Pod) bool
// ContainerHasExclusiveCPUs returns true if the provided container in the pod has exclusive CPUs allocated.
ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool
// Implements the PodResources Provider API
podresources.CPUsProvider
podresources.DevicesProvider
@ -161,6 +169,10 @@ type ContainerManager interface {
podresources.DynamicResourcesProvider
}
type cpuAllocationReader interface {
GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet
}
type NodeConfig struct {
NodeName types.NodeName
RuntimeCgroupsName string
@ -174,19 +186,19 @@ type NodeConfig struct {
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
ExperimentalMemoryManagerPolicy string
ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
MemoryManagerPolicy string
MemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
}
type NodeAllocatableConfig struct {
@ -212,6 +224,30 @@ func int64Slice(in []int) []int64 {
return out
}
func podHasExclusiveCPUs(cr cpuAllocationReader, pod *v1.Pod) bool {
for _, container := range pod.Spec.InitContainers {
if containerHasExclusiveCPUs(cr, pod, &container) {
return true
}
}
for _, container := range pod.Spec.Containers {
if containerHasExclusiveCPUs(cr, pod, &container) {
return true
}
}
klog.V(4).InfoS("Pod contains no container with pinned cpus", "podName", pod.Name)
return false
}
func containerHasExclusiveCPUs(cr cpuAllocationReader, pod *v1.Pod, container *v1.Container) bool {
exclusiveCPUs := cr.GetExclusiveCPUs(string(pod.UID), container.Name)
if !exclusiveCPUs.IsEmpty() {
klog.V(4).InfoS("Container has pinned cpus", "podName", pod.Name, "containerName", container.Name)
return true
}
return false
}
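
A minimal sketch of what satisfies the new cpuAllocationReader interface: the CPU manager is the real implementer, but any type with a matching GetExclusiveCPUs method works, which is convenient in tests. The fake below is purely illustrative.

package main

import (
    "fmt"

    "k8s.io/utils/cpuset"
)

// fakeAllocationReader is a hypothetical stand-in for the CPU manager.
type fakeAllocationReader struct {
    pinned map[string]cpuset.CPUSet // key: podUID + "/" + containerName
}

func (f fakeAllocationReader) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
    return f.pinned[podUID+"/"+containerName]
}

func main() {
    r := fakeAllocationReader{pinned: map[string]cpuset.CPUSet{
        "pod-uid-1/app": cpuset.New(2, 3),
    }}
    fmt.Println(r.GetExclusiveCPUs("pod-uid-1", "app").IsEmpty())     // false: pinned CPUs
    fmt.Println(r.GetExclusiveCPUs("pod-uid-1", "sidecar").IsEmpty()) // true: shared pool
}
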
// parsePercentage parses the percentage string to numeric value.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {

View File

@ -27,9 +27,8 @@ import (
"sync"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/manager"
"k8s.io/klog/v2"
"k8s.io/mount-utils"
utilpath "k8s.io/utils/path"
@ -336,10 +335,10 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.topologyManager.AddHintProvider(cm.cpuManager)
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
nodeConfig.MemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.MemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
@ -365,7 +364,8 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
enforceCPULimits: cm.EnforceCPULimits,
// cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is time.Duration (measured in nano seconds).
// Convert (cm.CPUCFSQuotaPeriod) [nanoseconds] / time.Microsecond (1000) to get cpuCFSQuotaPeriod in microseconds.
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
podContainerManager: cm,
}
}
return &podContainerManagerNoop{
@ -373,16 +373,24 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
}
}
func (cm *containerManagerImpl) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return podHasExclusiveCPUs(cm.cpuManager, pod)
}
func (cm *containerManagerImpl) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return containerHasExclusiveCPUs(cm.cpuManager, pod, container)
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}
// Create a cgroup container manager.
func createManager(containerName string) (cgroups.Manager, error) {
cg := &configs.Cgroup{
cg := &cgroups.Cgroup{
Parent: "/",
Name: containerName,
Resources: &configs.Resources{
Resources: &cgroups.Resources{
SkipDevices: true,
},
Systemd: false,

View File

@ -195,6 +195,14 @@ func (cm *containerManagerStub) Updates() <-chan resourceupdates.Update {
return nil
}
func (cm *containerManagerStub) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return false
}
func (cm *containerManagerStub) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return false
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
}

View File

@ -168,10 +168,10 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
klog.InfoS("Creating memory manager")
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
nodeConfig.MemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.MemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
@ -369,3 +369,11 @@ func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, p
func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *containerManagerImpl) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return podHasExclusiveCPUs(cm.cpuManager, pod)
}
func (cm *containerManagerImpl) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return containerHasExclusiveCPUs(cm.cpuManager, pod, container)
}

View File

@ -18,6 +18,7 @@ package containermap
import (
"fmt"
"maps"
)
// cmItem (ContainerMap ITEM) is a pair podUID, containerName
@ -36,11 +37,7 @@ func NewContainerMap() ContainerMap {
// Clone creates a deep copy of the ContainerMap
func (cm ContainerMap) Clone() ContainerMap {
ret := make(ContainerMap, len(cm))
for key, val := range cm {
ret[key] = val
}
return ret
return maps.Clone(cm)
}
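
A standalone illustration of the maps.Clone refactor above (the same simplification is applied to mapIntInt and ResourceDeviceInstances later in this commit): maps.Clone makes a shallow copy, which is exactly what the removed hand-written loop did for these value-typed maps.

package main

import (
    "fmt"
    "maps"
)

func main() {
    orig := map[string]string{"containerID-1": "pod-uid/app"}
    cp := maps.Clone(orig)
    cp["containerID-1"] = "pod-uid/sidecar"
    // The copy is independent of the original for value types like string.
    fmt.Println(orig["containerID-1"], cp["containerID-1"])
}
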
// Add adds a mapping of (containerID)->(podUID, containerName) to the ContainerMap

View File

@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
"maps"
"math"
"sort"
@ -39,11 +40,7 @@ const (
type mapIntInt map[int]int
func (m mapIntInt) Clone() mapIntInt {
cp := make(mapIntInt, len(m))
for k, v := range m {
cp[k] = v
}
return cp
return maps.Clone(m)
}
func (m mapIntInt) Keys() []int {

View File

@ -239,6 +239,8 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
return err
}
klog.V(4).InfoS("CPU manager started", "policy", m.policy.Name())
m.allocatableCPUs = m.policy.GetAllocatableCPUs(m.state)
if m.policy.Name() == string(PolicyNone) {
@ -465,7 +467,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
if cset.IsEmpty() {
// NOTE: This should not happen outside of tests.
klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name)
klog.V(2).InfoS("ReconcileState: skipping container; empty cpuset assigned", "pod", klog.KObj(pod), "containerName", container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}

View File

@ -39,16 +39,17 @@ const (
var (
alphaOptions = sets.New[string](
DistributeCPUsAcrossNUMAOption,
AlignBySocketOption,
DistributeCPUsAcrossCoresOption,
StrictCPUReservationOption,
PreferAlignByUnCoreCacheOption,
)
betaOptions = sets.New[string](
StrictCPUReservationOption,
DistributeCPUsAcrossNUMAOption,
)
stableOptions = sets.New[string](
FullPCPUsOnlyOption,
)
stableOptions = sets.New[string]()
)
// CheckPolicyOptionAvailable verifies if the given option can be used depending on the Feature Gate Settings.
@ -66,6 +67,7 @@ func CheckPolicyOptionAvailable(option string) error {
return fmt.Errorf("CPU Manager Policy Beta-level Options not enabled, but option %q provided", option)
}
// if the option is stable, we need no CPUManagerPolicy*Options feature gate check
return nil
}

View File

@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
"strconv"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -325,13 +326,16 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
defer func() {
if rerr != nil {
metrics.CPUManagerPinningErrorsTotal.Inc()
if p.options.FullPhysicalCPUsOnly {
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}
return
}
if !p.options.FullPhysicalCPUsOnly {
// TODO: move in updateMetricsOnAllocate
if p.options.FullPhysicalCPUsOnly {
// increment only if we know we allocate aligned resources
return
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}()
if p.options.FullPhysicalCPUsOnly {
@ -367,8 +371,8 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
}
}
if cpuset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cpuset)
if cset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cset)
klog.InfoS("Static policy: container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -378,16 +382,17 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
klog.InfoS("Topology Affinity", "pod", klog.KObj(pod), "containerName", container.Name, "affinity", hint)
// Allocate CPUs according to the NUMA affinity contained in the hint.
cpuset, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
cpuAllocation, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
if err != nil {
klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
return err
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
p.updateMetricsOnAllocate(cpuset)
s.SetCPUSet(string(pod.UID), container.Name, cpuAllocation.CPUs)
p.updateCPUsToReuse(pod, container, cpuAllocation.CPUs)
p.updateMetricsOnAllocate(s, cpuAllocation)
klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuAllocation.CPUs.String())
return nil
}
@ -412,18 +417,19 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
p.updateMetricsOnRelease(toRelease)
p.updateMetricsOnRelease(s, toRelease)
}
return nil
}
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (cpuset.CPUSet, error) {
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (topology.Allocation, error) {
klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "socket", numaAffinity)
allocatableCPUs := p.GetAvailableCPUs(s).Union(reusableCPUs)
// If there are aligned CPUs in numaAffinity, attempt to take those first.
result := cpuset.New()
result := topology.EmptyAllocation()
if numaAffinity != nil {
alignedCPUs := p.getAlignedCPUs(numaAffinity, allocatableCPUs)
@ -432,30 +438,33 @@ func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bit
numAlignedToAlloc = numCPUs
}
alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
allocatedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
if err != nil {
return cpuset.New(), err
return topology.EmptyAllocation(), err
}
result = result.Union(alignedCPUs)
result.CPUs = result.CPUs.Union(allocatedCPUs)
}
// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result.CPUs), numCPUs-result.CPUs.Size())
if err != nil {
return cpuset.New(), err
return topology.EmptyAllocation(), err
}
result = result.Union(remainingCPUs)
result.CPUs = result.CPUs.Union(remainingCPUs)
result.Aligned = p.topology.CheckAlignment(result.CPUs)
// Remove allocated CPUs from the shared CPUSet.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result.CPUs))
klog.InfoS("AllocateCPUs", "result", result)
klog.InfoS("AllocateCPUs", "result", result.String())
return result, nil
}
func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
qos := v1qos.GetPodQOS(pod)
if qos != v1.PodQOSGuaranteed {
klog.V(5).InfoS("Exclusive CPU allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos)
return 0
}
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
@ -464,11 +473,19 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
containerStatuses := pod.Status.ContainerStatuses
if podutil.IsRestartableInitContainer(container) {
if len(pod.Status.InitContainerStatuses) != 0 {
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)
}
}
if cs, ok := podutil.GetContainerStatus(containerStatuses, container.Name); ok {
cpuQuantity = cs.AllocatedResources[v1.ResourceCPU]
}
}
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
cpuValue := cpuQuantity.Value()
if cpuValue*1000 != cpuQuantity.MilliValue() {
klog.V(5).InfoS("Exclusive CPU allocation skipped, pod requested non-integral CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpu", cpuValue)
return 0
}
// Safe downcast to do for all systems with < 2.1 billion CPUs.
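
A small worked example of the integral-CPU check above, using the quantity API directly; it shows why a fractional request keeps a container out of exclusive allocation.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
    whole := resource.MustParse("2")
    fractional := resource.MustParse("1500m")

    // Integral request: 2 * 1000 == 2000 milli-CPUs, so exclusive CPUs can be considered.
    fmt.Println(whole.Value()*1000 == whole.MilliValue()) // true

    // Fractional request: Value() rounds 1500m up to 2, and 2000 != 1500,
    // so the static policy leaves the container in the shared pool.
    fmt.Println(fractional.Value()*1000 == fractional.MilliValue()) // false
}
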
@ -740,27 +757,60 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
func (p *staticPolicy) initializeMetrics(s state.State) {
metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Add(0) // ensure the value exists
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(totalAssignedCPUs.Size()))
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
}
func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
ncpus := cset.Size()
func (p *staticPolicy) updateMetricsOnAllocate(s state.State, cpuAlloc topology.Allocation) {
ncpus := cpuAlloc.CPUs.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
if cpuAlloc.Aligned.UncoreCache {
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Inc()
}
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
}
func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
func (p *staticPolicy) updateMetricsOnRelease(s state.State, cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs.Difference(cset))
}
func countExclusiveCPUs(s state.State) int {
exclusiveCPUs := 0
for _, cpuAssign := range s.GetCPUAssignments() {
for _, cset := range cpuAssign {
exclusiveCPUs += cset.Size()
func getTotalAssignedExclusiveCPUs(s state.State) cpuset.CPUSet {
totalAssignedCPUs := cpuset.New()
for _, assignment := range s.GetCPUAssignments() {
for _, cset := range assignment {
totalAssignedCPUs = totalAssignedCPUs.Union(cset)
}
}
return totalAssignedCPUs
}
func updateAllocationPerNUMAMetric(topo *topology.CPUTopology, allocatedCPUs cpuset.CPUSet) {
numaCount := make(map[int]int)
// Count CPUs allocated per NUMA node
for _, cpuID := range allocatedCPUs.UnsortedList() {
numaNode, err := topo.CPUNUMANodeID(cpuID)
if err != nil {
//NOTE: We are logging the error but it is highly unlikely to happen as the CPUset
// is already computed, evaluated and there is no room for user tampering.
klog.ErrorS(err, "Unable to determine NUMA node", "cpuID", cpuID)
}
numaCount[numaNode]++
}
// Update metric
for numaNode, count := range numaCount {
metrics.CPUManagerAllocationPerNUMA.WithLabelValues(strconv.Itoa(numaNode)).Set(float64(count))
}
return exclusiveCPUs
}

View File

@ -201,7 +201,7 @@ func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset c
sc.cache.SetCPUSet(podUID, containerName, cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -212,7 +212,7 @@ func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) {
sc.cache.SetDefaultCPUSet(cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -223,7 +223,7 @@ func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) {
sc.cache.SetCPUAssignments(a)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -234,7 +234,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -245,6 +245,6 @@ func (sc *stateCheckpoint) ClearState() {
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}

View File

@ -0,0 +1,78 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
"k8s.io/utils/cpuset"
)
// Alignment is metadata about a cpuset allocation
type Alignment struct {
// UncoreCache is true if all the CPUs are uncore-cache aligned,
// IOW if they all share the same Uncore cache block.
// If the allocated CPU count is greater than a Uncore Group size,
// CPUs can't be uncore-aligned; otherwise, they are.
// This flag tracks alignment, not interference or lack thereof.
UncoreCache bool
}
func (ca Alignment) String() string {
return fmt.Sprintf("aligned=<uncore:%v>", ca.UncoreCache)
}
// Allocation represents a CPU set plus alignment metadata
type Allocation struct {
CPUs cpuset.CPUSet
Aligned Alignment
}
func (ca Allocation) String() string {
return ca.CPUs.String() + " " + ca.Aligned.String()
}
// EmptyAllocation returns a new zero-valued CPU allocation. Please note that
// an empty cpuset is considered aligned in every possible way
func EmptyAllocation() Allocation {
return Allocation{
CPUs: cpuset.New(),
Aligned: Alignment{
UncoreCache: true,
},
}
}
func isAlignedAtUncoreCache(topo *CPUTopology, cpuList ...int) bool {
if len(cpuList) <= 1 {
return true
}
reference, ok := topo.CPUDetails[cpuList[0]]
if !ok {
return false
}
for _, cpu := range cpuList[1:] {
info, ok := topo.CPUDetails[cpu]
if !ok {
return false
}
if info.UncoreCacheID != reference.UncoreCacheID {
return false
}
}
return true
}

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
package topology

View File

@ -101,6 +101,15 @@ func (topo *CPUTopology) CPUNUMANodeID(cpu int) (int, error) {
return info.NUMANodeID, nil
}
// CheckAlignment returns alignment information for the given cpuset in
// the context of the current CPU topology
func (topo *CPUTopology) CheckAlignment(cpus cpuset.CPUSet) Alignment {
cpuList := cpus.UnsortedList()
return Alignment{
UncoreCache: isAlignedAtUncoreCache(topo, cpuList...),
}
}
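
A hedged usage sketch tying the new Allocation/Alignment types to CheckAlignment; the topology value is assumed to come from cadvisor discovery as elsewhere in the CPU manager, and the helper name is illustrative.

package example

import (
    "fmt"

    "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
    "k8s.io/utils/cpuset"
)

// describeAllocation is a hypothetical helper: it wraps a chosen cpuset in an
// Allocation and records whether it fits inside one uncore-cache block.
func describeAllocation(topo *topology.CPUTopology, cpus cpuset.CPUSet) topology.Allocation {
    alloc := topology.Allocation{
        CPUs:    cpus,
        Aligned: topo.CheckAlignment(cpus),
    }
    fmt.Println(alloc.String()) // e.g. "0-3 aligned=<uncore:true>"
    return alloc
}
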
// CPUInfo contains the NUMA, socket, UncoreCache and core IDs associated with a CPU.
type CPUInfo struct {
NUMANodeID int

View File

@ -202,15 +202,12 @@ func (m *ManagerImpl) CleanupPluginDirectory(dir string) error {
if filePath == m.checkpointFile() {
continue
}
// TODO: Until the bug - https://github.com/golang/go/issues/33357 is fixed, os.stat wouldn't return the
// right mode(socket) on windows. Hence deleting the file, without checking whether
// its a socket, on windows.
stat, err := os.Lstat(filePath)
stat, err := os.Stat(filePath)
if err != nil {
klog.ErrorS(err, "Failed to stat file", "path", filePath)
continue
}
if stat.IsDir() {
if stat.IsDir() || stat.Mode()&os.ModeSocket == 0 {
continue
}
err = os.RemoveAll(filePath)
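
For reference, a minimal standalone sketch of the stricter check above: after os.Stat (which follows symlinks), only non-directory entries whose mode carries the socket bit are treated as stale plugin sockets. The path is illustrative.

package main

import (
    "fmt"
    "os"
)

func main() {
    path := "/var/lib/kubelet/device-plugins/example.sock" // illustrative path
    info, err := os.Stat(path)
    if err != nil {
        fmt.Println("stat failed:", err)
        return
    }
    if info.IsDir() || info.Mode()&os.ModeSocket == 0 {
        fmt.Println("not a plugin socket, leaving it alone")
        return
    }
    fmt.Println("would remove stale socket:", path)
}
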
@ -351,7 +348,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
// Loads in allocatedDevices information from disk.
err := m.readCheckpoint()
if err != nil {
klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
}
return m.server.Start()
@ -453,7 +450,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
// should always be consistent. Otherwise, we run with the risk
// of failing to garbage collect non-existing resources or devices.
if !ok {
klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
}
delete(m.endpoints, resourceName)
delete(m.healthyDevices, resourceName)
@ -468,7 +465,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
eI, ok := m.endpoints[resourceName]
if (ok && eI.e.stopGracePeriodExpired()) || !ok {
if !ok {
klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
}
delete(m.endpoints, resourceName)
delete(m.unhealthyDevices, resourceName)
@ -484,7 +481,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
m.mutex.Unlock()
if needsUpdateCheckpoint {
if err := m.writeCheckpoint(); err != nil {
klog.ErrorS(err, "Error on writing checkpoint")
klog.ErrorS(err, "Failed to write checkpoint file")
}
}
return capacity, allocatable, deletedResources.UnsortedList()
@ -503,9 +500,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
if err != nil {
err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
klog.InfoS("Failed to write checkpoint file", "err", err)
klog.ErrorS(err, "Failed to write checkpoint file")
return err2
}
klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
@ -516,7 +514,7 @@ func (m *ManagerImpl) readCheckpoint() error {
if err != nil {
if err == errors.ErrCheckpointNotFound {
// no point in trying anything else
klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
return err
@ -534,6 +532,8 @@ func (m *ManagerImpl) readCheckpoint() error {
m.unhealthyDevices[resource] = sets.New[string]()
m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
}
klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
@ -596,7 +596,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
// because it already has access to all the required devices, so we got nothing to do and we can bail out.
if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
return nil, nil
}
@ -627,7 +627,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
// We cover container restart on kubelet steady state with the same flow.
if needed == 0 {
klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
// No change, no work.
return nil, nil
}
@ -836,7 +836,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
for k, v := range container.Resources.Limits {
resource := string(k)
needed := int(v.Value())
klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
if !m.isDevicePluginResource(resource) {
continue
}
@ -882,7 +882,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
devs := allocDevices.UnsortedList()
// TODO: refactor this part of code to just append a ContainerAllocationRequest
// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
resp, err := eI.e.allocate(devs)
metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
if err != nil {
@ -952,7 +952,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
}
if !m.checkPodActive(pod) {
klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
continue
}
@ -984,7 +984,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
if eI.opts == nil || !eI.opts.PreStartRequired {
m.mutex.Unlock()
klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
return nil
}
@ -1014,12 +1014,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
}
if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
return nil, nil
}
m.mutex.Unlock()
klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
m.mutex.Lock()
if err != nil {
@ -1167,7 +1167,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
cntID, err := m.containerMap.GetContainerID(podUID, cntName)
if err != nil {
klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
return false
}
@ -1175,11 +1175,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
// so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
// This scenario should however be rare enough.
if !m.containerRunningSet.Has(cntID) {
klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
return false
}
// Once we make it here we know we have a running container.
klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
return true
}

View File

@ -106,6 +106,8 @@ func (c *client) Disconnect() error {
}
c.mutex.Unlock()
c.handler.PluginDisconnected(c.resource)
klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource)
return nil
}

View File

@ -43,8 +43,8 @@ func (s *server) RegisterPlugin(pluginName string, endpoint string, versions []s
return s.connectClient(pluginName, endpoint)
}
func (s *server) DeRegisterPlugin(pluginName string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName)
func (s *server) DeRegisterPlugin(pluginName, endpoint string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName, "endpoint", endpoint)
client := s.getClient(pluginName)
if client != nil {
s.disconnectClient(pluginName, client)
@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s
return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
}
klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
return nil
}
@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error {
return err
}
klog.V(2).InfoS("Connected to new client", "resource", name)
go func() {
s.runClient(name, c)
}()
@ -86,7 +88,6 @@ func (s *server) disconnectClient(name string, c Client) error {
s.deregisterClient(name)
return c.Disconnect()
}
func (s *server) registerClient(name string, c Client) {
s.mutex.Lock()
defer s.mutex.Unlock()
@ -112,7 +113,7 @@ func (s *server) runClient(name string, c Client) {
}
if err := s.disconnectClient(name, c); err != nil {
klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c)
}
}

View File

@ -91,7 +91,7 @@ func (s *server) Start() error {
if selinux.GetEnabled() {
if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir)
}
}
@ -128,7 +128,7 @@ func (s *server) Start() error {
func (s *server) Stop() error {
s.visitClients(func(r string, c Client) {
if err := s.disconnectClient(r, c); err != nil {
klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r)
}
})
@ -145,6 +145,7 @@ func (s *server) Stop() error {
// During kubelet termination, we do not need the registration server,
// and we consider the kubelet to be healthy even when it is down.
s.setHealthy()
klog.V(2).InfoS("Stopping device plugin registration server")
return nil
}
@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp
if !s.isVersionCompatibleWithPlugin(r.Version) {
err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
return &api.Empty{}, err
}
if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
klog.InfoS("Bad registration request from device plugin", "err", err)
klog.ErrorS(err, "Bad registration request from device plugin")
return &api.Empty{}, err
}
if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
klog.InfoS("Error connecting to device plugin client", "err", err)
klog.ErrorS(err, "Error connecting to device plugin client")
return &api.Empty{}, err
}

View File

@ -17,6 +17,7 @@ limitations under the License.
package devicemanager
import (
"maps"
"sync"
"k8s.io/klog/v2"
@ -429,10 +430,7 @@ func NewResourceDeviceInstances() ResourceDeviceInstances {
func (rdev ResourceDeviceInstances) Clone() ResourceDeviceInstances {
clone := NewResourceDeviceInstances()
for resourceName, resourceDevs := range rdev {
clone[resourceName] = make(map[string]pluginapi.Device)
for devID, dev := range resourceDevs {
clone[resourceName][devID] = dev
}
clone[resourceName] = maps.Clone(resourceDevs)
}
return clone
}

View File

@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
deviceHints[resource] = nil
continue
}
@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
available := m.getAvailableDevices(resource)
reusable := m.devicesToReuse[string(pod.UID)][resource]
if available.Union(reusable).Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
deviceHints[resource] = nil
continue
}
@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
allocated := m.podDevices.podDevices(string(pod.UID), resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
if available.Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}

View File

@ -18,4 +18,4 @@ limitations under the License.
// to manage containers. For example, they contain functions to configure containers' cgroups,
// ensure containers run with the desired QoS, and allocate compute resources like cpus, memory,
// devices...
package cm // import "k8s.io/kubernetes/pkg/kubelet/cm"
package cm

View File

@ -98,7 +98,20 @@ func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, n
}
func (m *ManagerImpl) GetWatcherHandler() cache.PluginHandler {
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode))
// The time that DRA drivers have to come back after being unregistered
// before the kubelet removes their ResourceSlices.
//
// This must be long enough to actually allow stopping a pod and
// starting the replacement (otherwise ResourceSlices get deleted
// unnecessarily) and not too long (otherwise the time window where
// pods might still get scheduled to the node after removal of a
// driver is too long).
//
// 30 seconds might be long enough for a simple container restart.
// If a DRA driver wants to be sure that slices don't get wiped,
// it should use rolling updates.
wipingDelay := 30 * time.Second
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode, wipingDelay))
}
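
A hedged wiring sketch of the new three-argument constructor with the 30-second delay chosen above; kubeClient and getNode are assumed to be supplied by the DRA manager as in this file, and the import path reflects the vendored tree.

package main

import (
    "time"

    v1 "k8s.io/api/core/v1"
    "k8s.io/client-go/kubernetes"
    draplugin "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
)

// newDRAHandler shows the shape of the call only; the real wiring lives in the
// DRA manager above.
func newDRAHandler(client kubernetes.Interface, getNode func() (*v1.Node, error)) *draplugin.RegistrationHandler {
    // 30s gives a restarting driver time to re-register before its
    // ResourceSlices are wiped.
    return draplugin.NewRegistrationHandler(client, getNode, 30*time.Second)
}
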
// Start starts the reconcile loop of the manager.

View File

@ -18,13 +18,16 @@ package plugin
import (
"errors"
"fmt"
"slices"
"sync"
)
// PluginsStore holds a list of DRA Plugins.
type pluginsStore struct {
sync.RWMutex
store map[string]*Plugin
// plugin name -> Plugin in the order in which they got added
store map[string][]*Plugin
}
// draPlugins map keeps track of all registered DRA plugins on the node
@ -37,43 +40,57 @@ func (s *pluginsStore) get(pluginName string) *Plugin {
s.RLock()
defer s.RUnlock()
return s.store[pluginName]
instances := s.store[pluginName]
if len(instances) == 0 {
return nil
}
// Heuristic: pick the most recent one. It's most likely
// the newest, except when kubelet got restarted and registered
// all running plugins in random order.
return instances[len(instances)-1]
}
// Set lets you save a DRA Plugin to the list and give it a specific name.
// This method is protected by a mutex.
func (s *pluginsStore) add(p *Plugin) (replacedPlugin *Plugin, replaced bool) {
func (s *pluginsStore) add(p *Plugin) error {
s.Lock()
defer s.Unlock()
if s.store == nil {
s.store = make(map[string]*Plugin)
s.store = make(map[string][]*Plugin)
}
replacedPlugin, exists := s.store[p.name]
s.store[p.name] = p
if replacedPlugin != nil && replacedPlugin.cancel != nil {
replacedPlugin.cancel(errors.New("plugin got replaced"))
for _, oldP := range s.store[p.name] {
if oldP.endpoint == p.endpoint {
// One plugin instance cannot hijack the endpoint of another instance.
return fmt.Errorf("endpoint %s already registered for plugin %s", p.endpoint, p.name)
}
}
return replacedPlugin, exists
s.store[p.name] = append(s.store[p.name], p)
return nil
}
// Delete lets you delete a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) delete(pluginName string) *Plugin {
// remove lets you remove one endpoint for a DRA Plugin.
// This method is protected by a mutex. It returns the
// plugin if found and true if that was the last instance
func (s *pluginsStore) remove(pluginName, endpoint string) (*Plugin, bool) {
s.Lock()
defer s.Unlock()
p, exists := s.store[pluginName]
if !exists {
return nil
instances := s.store[pluginName]
i := slices.IndexFunc(instances, func(p *Plugin) bool { return p.endpoint == endpoint })
if i == -1 {
return nil, false
}
p := instances[i]
last := len(instances) == 1
if last {
delete(s.store, pluginName)
} else {
s.store[pluginName] = slices.Delete(instances, i, i+1)
}
if p.cancel != nil {
p.cancel(errors.New("plugin got removed"))
}
delete(s.store, pluginName)
return p
return p, last
}
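
An in-package, test-style sketch of the new multi-instance bookkeeping (assuming only name and endpoint need to be set for illustration): two endpoints may coexist for one driver name during a rolling update, get returns the most recently added instance, and remove reports whether the removed endpoint was the last one.

// Illustrative snippet inside package plugin:
func exampleMultiInstance() {
    store := &pluginsStore{}
    _ = store.add(&Plugin{name: "gpu.example.com", endpoint: "/var/lib/kubelet/plugins/gpu-old.sock"})
    _ = store.add(&Plugin{name: "gpu.example.com", endpoint: "/var/lib/kubelet/plugins/gpu-new.sock"})

    // get returns the most recently registered instance.
    p := store.get("gpu.example.com") // p.endpoint == ".../gpu-new.sock"

    // Removing the old endpoint does not remove the last instance, so the
    // driver's ResourceSlices must not be wiped yet.
    _, last := store.remove("gpu.example.com", "/var/lib/kubelet/plugins/gpu-old.sock")
    _, _ = p, last // last == false
}
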

View File

@ -21,6 +21,7 @@ import (
"errors"
"fmt"
"slices"
"sync"
"time"
v1 "k8s.io/api/core/v1"
@ -51,8 +52,22 @@ type RegistrationHandler struct {
// This is necessary because it implements APIs which don't
// provide a context.
backgroundCtx context.Context
cancel func(err error)
kubeClient kubernetes.Interface
getNode func() (*v1.Node, error)
wipingDelay time.Duration
wg sync.WaitGroup
mutex sync.Mutex
// pendingWipes maps a plugin name to a cancel function for
// wiping of that plugin's ResourceSlices. Entries get added
// in DeRegisterPlugin and checked in RegisterPlugin. If
// wiping is pending during RegisterPlugin, it gets canceled.
//
// Must use pointers to functions because the entries have to
// be comparable.
pendingWipes map[string]*context.CancelCauseFunc
}
var _ cache.PluginHandler = &RegistrationHandler{}
@ -62,12 +77,20 @@ var _ cache.PluginHandler = &RegistrationHandler{}
// Must only be called once per process because it manages global state.
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins.
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *RegistrationHandler {
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error), wipingDelay time.Duration) *RegistrationHandler {
// The context and thus logger should come from the caller.
return newRegistrationHandler(context.TODO(), kubeClient, getNode, wipingDelay)
}
func newRegistrationHandler(ctx context.Context, kubeClient kubernetes.Interface, getNode func() (*v1.Node, error), wipingDelay time.Duration) *RegistrationHandler {
ctx, cancel := context.WithCancelCause(ctx)
handler := &RegistrationHandler{
// The context and thus logger should come from the caller.
backgroundCtx: klog.NewContext(context.TODO(), klog.LoggerWithName(klog.TODO(), "DRA registration handler")),
backgroundCtx: klog.NewContext(ctx, klog.LoggerWithName(klog.FromContext(ctx), "DRA registration handler")),
cancel: cancel,
kubeClient: kubeClient,
getNode: getNode,
wipingDelay: wipingDelay,
pendingWipes: make(map[string]*context.CancelCauseFunc),
}
// When kubelet starts up, no DRA driver has registered yet. None of
@ -77,19 +100,45 @@ func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1
// to start up.
//
// This has to run in the background.
go handler.wipeResourceSlices("")
handler.wg.Add(1)
go func() {
defer handler.wg.Done()
logger := klog.LoggerWithName(klog.FromContext(handler.backgroundCtx), "startup")
ctx := klog.NewContext(handler.backgroundCtx, logger)
handler.wipeResourceSlices(ctx, 0 /* no delay */, "" /* all drivers */)
}()
return handler
}
// Stop cancels any remaining background activities and blocks until all goroutines have stopped.
func (h *RegistrationHandler) Stop() {
h.cancel(errors.New("Stop was called"))
h.wg.Wait()
}
// wipeResourceSlices deletes ResourceSlices of the node, optionally just for a specific driver.
func (h *RegistrationHandler) wipeResourceSlices(driver string) {
// Wiping will delay for a while and can be canceled by canceling the context.
func (h *RegistrationHandler) wipeResourceSlices(ctx context.Context, delay time.Duration, driver string) {
if h.kubeClient == nil {
return
}
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
if delay != 0 {
// Before we start deleting, give the driver time to bounce back.
// Perhaps it got removed as part of a DaemonSet update and the
// replacement pod is about to start.
logger.V(4).Info("Starting to wait before wiping ResourceSlices", "delay", delay)
select {
case <-ctx.Done():
logger.V(4).Info("Aborting wiping of ResourceSlices", "reason", context.Cause(ctx))
case <-time.After(delay):
logger.V(4).Info("Starting to wipe ResourceSlices after waiting", "delay", delay)
}
}
backoff := wait.Backoff{
Duration: time.Second,
Factor: 2,
@ -148,10 +197,10 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
// into all log output related to the plugin.
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName, "endpoint", endpoint)
ctx = klog.NewContext(ctx, logger)
logger.V(3).Info("Register new DRA plugin", "endpoint", endpoint)
logger.V(3).Info("Register new DRA plugin")
chosenService, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
@ -179,9 +228,19 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
// Storing endpoint of newly registered DRA Plugin into the map, where plugin name will be the key
// all other DRA components will be able to get the actual socket of DRA plugins by its name.
if err := draPlugins.add(pluginInstance); err != nil {
cancel(err)
// No wrapping, the error already contains details.
return err
}
if oldPlugin, replaced := draPlugins.add(pluginInstance); replaced {
logger.V(1).Info("DRA plugin already registered, the old plugin was replaced and will be forgotten by the kubelet till the next kubelet restart", "oldEndpoint", oldPlugin.endpoint)
// Now cancel any pending ResourceSlice wiping for this plugin.
// Only needs to be done once.
h.mutex.Lock()
defer h.mutex.Unlock()
if cancel := h.pendingWipes[pluginName]; cancel != nil {
(*cancel)(errors.New("new plugin instance registered"))
delete(h.pendingWipes, pluginName)
}
return nil
@ -220,16 +279,53 @@ func (h *RegistrationHandler) validateSupportedServices(pluginName string, suppo
// DeRegisterPlugin is called when a plugin has removed its socket,
// signaling it is no longer available.
func (h *RegistrationHandler) DeRegisterPlugin(pluginName string) {
if p := draPlugins.delete(pluginName); p != nil {
func (h *RegistrationHandler) DeRegisterPlugin(pluginName, endpoint string) {
if p, last := draPlugins.remove(pluginName, endpoint); p != nil {
// This logger includes endpoint and pluginName.
logger := klog.FromContext(p.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin", "endpoint", p.endpoint)
logger.V(3).Info("Deregister DRA plugin", "lastInstance", last)
if !last {
return
}
// Prepare for canceling the background wiping. This needs to run
// in the context of the registration handler, the one from
// the plugin is canceled.
logger = klog.FromContext(h.backgroundCtx)
logger = klog.LoggerWithName(logger, "driver-cleanup")
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
ctx, cancel := context.WithCancelCause(h.backgroundCtx)
ctx = klog.NewContext(ctx, logger)
// Clean up the ResourceSlices for the deleted Plugin since it
// may have died without doing so itself and might never come
// back.
go h.wipeResourceSlices(pluginName)
//
// May get canceled if the plugin comes back quickly enough
// (see RegisterPlugin).
h.mutex.Lock()
defer h.mutex.Unlock()
if cancel := h.pendingWipes[pluginName]; cancel != nil {
(*cancel)(errors.New("plugin deregistered a second time"))
}
h.pendingWipes[pluginName] = &cancel
h.wg.Add(1)
go func() {
defer h.wg.Done()
defer func() {
h.mutex.Lock()
defer h.mutex.Unlock()
// Cancel our own context, but remove it from the map only if it
// is the current entry. Perhaps it already got replaced.
cancel(errors.New("wiping done"))
if h.pendingWipes[pluginName] == &cancel {
delete(h.pendingWipes, pluginName)
}
}()
h.wipeResourceSlices(ctx, h.wipingDelay, pluginName)
}()
return
}
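The hunks above give each DRA driver at most one pending ResourceSlice wipe: deregistration schedules a delayed wipe and stores its cancel function in pendingWipes, and a re-registration cancels that wipe before the delay expires. A minimal standalone sketch of the pattern, with hypothetical names rather than the kubelet's actual types:

package main

import (
    "context"
    "errors"
    "fmt"
    "sync"
    "time"
)

// cleanupScheduler is a hypothetical distillation of the pendingWipes idea:
// one delayed cleanup per key, cancellable if the key "comes back" in time.
type cleanupScheduler struct {
    mutex   sync.Mutex
    wg      sync.WaitGroup
    pending map[string]*context.CancelCauseFunc
}

func newCleanupScheduler() *cleanupScheduler {
    return &cleanupScheduler{pending: make(map[string]*context.CancelCauseFunc)}
}

// schedule runs cleanup(key) after delay unless cancel(key) is called first.
func (s *cleanupScheduler) schedule(parent context.Context, key string, delay time.Duration, cleanup func(string)) {
    ctx, cancel := context.WithCancelCause(parent)

    s.mutex.Lock()
    if old := s.pending[key]; old != nil {
        (*old)(errors.New("rescheduled"))
    }
    s.pending[key] = &cancel
    s.mutex.Unlock()

    s.wg.Add(1)
    go func() {
        defer s.wg.Done()
        defer func() {
            s.mutex.Lock()
            defer s.mutex.Unlock()
            cancel(errors.New("done"))
            // Remove the entry only if it is still ours; it may have
            // been replaced by a newer schedule in the meantime.
            if s.pending[key] == &cancel {
                delete(s.pending, key)
            }
        }()
        select {
        case <-ctx.Done():
            fmt.Println("cleanup for", key, "aborted:", context.Cause(ctx))
        case <-time.After(delay):
            cleanup(key)
        }
    }()
}

// cancel aborts a pending cleanup for key, if any.
func (s *cleanupScheduler) cancel(key string, reason error) {
    s.mutex.Lock()
    defer s.mutex.Unlock()
    if c := s.pending[key]; c != nil {
        (*c)(reason)
        delete(s.pending, key)
    }
}

func main() {
    s := newCleanupScheduler()
    s.schedule(context.Background(), "driver-a", 50*time.Millisecond, func(k string) {
        fmt.Println("wiping state for", k)
    })
    // Simulate the driver re-registering before the delay expires.
    s.cancel("driver-a", errors.New("driver re-registered"))
    s.wg.Wait()
}

Storing a pointer to the cancel function lets the goroutine later check whether its map entry is still the current one before deleting it, which is the same identity check performed in the change above.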

View File

@ -268,3 +268,11 @@ func (cm *FakeContainerManager) UpdateAllocatedResourcesStatus(pod *v1.Pod, stat
func (cm *FakeContainerManager) Updates() <-chan resourceupdates.Update {
return nil
}
func (cm *FakeContainerManager) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return false
}
func (cm *FakeContainerManager) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return false
}

View File

@ -23,7 +23,7 @@ import (
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"

View File

@ -205,6 +205,7 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
m.allocatableMemory = m.policy.GetAllocatableMemory(m.state)
klog.V(4).InfoS("memorymanager started", "policy", m.policy.Name())
return nil
}
@ -248,7 +249,7 @@ func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.
}
if numaNodes.Len() == 0 {
klog.V(5).InfoS("No allocation is available", "pod", klog.KObj(pod), "containerName", container.Name)
klog.V(5).InfoS("NUMA nodes not available for allocation", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -266,7 +267,7 @@ func (m *manager) Allocate(pod *v1.Pod, container *v1.Container) error {
// Call down into the policy to assign this container memory if required.
if err := m.policy.Allocate(m.state, pod, container); err != nil {
klog.ErrorS(err, "Allocate error")
klog.ErrorS(err, "Allocate error", "pod", klog.KObj(pod), "containerName", container.Name)
return err
}
return nil
@ -280,7 +281,7 @@ func (m *manager) RemoveContainer(containerID string) error {
// if error appears it means container entry already does not exist under the container map
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
klog.InfoS("Failed to get container from container map", "containerID", containerID, "err", err)
klog.ErrorS(err, "Failed to get container from container map", "containerID", containerID)
return nil
}
@ -344,7 +345,7 @@ func (m *manager) removeStaleState() {
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
}
@ -352,7 +353,7 @@ func (m *manager) removeStaleState() {
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
})
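Most of the memory manager hunks above only retune logging: klog.ErrorS is kept for cases that carry a real error value, routine state churn drops to V(2)/V(4)/V(5), and pod/container key-value pairs are attached for context. A small hedged sketch of those klog conventions, with made-up messages and an arbitrary verbosity setting:

package main

import (
    "errors"
    "flag"

    "k8s.io/klog/v2"
)

func main() {
    klog.InitFlags(nil)
    flag.Set("v", "4") // enable V(4) and below for this demo
    flag.Parse()
    defer klog.Flush()

    podRef := klog.KRef("default", "demo-pod") // stand-in for klog.KObj(pod)

    // Routine state changes: informational and gated by verbosity.
    klog.V(2).InfoS("RemoveStaleState removing state", "pod", podRef, "containerName", "app")
    klog.V(4).InfoS("Allocated exclusive memory", "pod", podRef, "containerName", "app")

    // Recoverable conditions without an error object stay at InfoS ...
    klog.InfoS("Memory already allocated with a different size than requested",
        "pod", podRef, "requested", 2048, "allocated", 1024)

    // ... while ErrorS is reserved for an actual error value.
    err := errors.New("invalid NUMA affinity")
    klog.ErrorS(err, "Failed to generate NUMA bitmask", "pod", podRef, "containerName", "app")
}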

View File

@ -96,7 +96,9 @@ func (p *staticPolicy) Start(s state.State) error {
// Allocate call is idempotent
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
// allocate the memory only for guaranteed pods
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
qos := v1qos.GetPodQOS(pod)
if qos != v1.PodQOSGuaranteed {
klog.V(5).InfoS("Exclusive memory allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos)
return nil
}
@ -196,6 +198,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
// TODO: we should refactor our state structs to reflect the amount of the re-used memory
p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)
klog.V(4).InfoS("Allocated exclusive memory", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -304,24 +307,24 @@ func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, re
}
if len(ctnBlocks) != len(reqRsrc) {
klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
klog.InfoS("The number of requested resources by the container differs from the number of memory blocks", "pod", klog.KObj(pod), "containerName", ctn.Name)
return nil
}
for _, b := range ctnBlocks {
if _, ok := reqRsrc[b.Type]; !ok {
klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
klog.InfoS("Container requested resources but none available of this type", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
if b.Size != reqRsrc[b.Type] {
klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
klog.InfoS("Memory already allocated with different numbers than requested", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
return nil
}
containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
if err != nil {
klog.ErrorS(err, "Failed to generate NUMA bitmask")
klog.ErrorS(err, "Failed to generate NUMA bitmask", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
@ -447,7 +450,13 @@ func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.Resourc
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
containerStatuses := pod.Status.ContainerStatuses
if podutil.IsRestartableInitContainer(container) {
if len(pod.Status.InitContainerStatuses) != 0 {
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)
}
}
if cs, ok := podutil.GetContainerStatus(containerStatuses, container.Name); ok {
resources = cs.AllocatedResources
}
}
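The getRequestedResources hunk above widens the status lookup so that restartable (sidecar-style) init containers are also found in InitContainerStatuses when InPlacePodVerticalScaling reads allocated resources. A rough, self-contained approximation using only k8s.io/api/core/v1 types and a plain loop in place of the kubelet's podutil helpers (the helper names below are made up):

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
)

// isRestartableInitContainer mirrors the idea behind the kubelet helper: an
// init container with restartPolicy Always keeps running as a sidecar.
func isRestartableInitContainer(c *v1.Container) bool {
    return c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways
}

// findStatus looks up a container status by name, searching regular container
// statuses first and, for restartable init containers, the init statuses too.
func findStatus(pod *v1.Pod, container *v1.Container) (v1.ContainerStatus, bool) {
    statuses := pod.Status.ContainerStatuses
    if isRestartableInitContainer(container) {
        statuses = append(statuses, pod.Status.InitContainerStatuses...)
    }
    for _, cs := range statuses {
        if cs.Name == container.Name {
            return cs, true
        }
    }
    return v1.ContainerStatus{}, false
}

func main() {
    always := v1.ContainerRestartPolicyAlways
    sidecar := v1.Container{Name: "log-shipper", RestartPolicy: &always}
    pod := &v1.Pod{
        Status: v1.PodStatus{
            InitContainerStatuses: []v1.ContainerStatus{{Name: "log-shipper"}},
        },
    }
    if cs, ok := findStatus(pod, &sidecar); ok {
        fmt.Println("found status for", cs.Name)
    }
}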
@ -654,36 +663,36 @@ func (p *staticPolicy) validateState(s state.State) error {
func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
if len(ms1) != len(ms2) {
klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
klog.InfoS("Node states were different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
return false
}
for nodeID, nodeState1 := range ms1 {
nodeState2, ok := ms2[nodeID]
if !ok {
klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
klog.InfoS("Node state didn't have node ID", "nodeID", nodeID)
return false
}
if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
klog.InfoS("Node state had a different number of memory assignments.", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
return false
}
if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
klog.InfoS("Node states had different groups", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
return false
}
if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
klog.InfoS("Node state had memory maps of different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
return false
}
for resourceName, memoryState1 := range nodeState1.MemoryMap {
memoryState2, ok := nodeState2.MemoryMap[resourceName]
if !ok {
klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
klog.InfoS("Memory state didn't have resource", "resource", resourceName)
return false
}
@ -701,11 +710,11 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
}
if tmpState1.Free != tmpState2.Free {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if tmpState1.Reserved != tmpState2.Reserved {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
}
@ -715,17 +724,17 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
func areMemoryStatesEqual(memoryState1, memoryState2 *state.MemoryTable, nodeID int, resourceName v1.ResourceName) bool {
if memoryState1.TotalMemSize != memoryState2.TotalMemSize {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.SystemReserved != memoryState2.SystemReserved {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.Allocatable != memoryState2.Allocatable {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
return true

View File

@ -131,7 +131,7 @@ func (sc *stateCheckpoint) SetMachineState(memoryMap NUMANodeMap) {
sc.cache.SetMachineState(memoryMap)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -143,7 +143,7 @@ func (sc *stateCheckpoint) SetMemoryBlocks(podUID string, containerName string,
sc.cache.SetMemoryBlocks(podUID, containerName, blocks)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -155,7 +155,7 @@ func (sc *stateCheckpoint) SetMemoryAssignments(assignments ContainerMemoryAssig
sc.cache.SetMemoryAssignments(assignments)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -167,7 +167,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -179,6 +179,6 @@ func (sc *stateCheckpoint) ClearState() {
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
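Every setter above has the same shape: update the in-memory cache under the lock, then persist to the checkpoint and log, rather than return, a storage failure. A generic, hypothetical sketch of that shape (plain map and closure instead of the checkpoint manager):

package main

import (
    "errors"
    "fmt"
    "sync"
)

// checkpointed pairs an in-memory cache with a persist step; storage errors
// are reported but do not roll back the cached value.
type checkpointed struct {
    sync.Mutex
    cache   map[string]string
    persist func(map[string]string) error
}

func (c *checkpointed) set(key, value string) {
    c.Lock()
    defer c.Unlock()
    c.cache[key] = value
    if err := c.persist(c.cache); err != nil {
        // The kubelet uses klog.ErrorS here; fmt keeps this sketch dependency-free.
        fmt.Println("failed to store state to checkpoint:", err)
    }
}

func main() {
    c := &checkpointed{
        cache: map[string]string{},
        persist: func(m map[string]string) error {
            if len(m) > 1 {
                return errors.New("disk full") // simulated persistence failure
            }
            return nil
        },
    }
    c.set("podA/ctr1", "blocks")
    c.set("podB/ctr1", "blocks") // triggers the simulated persist error
}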

View File

@ -94,6 +94,7 @@ func (s *stateMemory) SetMemoryAssignments(assignments ContainerMemoryAssignment
defer s.Unlock()
s.assignments = assignments.Clone()
klog.V(5).InfoS("Updated Memory assignments", "assignments", assignments)
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments

View File

@ -23,7 +23,7 @@ import (
"path"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
@ -55,6 +55,8 @@ type podContainerManagerImpl struct {
// cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
// node for all containers in usec
cpuCFSQuotaPeriod uint64
// podContainerManager is the ContainerManager running on the machine
podContainerManager ContainerManager
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
@ -73,6 +75,11 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
// check if container already exist
alreadyExists := m.Exists(pod)
if !alreadyExists {
enforceCPULimits := m.enforceCPULimits
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DisableCPUQuotaWithExclusiveCPUs) && m.podContainerManager.PodHasExclusiveCPUs(pod) {
klog.V(2).InfoS("Disabled CFS quota", "pod", klog.KObj(pod))
enforceCPULimits = false
}
enforceMemoryQoS := false
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
@ -82,7 +89,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
podContainerName, _ := m.GetPodContainerName(pod)
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
}
if m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
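The EnsureExists hunk above derives an effective per-pod CFS quota setting: the node-level default is switched off when the DisableCPUQuotaWithExclusiveCPUs gate is enabled and the pod owns exclusive CPUs. A dependency-free sketch of that decision, with plain booleans and a stub predicate standing in for the feature gate and ContainerManager.PodHasExclusiveCPUs:

package main

import "fmt"

// pod is a trimmed-down stand-in for *v1.Pod.
type pod struct {
    name             string
    hasExclusiveCPUs bool // stand-in for ContainerManager.PodHasExclusiveCPUs
}

// effectiveCPULimitEnforcement decides whether CFS quota should be applied to
// the pod-level cgroup, mirroring the shape of the change above.
func effectiveCPULimitEnforcement(nodeEnforceCPULimits, disableQuotaWithExclusiveCPUs bool, p pod) bool {
    enforce := nodeEnforceCPULimits
    if disableQuotaWithExclusiveCPUs && p.hasExclusiveCPUs {
        // Exclusive CPUs are already dedicated to the pod; throttling them
        // via CFS quota would only add latency, so the quota is skipped.
        enforce = false
    }
    return enforce
}

func main() {
    p := pod{name: "pinned-workload", hasExclusiveCPUs: true}
    fmt.Println("enforce CFS quota:", effectiveCPULimitEnforcement(true, true, p)) // false
}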

View File

@ -29,7 +29,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"

View File

@ -45,7 +45,7 @@ func NewFakeManagerWithHint(hint *TopologyHint) Manager {
// NewFakeManagerWithPolicy returns an instance of fake topology manager with specified policy
func NewFakeManagerWithPolicy(policy Policy) Manager {
klog.InfoS("NewFakeManagerWithPolicy")
klog.InfoS("NewFakeManagerWithPolicy", "policy", policy.Name())
return &fakeManager{
policy: policy,
}

View File

@ -47,11 +47,11 @@ func CheckPolicyOptionAvailable(option string) error {
}
if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyAlphaOptions) {
return fmt.Errorf("Topology Manager Policy Alpha-level Options not enabled, but option %q provided", option)
return fmt.Errorf("topology manager policy alpha-level options not enabled, but option %q provided", option)
}
if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyBetaOptions) {
return fmt.Errorf("Topology Manager Policy Beta-level Options not enabled, but option %q provided", option)
return fmt.Errorf("topology manager policy beta-level options not enabled, but option %q provided", option)
}
return nil
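Besides lowercasing the error strings (the usual Go convention for error values), CheckPolicyOptionAvailable gates alpha- and beta-level options behind separate feature gates. A simplified, dependency-free sketch of the same tiering, with placeholder option names and plain booleans instead of the kubelet's featuregate machinery:

package main

import "fmt"

// Placeholder option names; the real sets live in the topology manager.
var (
    knownOptions = map[string]bool{"option-a": true, "option-b": true}
    alphaOptions = map[string]bool{"option-a": true}
    betaOptions  = map[string]bool{"option-b": true}
)

// checkPolicyOption mirrors the tiered validation above: unknown options are
// rejected outright, while alpha- and beta-level options additionally require
// their corresponding gate to be enabled.
func checkPolicyOption(option string, alphaEnabled, betaEnabled bool) error {
    if !knownOptions[option] {
        return fmt.Errorf("unknown topology manager policy option %q", option)
    }
    if alphaOptions[option] && !alphaEnabled {
        return fmt.Errorf("topology manager policy alpha-level options not enabled, but option %q provided", option)
    }
    if betaOptions[option] && !betaEnabled {
        return fmt.Errorf("topology manager policy beta-level options not enabled, but option %q provided", option)
    }
    return nil
}

func main() {
    fmt.Println(checkPolicyOption("option-a", false, true)) // alpha gate missing -> error
    fmt.Println(checkPolicyOption("option-b", false, true)) // beta gate enabled -> <nil>
}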

View File

@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
if !admit {
if IsAlignmentGuaranteed(s.policy) {
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
}
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
@ -63,6 +66,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
}
if IsAlignmentGuaranteed(s.policy) {
klog.V(4).InfoS("Resource alignment at container scope guaranteed", "pod", klog.KObj(pod))
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
}
}
@ -84,6 +88,6 @@ func (s *containerScope) accumulateProvidersHints(pod *v1.Pod, container *v1.Con
func (s *containerScope) calculateAffinity(pod *v1.Pod, container *v1.Container) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod, container)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("ContainerTopologyHint", "bestHint", bestHint)
klog.InfoS("ContainerTopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
return bestHint, admit
}

View File

@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
bestHint, admit := s.calculateAffinity(pod)
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
if !admit {
if IsAlignmentGuaranteed(s.policy) {
// increment only if we know we allocate aligned resources.
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
}
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
@ -64,6 +68,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
}
if IsAlignmentGuaranteed(s.policy) {
// increment only if we know we allocate aligned resources.
klog.V(4).InfoS("Resource alignment at pod scope guaranteed", "pod", klog.KObj(pod))
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
}
return admission.GetPodAdmitResult(nil)
@ -84,6 +89,6 @@ func (s *podScope) accumulateProvidersHints(pod *v1.Pod) []map[string][]Topology
func (s *podScope) calculateAffinity(pod *v1.Pod) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("PodTopologyHint", "bestHint", bestHint)
klog.InfoS("PodTopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
return bestHint, admit
}
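Both scopes follow the same flow: collect topology hints from every provider, let the policy merge them into one best hint, and admit only if that hint is satisfiable. The sketch below is a heavily simplified illustration of such a merge, with a single candidate hint per provider and NUMA affinity as a plain bitmask; it is not the kubelet's actual policy logic.

package main

import (
    "fmt"
    "math/bits"
)

// hint is a simplified topology hint: a bitmask of acceptable NUMA nodes
// plus whether the provider considers that placement preferred.
type hint struct {
    numaMask  uint64
    preferred bool
}

// merge intersects one hint per provider and reports whether the result
// still names at least one NUMA node (i.e. whether admission can succeed).
func merge(providerHints []hint) (hint, bool) {
    merged := hint{numaMask: ^uint64(0), preferred: true}
    for _, h := range providerHints {
        merged.numaMask &= h.numaMask
        merged.preferred = merged.preferred && h.preferred
    }
    admit := merged.numaMask != 0
    return merged, admit
}

func main() {
    // Provider 1 can use NUMA nodes {0,1}; provider 2 only node 0.
    best, admit := merge([]hint{
        {numaMask: 0b11, preferred: false},
        {numaMask: 0b01, preferred: true},
    })
    fmt.Printf("bestHint=%b preferred=%v admit=%v nodes=%d\n",
        best.numaMask, best.preferred, admit, bits.OnesCount64(best.numaMask))
}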

View File

@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
scope: scope,
}
manager.initializeMetrics()
return manager, nil
}
func (m *manager) initializeMetrics() {
// ensure the values exist
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
}
func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
return m.scope.GetAffinity(podUID, containerName)
}
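initializeMetrics touches every label combination once with Add(0) so the corresponding time series are exported as zero before the first admission success or failure is counted. The same technique with a plain prometheus/client_golang CounterVec, assumed here in place of the component-base metrics wrappers (metric and label names are illustrative):

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
    dto "github.com/prometheus/client_model/go"
)

var alignedFailures = prometheus.NewCounterVec(
    prometheus.CounterOpts{
        Name: "container_aligned_compute_resources_failure_count",
        Help: "Admissions rejected because aligned resources could not be allocated.",
    },
    []string{"scope", "boundary"},
)

func initializeMetrics() {
    // Touch every label combination so the series are exported as 0
    // immediately, instead of appearing only after the first increment.
    for _, scope := range []string{"container", "pod"} {
        alignedFailures.WithLabelValues(scope, "numa_node").Add(0)
    }
}

func main() {
    reg := prometheus.NewRegistry()
    reg.MustRegister(alignedFailures)
    initializeMetrics()

    m := &dto.Metric{}
    if err := alignedFailures.WithLabelValues("pod", "numa_node").Write(m); err == nil {
        fmt.Println("pod/numa_node failures:", m.GetCounter().GetValue()) // 0
    }
}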
@ -212,11 +222,13 @@ func (m *manager) RemoveContainer(containerID string) error {
}
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
klog.V(4).InfoS("Topology manager admission check", "pod", klog.KObj(attrs.Pod))
metrics.TopologyManagerAdmissionRequestsTotal.Inc()
startTime := time.Now()
podAdmitResult := m.scope.Admit(attrs.Pod)
metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))
klog.V(4).InfoS("Pod Admit Result", "Message", podAdmitResult.Message, "pod", klog.KObj(attrs.Pod))
return podAdmitResult
}

View File

@ -21,7 +21,7 @@ import (
libcontainerutils "k8s.io/kubernetes/third_party/forked/libcontainer/utils"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
)
const (