rebase: update replaced k8s.io modules to v0.33.0

Signed-off-by: Niels de Vos <ndevos@ibm.com>
Author: Niels de Vos
Date: 2025-05-07 13:13:33 +02:00
Committed by: mergify[bot]
parent dd77e72800
commit 107407b44b
1723 changed files with 65035 additions and 175239 deletions

View File

@ -17,4 +17,4 @@ limitations under the License.
// +k8s:deepcopy-gen=package
// +groupName=kubelet.config.k8s.io
package config // import "k8s.io/kubernetes/pkg/kubelet/apis/config"
package config

View File

@ -40,6 +40,8 @@ func addKnownTypes(scheme *runtime.Scheme) error {
&KubeletConfiguration{},
&SerializedNodeConfigSource{},
&CredentialProviderConfig{},
&ImagePullIntent{},
&ImagePulledRecord{},
)
return nil
}

View File

@ -155,6 +155,25 @@ type KubeletConfiguration struct {
// pulls to burst to this number, while still not exceeding registryPullQPS.
// Only used if registryPullQPS > 0.
RegistryBurst int32
// imagePullCredentialsVerificationPolicy determines how credentials should be
// verified when a pod requests an image that is already present on the node:
// - NeverVerify
// - anyone on a node can use any image present on the node
// - NeverVerifyPreloadedImages
// - images that were pulled to the node by something other than the kubelet
// can be used without reverifying pull credentials
// - NeverVerifyAllowlistedImages
// - like "NeverVerifyPreloadedImages", but only images matching
// `preloadedImagesVerificationAllowlist` do not require reverification
// - AlwaysVerify
// - all images require credential reverification
ImagePullCredentialsVerificationPolicy string
// preloadedImagesVerificationAllowlist specifies a list of images that are
// exempted from credential reverification for the "NeverVerifyAllowlistedImages"
// `imagePullCredentialsVerificationPolicy`.
// The list accepts a full path segment wildcard suffix "/*".
// Only use image specs without an image tag or digest.
PreloadedImagesVerificationAllowlist []string
// eventRecordQPS is the maximum event creations per second. If 0, there
// is no limit enforced.
EventRecordQPS int32
@ -234,14 +253,11 @@ type KubeletConfiguration struct {
// a group. It means that if true, the behavior aligns with the behavior of cgroups v1.
SingleProcessOOMKill *bool
// CPUManagerPolicy is the name of the policy to use.
// Requires the CPUManager feature gate to be enabled.
CPUManagerPolicy string
// CPUManagerPolicyOptions is a set of key=value which allows to set extra options
// to fine tune the behaviour of the cpu manager policies.
// Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled.
CPUManagerPolicyOptions map[string]string
// CPU Manager reconciliation period.
// Requires the CPUManager feature gate to be enabled.
CPUManagerReconcilePeriod metav1.Duration
// MemoryManagerPolicy is the name of the policy to use.
// Requires the MemoryManager feature gate to be enabled.
@ -322,6 +338,14 @@ type KubeletConfiguration struct {
// amount of a given resource the kubelet will reclaim when performing a pod eviction while
// that resource is under pressure. For example: {"imagefs.available": "2Gi"}
EvictionMinimumReclaim map[string]string
// mergeDefaultEvictionSettings indicates that defaults for the evictionHard, evictionSoft, evictionSoftGracePeriod, and evictionMinimumReclaim
// fields should be merged into values specified for those fields in this configuration.
// Signals specified in this configuration take precedence.
// Signals not specified in this configuration inherit their defaults.
// If false, and any signal is specified in this configuration, then all other signals that
// are not specified in this configuration will be set to 0.
// Merging applies only to fields that have defaults; currently only evictionHard has default values.
MergeDefaultEvictionSettings bool
// podsPerCore is the maximum number of pods per core. Cannot exceed MaxPods.
// If 0, this field is ignored.
PodsPerCore int32
@ -514,6 +538,11 @@ type KubeletConfiguration struct {
// +featureGate=KubeletCrashLoopBackoffMax
// +optional
CrashLoopBackOff CrashLoopBackOffConfig
// UserNamespaces contains User Namespace configurations.
// +featureGate=UserNamespaceSupport
// +optional
UserNamespaces *UserNamespaces
}
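For orientation, a minimal sketch (not part of this diff) of how the new internal KubeletConfiguration fields added above could be populated. The `kubeletconfig` alias and `ptr` helper match imports used elsewhere in this commit; the registry path is illustrative, and the policy constants are defined later in this file.
cfg := kubeletconfig.KubeletConfiguration{
	ImagePullCredentialsVerificationPolicy: string(kubeletconfig.NeverVerifyAllowlistedImages),
	PreloadedImagesVerificationAllowlist:   []string{"registry.example.com/base/*"},
	MergeDefaultEvictionSettings:           true,
	UserNamespaces:                         &kubeletconfig.UserNamespaces{IDsPerPod: ptr.To[int64](65536)},
}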
// KubeletAuthorizationMode denotes the authorization mode for the kubelet
@ -604,7 +633,7 @@ type CredentialProviderConfig struct {
// Multiple providers may match against a single image, in which case credentials
// from all providers will be returned to the kubelet. If multiple providers are called
// for a single image, the results are combined. If providers return overlapping
// auth keys, the value from the provider earlier in this list is used.
// auth keys, the value from the provider earlier in this list is attempted first.
Providers []CredentialProvider
}
@ -614,6 +643,7 @@ type CredentialProvider struct {
// name is the required name of the credential provider. It must match the name of the
// provider executable as seen by the kubelet. The executable must be in the kubelet's
// bin directory (set by the --credential-provider-bin-dir flag).
// Required to be unique across all providers.
Name string
// matchImages is a required list of strings used to match against images in order to
@ -661,6 +691,64 @@ type CredentialProvider struct {
// to pass argument to the plugin.
// +optional
Env []ExecEnvVar
// tokenAttributes is the configuration for the service account token that will be passed to the plugin.
// The credential provider opts in to using service account tokens for image pull by setting this field.
// When this field is set, kubelet will generate a service account token bound to the pod for which the
// image is being pulled and pass it to the plugin as part of the CredentialProviderRequest, along with other
// attributes required by the plugin.
//
// The service account metadata and token attributes will be used as a dimension to cache
// the credentials in kubelet. The cache key is generated by combining the service account metadata
// (namespace, name, UID, and annotations key+value for the keys defined in
// serviceAccountTokenAttribute.requiredServiceAccountAnnotationKeys and serviceAccountTokenAttribute.optionalServiceAccountAnnotationKeys).
// The pod metadata (namespace, name, UID) that are in the service account token are not used as a dimension
// to cache the credentials in kubelet. This means workloads that are using the same service account
// could end up using the same credentials for image pull. Plugins that don't want this behavior, or
// that operate in pass-through mode (i.e., they return the service account token as-is),
// can set the credentialProviderResponse.cacheDuration to 0. This disables the caching of
// credentials in kubelet, and the plugin will be invoked for every image pull. This does result in
// token generation overhead for every image pull, but it is the only way to ensure that the
// credentials are not shared across pods (even if they are using the same service account).
// +optional
TokenAttributes *ServiceAccountTokenAttributes
}
// ServiceAccountTokenAttributes is the configuration for the service account token that will be passed to the plugin.
type ServiceAccountTokenAttributes struct {
// serviceAccountTokenAudience is the intended audience for the projected service account token.
// +required
ServiceAccountTokenAudience string
// requireServiceAccount indicates whether the plugin requires the pod to have a service account.
// If set to true, kubelet will only invoke the plugin if the pod has a service account.
// If set to false, kubelet will invoke the plugin even if the pod does not have a service account
// and will not include a token in the CredentialProviderRequest in that scenario. This is useful for plugins that
// are used to pull images for pods without service accounts (e.g., static pods).
// +required
RequireServiceAccount *bool
// requiredServiceAccountAnnotationKeys is the list of annotation keys that the plugin is interested in
// and that are required to be present in the service account.
// The keys defined in this list will be extracted from the corresponding service account and passed
// to the plugin as part of the CredentialProviderRequest. If any of the keys defined in this list
// are not present in the service account, kubelet will not invoke the plugin and will return an error.
// This field is optional and may be empty. Plugins may use this field to extract
// additional information required to fetch credentials or allow workloads to opt in to
// using service account tokens for image pull.
// If non-empty, requireServiceAccount must be set to true.
// +optional
RequiredServiceAccountAnnotationKeys []string
// optionalServiceAccountAnnotationKeys is the list of annotation keys that the plugin is interested in
// and that are optional to be present in the service account.
// The keys defined in this list will be extracted from the corresponding service account and passed
// to the plugin as part of the CredentialProviderRequest. The plugin is responsible for validating
// the existence of annotations and their values.
// This field is optional and may be empty. Plugins may use this field to extract
// additional information required to fetch credentials.
// +optional
OptionalServiceAccountAnnotationKeys []string
}
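A hedged sketch of a CredentialProvider entry opting in to service account tokens via the new tokenAttributes field; only fields that appear in this diff are set, and the names and values are illustrative.
provider := kubeletconfig.CredentialProvider{
	Name: "example-provider", // must match the plugin executable name seen by the kubelet
	TokenAttributes: &kubeletconfig.ServiceAccountTokenAttributes{
		ServiceAccountTokenAudience:          "registry.example.com",
		RequireServiceAccount:                ptr.To(true),
		RequiredServiceAccountAnnotationKeys: []string{"example.com/registry-identity"},
	},
}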
// ExecEnvVar is used for setting environment variables when executing an exec-based
@ -702,3 +790,107 @@ type CrashLoopBackOffConfig struct {
// +optional
MaxContainerRestartPeriod *metav1.Duration
}
// ImagePullCredentialsVerificationPolicy is an enum for the policy that is enforced
// when a pod requests an image that is already present on the node.
type ImagePullCredentialsVerificationPolicy string
const (
// NeverVerify will never require credential verification for images that
// already exist on the node
NeverVerify ImagePullCredentialsVerificationPolicy = "NeverVerify"
// NeverVerifyPreloadedImages does not require credential verification for images
// pulled outside the kubelet process
NeverVerifyPreloadedImages ImagePullCredentialsVerificationPolicy = "NeverVerifyPreloadedImages"
// NeverVerifyAllowlistedImages does not require credential verification for
// a list of images that were pulled outside the kubelet process
NeverVerifyAllowlistedImages ImagePullCredentialsVerificationPolicy = "NeverVerifyAllowlistedImages"
// AlwaysVerify requires credential verification for accessing any image on the
// node regardless of how it was pulled
AlwaysVerify ImagePullCredentialsVerificationPolicy = "AlwaysVerify"
)
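A sketch of how a caller might interpret the four policies. The helper name and its boolean inputs are assumptions for illustration, not part of this commit.
// mustVerifyCredentials is a hypothetical helper: pulledByKubelet reports whether the kubelet
// itself pulled the image, allowlisted whether it matches preloadedImagesVerificationAllowlist.
func mustVerifyCredentials(policy kubeletconfig.ImagePullCredentialsVerificationPolicy, pulledByKubelet, allowlisted bool) bool {
	switch policy {
	case kubeletconfig.NeverVerify:
		return false
	case kubeletconfig.NeverVerifyPreloadedImages:
		return pulledByKubelet // only kubelet-pulled images are rechecked
	case kubeletconfig.NeverVerifyAllowlistedImages:
		return pulledByKubelet || !allowlisted // only preloaded, allowlisted images skip the recheck
	case kubeletconfig.AlwaysVerify:
		return true
	}
	return true // unknown policy: fail closed
}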
// ImagePullIntent is a record of the kubelet attempting to pull an image.
//
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
type ImagePullIntent struct {
metav1.TypeMeta
// Image is the image spec from a Container's `image` field.
// The filename is a SHA-256 hash of this value. This is to avoid filename-unsafe
// characters like ':' and '/'.
Image string
}
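The filename scheme mentioned above can be illustrated with a short, assumed snippet: the on-disk name is the SHA-256 hex digest of the image spec, which avoids filename-unsafe characters such as ':' and '/'.
sum := sha256.Sum256([]byte("registry.example.com/hello-world:latest")) // crypto/sha256
filename := hex.EncodeToString(sum[:])                                  // encoding/hex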
// ImagePulledRecord is a record of an image that was pulled by the kubelet.
//
// If there are no records in the `kubernetesSecrets` field and both `nodeWideCredentials`
// and `anonymous` are `false`, credentials must be re-checked the next time an
// image represented by this record is being requested.
//
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
type ImagePulledRecord struct {
metav1.TypeMeta
// LastUpdatedTime is the time of the last update to this record
LastUpdatedTime metav1.Time
// ImageRef is a reference to the image represented by this file as received
// from the CRI.
// The filename is a SHA-256 hash of this value. This is to avoid filename-unsafe
// characters like ':' and '/'.
ImageRef string
// CredentialMapping maps `image` to the set of credentials that it was
// previously pulled with.
// `image` in this case is the content of a pod's container `image` field with
// its tag/digest removed.
//
// Example:
// Container requests the `hello-world:latest@sha256:91fb4b041da273d5a3273b6d587d62d518300a6ad268b28628f74997b93171b2` image:
// "credentialMapping": {
// "hello-world": { "nodePodsAccessible": true }
// }
CredentialMapping map[string]ImagePullCredentials
}
// ImagePullCredentials describe credentials that can be used to pull an image.
type ImagePullCredentials struct {
// KubernetesSecrets is an index of coordinates of all the Kubernetes
// secrets that were used to pull the image.
// +optional
KubernetesSecrets []ImagePullSecret
// NodePodsAccessible is a flag denoting that the pull credentials are accessible
// by all the pods on the node, or that no credentials are needed for the pull.
//
// If true, it is mutually exclusive with the `kubernetesSecrets` field.
// +optional
NodePodsAccessible bool
}
// ImagePullSecret is a representation of a Kubernetes secret object coordinates along
// with a credential hash of the pull secret credentials this object contains.
type ImagePullSecret struct {
UID string
Namespace string
Name string
// CredentialHash is a SHA-256 hash of the image pull credentials
// content of the secret specified by the UID/Namespace/Name coordinates.
CredentialHash string
}
// UserNamespaces contains User Namespace configurations.
type UserNamespaces struct {
// IDsPerPod is the mapping length of UIDs and GIDs per pod.
// The length must be a multiple of 65536, and must be less than 1<<32.
// On non-Linux platforms such as Windows, only null / absent is allowed.
//
// Changing the value may require recreating all containers on the node.
//
// Default: 65536
// +featureGate=UserNamespaceSupport
// +optional
IDsPerPod *int64
}
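A minimal validation sketch for the IDsPerPod constraints described above (a positive multiple of 65536, less than 1<<32); the function name is an assumption, not part of this commit.
func validateIDsPerPod(ids *int64) error {
	if ids == nil {
		return nil // absent: the default of 65536 applies
	}
	if *ids <= 0 || *ids%65536 != 0 || *ids >= 1<<32 {
		return fmt.Errorf("idsPerPod %d must be a positive multiple of 65536 and less than 2^32", *ids)
	}
	return nil
}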

View File

@ -72,6 +72,11 @@ func (in *CredentialProvider) DeepCopyInto(out *CredentialProvider) {
*out = make([]ExecEnvVar, len(*in))
copy(*out, *in)
}
if in.TokenAttributes != nil {
in, out := &in.TokenAttributes, &out.TokenAttributes
*out = new(ServiceAccountTokenAttributes)
(*in).DeepCopyInto(*out)
}
return
}
@ -133,6 +138,101 @@ func (in *ExecEnvVar) DeepCopy() *ExecEnvVar {
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ImagePullCredentials) DeepCopyInto(out *ImagePullCredentials) {
*out = *in
if in.KubernetesSecrets != nil {
in, out := &in.KubernetesSecrets, &out.KubernetesSecrets
*out = make([]ImagePullSecret, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImagePullCredentials.
func (in *ImagePullCredentials) DeepCopy() *ImagePullCredentials {
if in == nil {
return nil
}
out := new(ImagePullCredentials)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ImagePullIntent) DeepCopyInto(out *ImagePullIntent) {
*out = *in
out.TypeMeta = in.TypeMeta
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImagePullIntent.
func (in *ImagePullIntent) DeepCopy() *ImagePullIntent {
if in == nil {
return nil
}
out := new(ImagePullIntent)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *ImagePullIntent) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ImagePullSecret) DeepCopyInto(out *ImagePullSecret) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImagePullSecret.
func (in *ImagePullSecret) DeepCopy() *ImagePullSecret {
if in == nil {
return nil
}
out := new(ImagePullSecret)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ImagePulledRecord) DeepCopyInto(out *ImagePulledRecord) {
*out = *in
out.TypeMeta = in.TypeMeta
in.LastUpdatedTime.DeepCopyInto(&out.LastUpdatedTime)
if in.CredentialMapping != nil {
in, out := &in.CredentialMapping, &out.CredentialMapping
*out = make(map[string]ImagePullCredentials, len(*in))
for key, val := range *in {
(*out)[key] = *val.DeepCopy()
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImagePulledRecord.
func (in *ImagePulledRecord) DeepCopy() *ImagePulledRecord {
if in == nil {
return nil
}
out := new(ImagePulledRecord)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *ImagePulledRecord) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletAnonymousAuthentication) DeepCopyInto(out *KubeletAnonymousAuthentication) {
*out = *in
@ -214,6 +314,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
}
out.Authentication = in.Authentication
out.Authorization = in.Authorization
if in.PreloadedImagesVerificationAllowlist != nil {
in, out := &in.PreloadedImagesVerificationAllowlist, &out.PreloadedImagesVerificationAllowlist
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.ClusterDNS != nil {
in, out := &in.ClusterDNS, &out.ClusterDNS
*out = make([]string, len(*in))
@ -354,6 +459,11 @@ func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
(*in).DeepCopyInto(*out)
}
in.CrashLoopBackOff.DeepCopyInto(&out.CrashLoopBackOff)
if in.UserNamespaces != nil {
in, out := &in.UserNamespaces, &out.UserNamespaces
*out = new(UserNamespaces)
(*in).DeepCopyInto(*out)
}
return
}
@ -491,6 +601,37 @@ func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object {
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ServiceAccountTokenAttributes) DeepCopyInto(out *ServiceAccountTokenAttributes) {
*out = *in
if in.RequireServiceAccount != nil {
in, out := &in.RequireServiceAccount, &out.RequireServiceAccount
*out = new(bool)
**out = **in
}
if in.RequiredServiceAccountAnnotationKeys != nil {
in, out := &in.RequiredServiceAccountAnnotationKeys, &out.RequiredServiceAccountAnnotationKeys
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.OptionalServiceAccountAnnotationKeys != nil {
in, out := &in.OptionalServiceAccountAnnotationKeys, &out.OptionalServiceAccountAnnotationKeys
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServiceAccountTokenAttributes.
func (in *ServiceAccountTokenAttributes) DeepCopy() *ServiceAccountTokenAttributes {
if in == nil {
return nil
}
out := new(ServiceAccountTokenAttributes)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) {
*out = *in
@ -506,3 +647,24 @@ func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPod
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *UserNamespaces) DeepCopyInto(out *UserNamespaces) {
*out = *in
if in.IDsPerPod != nil {
in, out := &in.IDsPerPod, &out.IDsPerPod
*out = new(int64)
**out = **in
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UserNamespaces.
func (in *UserNamespaces) DeepCopy() *UserNamespaces {
if in == nil {
return nil
}
out := new(UserNamespaces)
in.DeepCopyInto(out)
return out
}
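Usage note: the generated deep-copy functions return fully independent copies, so pointer fields such as IDsPerPod can be mutated on the copy without affecting the original. A small sketch, reusing the imports assumed earlier:
orig := &kubeletconfig.UserNamespaces{IDsPerPod: ptr.To[int64](65536)}
cp := orig.DeepCopy()
*cp.IDsPerPod = 131072 // *orig.IDsPerPod is still 65536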

View File

@ -66,16 +66,13 @@ func (p *v1PodResourcesServer) List(ctx context.Context, req *podresourcesv1.Lis
Containers: make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.Containers)),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SidecarContainers) {
pRes.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
pRes.Containers = append(pRes.Containers, p.getContainerResources(pod, &container))
pRes.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
pRes.Containers = append(pRes.Containers, p.getContainerResources(pod, &container))
}
for _, container := range pod.Spec.Containers {
@ -126,16 +123,13 @@ func (p *v1PodResourcesServer) Get(ctx context.Context, req *podresourcesv1.GetP
Containers: make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.Containers)),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SidecarContainers) {
podResources.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
podResources.Containers = append(podResources.Containers, p.getContainerResources(pod, &container))
podResources.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
podResources.Containers = append(podResources.Containers, p.getContainerResources(pod, &container))
}
for _, container := range pod.Spec.Containers {
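The SidecarContainers feature-gate check around this block appears to be dropped above, so restartable init containers are now always reported. For reference, a restartable init container (sidecar) is one whose restartPolicy is Always; a hedged sketch of the check, mirroring what podutil.IsRestartableInitContainer is assumed to do:
func isRestartableInitContainer(c *v1.Container) bool { // c from k8s.io/api/core/v1
	return c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways
}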

View File

@ -39,7 +39,9 @@ import (
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/manager"
"github.com/google/cadvisor/utils/sysfs"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/features"
"k8s.io/utils/ptr"
)
@ -93,6 +95,10 @@ func New(imageFsInfoProvider ImageFsInfoProvider, rootPath string, cgroupRoots [
cadvisormetrics.OOMMetrics: struct{}{},
}
if utilfeature.DefaultFeatureGate.Enabled(features.KubeletPSI) {
includedMetrics[cadvisormetrics.PressureMetrics] = struct{}{}
}
if usingLegacyStats || localStorageCapacityIsolation {
includedMetrics[cadvisormetrics.DiskUsageMetrics] = struct{}{}
}

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package cadvisor provides an interface for Kubelet interactions with cAdvisor.
package cadvisor // import "k8s.io/kubernetes/pkg/kubelet/cadvisor"
package cadvisor

View File

@ -6,6 +6,7 @@ approvers:
- derekwaynecarr
- yujuhong
- klueska
- ffromani
reviewers:
- sig-node-reviewers
emeritus_approvers:

View File

@ -25,11 +25,10 @@ import (
"sync"
"time"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/runc/libcontainer/cgroups/manager"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
libcontainercgroups "github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/cgroups/manager"
cgroupsystemd "github.com/opencontainers/cgroups/systemd"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
@ -195,14 +194,14 @@ func (m *cgroupCommon) buildCgroupPaths(name CgroupName) map[string]string {
}
// libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config.
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup {
config := &libcontainerconfigs.Cgroup{
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainercgroups.Cgroup {
config := &libcontainercgroups.Cgroup{
Systemd: m.useSystemd,
}
if needResources {
config.Resources = m.toResources(in.ResourceParameters)
} else {
config.Resources = &libcontainerconfigs.Resources{}
config.Resources = &libcontainercgroups.Resources{}
}
if !config.Systemd {
@ -279,8 +278,8 @@ var (
availableRootControllers sets.Set[string]
)
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
resources := &libcontainerconfigs.Resources{
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainercgroups.Resources {
resources := &libcontainercgroups.Resources{
SkipDevices: true,
SkipFreezeOnSet: true,
}
@ -324,7 +323,7 @@ func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainer
return resources
}
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) {
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainercgroups.Resources) {
// Check if hugetlb is supported.
if libcontainercgroups.IsCgroup2UnifiedMode() {
if !getSupportedUnifiedControllers().Has("hugetlb") {
@ -344,7 +343,7 @@ func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources
klog.InfoS("Invalid pageSize", "err", err)
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainercgroups.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
@ -355,7 +354,7 @@ func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainercgroups.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})

View File

@ -22,8 +22,8 @@ import (
"strconv"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroups "github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
)

View File

@ -24,13 +24,17 @@ import (
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const cgroupv2MemLimitFile string = "memory.max"
const (
cgroupv2MemLimitFile = "memory.max"
cgroupv2CpuMaxFile = "cpu.max"
cgroupv2CpuWeightFile = "cpu.weight"
)
// cgroupV2impl implements the CgroupManager interface
// for cgroup v2.
@ -100,14 +104,14 @@ func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName
func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
var cpuLimitStr, cpuPeriodStr string
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, cgroupv2CpuMaxFile)
if err != nil {
return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err)
return nil, fmt.Errorf("failed to read %s file for cgroup %v: %w", cgroupv2CpuMaxFile, cgroupPath, err)
}
numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr)
if errScan != nil || numItems != 2 {
return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w",
cpuLimitAndPeriod, cgroupPath, errScan)
return nil, fmt.Errorf("failed to correctly parse content of %s file ('%s') for cgroup %v: %w",
cgroupv2CpuMaxFile, cpuLimitAndPeriod, cgroupPath, errScan)
}
cpuLimit := int64(-1)
if cpuLimitStr != Cgroup2MaxCpuLimit {
@ -120,7 +124,7 @@ func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, e
if errPeriod != nil {
return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, errPeriod)
}
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, cgroupv2CpuWeightFile)
if errWeight != nil {
return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, errWeight)
}
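A worked example of the cpu.max parsing above: a cgroup without a CPU limit exposes "max 100000", which Sscanf splits into a quota token and a period token; the "max" token is then mapped to the unlimited (-1) quota.
var limitStr, periodStr string
n, err := fmt.Sscanf("max 100000", "%s %s", &limitStr, &periodStr)
// n == 2, err == nil, limitStr == "max", periodStr == "100000"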

View File

@ -31,6 +31,7 @@ import (
v1 "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
"k8s.io/klog/v2"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
@ -154,6 +155,13 @@ type ContainerManager interface {
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
// PodHasExclusiveCPUs returns true if the provided pod has containers with exclusive CPUs.
// This means that at least one sidecar container or one app container has exclusive CPUs allocated.
PodHasExclusiveCPUs(pod *v1.Pod) bool
// ContainerHasExclusiveCPUs returns true if the provided container in the pod has exclusive CPUs allocated.
ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool
// Implements the PodResources Provider API
podresources.CPUsProvider
podresources.DevicesProvider
@ -161,6 +169,10 @@ type ContainerManager interface {
podresources.DynamicResourcesProvider
}
type cpuAllocationReader interface {
GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet
}
type NodeConfig struct {
NodeName types.NodeName
RuntimeCgroupsName string
@ -174,19 +186,19 @@ type NodeConfig struct {
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
ExperimentalMemoryManagerPolicy string
ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
MemoryManagerPolicy string
MemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
}
type NodeAllocatableConfig struct {
@ -212,6 +224,30 @@ func int64Slice(in []int) []int64 {
return out
}
func podHasExclusiveCPUs(cr cpuAllocationReader, pod *v1.Pod) bool {
for _, container := range pod.Spec.InitContainers {
if containerHasExclusiveCPUs(cr, pod, &container) {
return true
}
}
for _, container := range pod.Spec.Containers {
if containerHasExclusiveCPUs(cr, pod, &container) {
return true
}
}
klog.V(4).InfoS("Pod contains no container with pinned cpus", "podName", pod.Name)
return false
}
func containerHasExclusiveCPUs(cr cpuAllocationReader, pod *v1.Pod, container *v1.Container) bool {
exclusiveCPUs := cr.GetExclusiveCPUs(string(pod.UID), container.Name)
if !exclusiveCPUs.IsEmpty() {
klog.V(4).InfoS("Container has pinned cpus", "podName", pod.Name, "containerName", container.Name)
return true
}
return false
}
// parsePercentage parses the percentage string to numeric value.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {
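Usage sketch for the new interface methods above; the caller and the follow-up action are assumptions for illustration.
if cm.PodHasExclusiveCPUs(pod) { // cm is a ContainerManager
	// at least one app or restartable init container has pinned CPUs,
	// so treat the pod accordingly (e.g. in QoS/cgroup handling)
}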

View File

@ -27,9 +27,8 @@ import (
"sync"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/cgroups"
"github.com/opencontainers/cgroups/manager"
"k8s.io/klog/v2"
"k8s.io/mount-utils"
utilpath "k8s.io/utils/path"
@ -336,10 +335,10 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
cm.topologyManager.AddHintProvider(cm.cpuManager)
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
nodeConfig.MemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.MemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
@ -365,7 +364,8 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
enforceCPULimits: cm.EnforceCPULimits,
// cpuCFSQuotaPeriod is in microseconds. NodeConfig.CPUCFSQuotaPeriod is time.Duration (measured in nano seconds).
// Convert (cm.CPUCFSQuotaPeriod) [nanoseconds] / time.Microsecond (1000) to get cpuCFSQuotaPeriod in microseconds.
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
cpuCFSQuotaPeriod: uint64(cm.CPUCFSQuotaPeriod / time.Microsecond),
podContainerManager: cm,
}
}
return &podContainerManagerNoop{
@ -373,16 +373,24 @@ func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
}
}
func (cm *containerManagerImpl) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return podHasExclusiveCPUs(cm.cpuManager, pod)
}
func (cm *containerManagerImpl) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return containerHasExclusiveCPUs(cm.cpuManager, pod, container)
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}
// Create a cgroup container manager.
func createManager(containerName string) (cgroups.Manager, error) {
cg := &configs.Cgroup{
cg := &cgroups.Cgroup{
Parent: "/",
Name: containerName,
Resources: &configs.Resources{
Resources: &cgroups.Resources{
SkipDevices: true,
},
Systemd: false,

View File

@ -195,6 +195,14 @@ func (cm *containerManagerStub) Updates() <-chan resourceupdates.Update {
return nil
}
func (cm *containerManagerStub) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return false
}
func (cm *containerManagerStub) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return false
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
}

View File

@ -168,10 +168,10 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
klog.InfoS("Creating memory manager")
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
nodeConfig.MemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.MemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
@ -369,3 +369,11 @@ func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, p
func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *containerManagerImpl) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return podHasExclusiveCPUs(cm.cpuManager, pod)
}
func (cm *containerManagerImpl) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return containerHasExclusiveCPUs(cm.cpuManager, pod, container)
}

View File

@ -18,6 +18,7 @@ package containermap
import (
"fmt"
"maps"
)
// cmItem (ContainerMap ITEM) is a pair podUID, containerName
@ -36,11 +37,7 @@ func NewContainerMap() ContainerMap {
// Clone creates a deep copy of the ContainerMap
func (cm ContainerMap) Clone() ContainerMap {
ret := make(ContainerMap, len(cm))
for key, val := range cm {
ret[key] = val
}
return ret
return maps.Clone(cm)
}
// Add adds a mapping of (containerID)->(podUID, containerName) to the ContainerMap

View File

@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
"maps"
"math"
"sort"
@ -39,11 +40,7 @@ const (
type mapIntInt map[int]int
func (m mapIntInt) Clone() mapIntInt {
cp := make(mapIntInt, len(m))
for k, v := range m {
cp[k] = v
}
return cp
return maps.Clone(m)
}
func (m mapIntInt) Keys() []int {

View File

@ -239,6 +239,8 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
return err
}
klog.V(4).InfoS("CPU manager started", "policy", m.policy.Name())
m.allocatableCPUs = m.policy.GetAllocatableCPUs(m.state)
if m.policy.Name() == string(PolicyNone) {
@ -465,7 +467,7 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
if cset.IsEmpty() {
// NOTE: This should not happen outside of tests.
klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name)
klog.V(2).InfoS("ReconcileState: skipping container; empty cpuset assigned", "pod", klog.KObj(pod), "containerName", container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}

View File

@ -39,16 +39,17 @@ const (
var (
alphaOptions = sets.New[string](
DistributeCPUsAcrossNUMAOption,
AlignBySocketOption,
DistributeCPUsAcrossCoresOption,
StrictCPUReservationOption,
PreferAlignByUnCoreCacheOption,
)
betaOptions = sets.New[string](
StrictCPUReservationOption,
DistributeCPUsAcrossNUMAOption,
)
stableOptions = sets.New[string](
FullPCPUsOnlyOption,
)
stableOptions = sets.New[string]()
)
// CheckPolicyOptionAvailable verifies if the given option can be used depending on the Feature Gate Settings.
@ -66,6 +67,7 @@ func CheckPolicyOptionAvailable(option string) error {
return fmt.Errorf("CPU Manager Policy Beta-level Options not enabled, but option %q provided", option)
}
// if the option is stable, we need no CPUManagerPolicy*Options feature gate check
return nil
}
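With the promotion above, "full-pcpus-only" is stable and needs no CPUManagerPolicy*Options feature gate, while "distribute-cpus-across-numa" and "strict-cpu-reservation" move to beta. A usage sketch (the error handling is illustrative):
if err := CheckPolicyOptionAvailable(FullPCPUsOnlyOption); err != nil {
	// option rejected: the required CPUManagerPolicy*Options feature gate is not enabled
	return err
}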

View File

@ -18,6 +18,7 @@ package cpumanager
import (
"fmt"
"strconv"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
@ -325,13 +326,16 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
defer func() {
if rerr != nil {
metrics.CPUManagerPinningErrorsTotal.Inc()
if p.options.FullPhysicalCPUsOnly {
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}
return
}
if !p.options.FullPhysicalCPUsOnly {
// TODO: move in updateMetricsOnAllocate
if p.options.FullPhysicalCPUsOnly {
// increment only if we know we allocate aligned resources
return
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}()
if p.options.FullPhysicalCPUsOnly {
@ -367,8 +371,8 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
}
}
}
if cpuset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cpuset)
if cset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cset)
klog.InfoS("Static policy: container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -378,16 +382,17 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
klog.InfoS("Topology Affinity", "pod", klog.KObj(pod), "containerName", container.Name, "affinity", hint)
// Allocate CPUs according to the NUMA affinity contained in the hint.
cpuset, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
cpuAllocation, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
if err != nil {
klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
return err
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
p.updateMetricsOnAllocate(cpuset)
s.SetCPUSet(string(pod.UID), container.Name, cpuAllocation.CPUs)
p.updateCPUsToReuse(pod, container, cpuAllocation.CPUs)
p.updateMetricsOnAllocate(s, cpuAllocation)
klog.V(4).InfoS("Allocated exclusive CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpuset", cpuAllocation.CPUs.String())
return nil
}
@ -412,18 +417,19 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
p.updateMetricsOnRelease(toRelease)
p.updateMetricsOnRelease(s, toRelease)
}
return nil
}
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (cpuset.CPUSet, error) {
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (topology.Allocation, error) {
klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "socket", numaAffinity)
allocatableCPUs := p.GetAvailableCPUs(s).Union(reusableCPUs)
// If there are aligned CPUs in numaAffinity, attempt to take those first.
result := cpuset.New()
result := topology.EmptyAllocation()
if numaAffinity != nil {
alignedCPUs := p.getAlignedCPUs(numaAffinity, allocatableCPUs)
@ -432,30 +438,33 @@ func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bit
numAlignedToAlloc = numCPUs
}
alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
allocatedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
if err != nil {
return cpuset.New(), err
return topology.EmptyAllocation(), err
}
result = result.Union(alignedCPUs)
result.CPUs = result.CPUs.Union(allocatedCPUs)
}
// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result.CPUs), numCPUs-result.CPUs.Size())
if err != nil {
return cpuset.New(), err
return topology.EmptyAllocation(), err
}
result = result.Union(remainingCPUs)
result.CPUs = result.CPUs.Union(remainingCPUs)
result.Aligned = p.topology.CheckAlignment(result.CPUs)
// Remove allocated CPUs from the shared CPUSet.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result.CPUs))
klog.InfoS("AllocateCPUs", "result", result)
klog.InfoS("AllocateCPUs", "result", result.String())
return result, nil
}
func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
qos := v1qos.GetPodQOS(pod)
if qos != v1.PodQOSGuaranteed {
klog.V(5).InfoS("Exclusive CPU allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos)
return 0
}
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
@ -464,11 +473,19 @@ func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
containerStatuses := pod.Status.ContainerStatuses
if podutil.IsRestartableInitContainer(container) {
if len(pod.Status.InitContainerStatuses) != 0 {
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)
}
}
if cs, ok := podutil.GetContainerStatus(containerStatuses, container.Name); ok {
cpuQuantity = cs.AllocatedResources[v1.ResourceCPU]
}
}
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
cpuValue := cpuQuantity.Value()
if cpuValue*1000 != cpuQuantity.MilliValue() {
klog.V(5).InfoS("Exclusive CPU allocation skipped, pod requested non-integral CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "cpu", cpuValue)
return 0
}
// Safe downcast to do for all systems with < 2.1 billion CPUs.
@ -740,27 +757,60 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
func (p *staticPolicy) initializeMetrics(s state.State) {
metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Add(0) // ensure the value exists
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(totalAssignedCPUs.Size()))
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
}
func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
ncpus := cset.Size()
func (p *staticPolicy) updateMetricsOnAllocate(s state.State, cpuAlloc topology.Allocation) {
ncpus := cpuAlloc.CPUs.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
if cpuAlloc.Aligned.UncoreCache {
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedUncoreCache).Inc()
}
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs)
}
func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
func (p *staticPolicy) updateMetricsOnRelease(s state.State, cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
totalAssignedCPUs := getTotalAssignedExclusiveCPUs(s)
updateAllocationPerNUMAMetric(p.topology, totalAssignedCPUs.Difference(cset))
}
func countExclusiveCPUs(s state.State) int {
exclusiveCPUs := 0
for _, cpuAssign := range s.GetCPUAssignments() {
for _, cset := range cpuAssign {
exclusiveCPUs += cset.Size()
func getTotalAssignedExclusiveCPUs(s state.State) cpuset.CPUSet {
totalAssignedCPUs := cpuset.New()
for _, assignment := range s.GetCPUAssignments() {
for _, cset := range assignment {
totalAssignedCPUs = totalAssignedCPUs.Union(cset)
}
}
return totalAssignedCPUs
}
func updateAllocationPerNUMAMetric(topo *topology.CPUTopology, allocatedCPUs cpuset.CPUSet) {
numaCount := make(map[int]int)
// Count CPUs allocated per NUMA node
for _, cpuID := range allocatedCPUs.UnsortedList() {
numaNode, err := topo.CPUNUMANodeID(cpuID)
if err != nil {
// NOTE: We log the error, but it is highly unlikely to happen as the cpuset
// is already computed and validated, and there is no room for user tampering.
klog.ErrorS(err, "Unable to determine NUMA node", "cpuID", cpuID)
}
numaCount[numaNode]++
}
// Update metric
for numaNode, count := range numaCount {
metrics.CPUManagerAllocationPerNUMA.WithLabelValues(strconv.Itoa(numaNode)).Set(float64(count))
}
return exclusiveCPUs
}
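A worked example of the integral-CPU check in guaranteedCPUs above, using k8s.io/apimachinery/pkg/api/resource: Quantity.Value() rounds up to whole CPUs, so it only equals MilliValue()/1000 for integral requests.
q := resource.MustParse("1500m")
_ = q.Value()*1000 == q.MilliValue() // false: 2*1000 != 1500, no exclusive CPUs
q = resource.MustParse("2")
_ = q.Value()*1000 == q.MilliValue() // true: 2000 == 2000, eligible for exclusive CPUs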

View File

@ -201,7 +201,7 @@ func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset c
sc.cache.SetCPUSet(podUID, containerName, cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -212,7 +212,7 @@ func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) {
sc.cache.SetDefaultCPUSet(cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -223,7 +223,7 @@ func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) {
sc.cache.SetCPUAssignments(a)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -234,7 +234,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -245,6 +245,6 @@ func (sc *stateCheckpoint) ClearState() {
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}

View File

@ -0,0 +1,78 @@
/*
Copyright 2025 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
"k8s.io/utils/cpuset"
)
// Alignment is metadata about a cpuset allocation
type Alignment struct {
// UncoreCache is true if all the CPUs are uncore-cache aligned,
// in other words, if they all share the same uncore cache block.
// If the allocated CPU count is greater than an uncore group size,
// the CPUs can't be uncore-aligned; otherwise, they are.
// This flag tracks alignment, not interference or lack thereof.
UncoreCache bool
}
func (ca Alignment) String() string {
return fmt.Sprintf("aligned=<uncore:%v>", ca.UncoreCache)
}
// Allocation represents a CPU set plus alignment metadata
type Allocation struct {
CPUs cpuset.CPUSet
Aligned Alignment
}
func (ca Allocation) String() string {
return ca.CPUs.String() + " " + ca.Aligned.String()
}
// EmptyAllocation returns a new zero-valued CPU allocation. Please note that
// an empty cpuset is considered aligned in every possible way.
func EmptyAllocation() Allocation {
return Allocation{
CPUs: cpuset.New(),
Aligned: Alignment{
UncoreCache: true,
},
}
}
func isAlignedAtUncoreCache(topo *CPUTopology, cpuList ...int) bool {
if len(cpuList) <= 1 {
return true
}
reference, ok := topo.CPUDetails[cpuList[0]]
if !ok {
return false
}
for _, cpu := range cpuList[1:] {
info, ok := topo.CPUDetails[cpu]
if !ok {
return false
}
if info.UncoreCacheID != reference.UncoreCacheID {
return false
}
}
return true
}
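Usage sketch tying this new file to the CheckAlignment helper added in topology.go below; topo is an assumed *CPUTopology and the CPU IDs are illustrative.
alloc := Allocation{CPUs: cpuset.New(0, 1)} // k8s.io/utils/cpuset
alloc.Aligned = topo.CheckAlignment(alloc.CPUs)
// alloc.Aligned.UncoreCache is true only if CPUs 0 and 1 share an uncore cache block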

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
package topology

View File

@ -101,6 +101,15 @@ func (topo *CPUTopology) CPUNUMANodeID(cpu int) (int, error) {
return info.NUMANodeID, nil
}
// CheckAlignment returns alignment information for the given cpuset in
// the context of the current CPU topology
func (topo *CPUTopology) CheckAlignment(cpus cpuset.CPUSet) Alignment {
cpuList := cpus.UnsortedList()
return Alignment{
UncoreCache: isAlignedAtUncoreCache(topo, cpuList...),
}
}
// CPUInfo contains the NUMA, socket, UncoreCache and core IDs associated with a CPU.
type CPUInfo struct {
NUMANodeID int

View File

@ -202,15 +202,12 @@ func (m *ManagerImpl) CleanupPluginDirectory(dir string) error {
if filePath == m.checkpointFile() {
continue
}
// TODO: Until the bug - https://github.com/golang/go/issues/33357 is fixed, os.stat wouldn't return the
// right mode(socket) on windows. Hence deleting the file, without checking whether
// its a socket, on windows.
stat, err := os.Lstat(filePath)
stat, err := os.Stat(filePath)
if err != nil {
klog.ErrorS(err, "Failed to stat file", "path", filePath)
continue
}
if stat.IsDir() {
if stat.IsDir() || stat.Mode()&os.ModeSocket == 0 {
continue
}
err = os.RemoveAll(filePath)
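A sketch of the effect of the new check above: only unix sockets left behind by device plugins are removed, while regular files and directories are skipped. The path is hypothetical.
info, err := os.Stat("/var/lib/kubelet/device-plugins/example.sock")
if err == nil && !info.IsDir() && info.Mode()&os.ModeSocket != 0 {
	_ = os.RemoveAll("/var/lib/kubelet/device-plugins/example.sock")
}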
@ -351,7 +348,7 @@ func (m *ManagerImpl) Start(activePods ActivePodsFunc, sourcesReady config.Sourc
// Loads in allocatedDevices information from disk.
err := m.readCheckpoint()
if err != nil {
klog.InfoS("Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date", "err", err)
klog.ErrorS(err, "Continue after failing to read checkpoint file. Device allocation info may NOT be up-to-date")
}
return m.server.Start()
@ -453,7 +450,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
// should always be consistent. Otherwise, we run with the risk
// of failing to garbage collect non-existing resources or devices.
if !ok {
klog.ErrorS(nil, "Unexpected: healthyDevices and endpoints are out of sync")
klog.InfoS("Unexpected: healthyDevices and endpoints are out of sync")
}
delete(m.endpoints, resourceName)
delete(m.healthyDevices, resourceName)
@ -468,7 +465,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
eI, ok := m.endpoints[resourceName]
if (ok && eI.e.stopGracePeriodExpired()) || !ok {
if !ok {
klog.ErrorS(nil, "Unexpected: unhealthyDevices and endpoints are out of sync")
klog.InfoS("Unexpected: unhealthyDevices and endpoints became out of sync")
}
delete(m.endpoints, resourceName)
delete(m.unhealthyDevices, resourceName)
@ -484,7 +481,7 @@ func (m *ManagerImpl) GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
m.mutex.Unlock()
if needsUpdateCheckpoint {
if err := m.writeCheckpoint(); err != nil {
klog.ErrorS(err, "Error on writing checkpoint")
klog.ErrorS(err, "Failed to write checkpoint file")
}
}
return capacity, allocatable, deletedResources.UnsortedList()
@ -503,9 +500,10 @@ func (m *ManagerImpl) writeCheckpoint() error {
err := m.checkpointManager.CreateCheckpoint(kubeletDeviceManagerCheckpoint, data)
if err != nil {
err2 := fmt.Errorf("failed to write checkpoint file %q: %v", kubeletDeviceManagerCheckpoint, err)
klog.InfoS("Failed to write checkpoint file", "err", err)
klog.ErrorS(err, "Failed to write checkpoint file")
return err2
}
klog.V(4).InfoS("Checkpoint file written", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
@ -516,7 +514,7 @@ func (m *ManagerImpl) readCheckpoint() error {
if err != nil {
if err == errors.ErrCheckpointNotFound {
// no point in trying anything else
klog.InfoS("Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint, "err", err)
klog.ErrorS(err, "Failed to read data from checkpoint", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
return err
@ -534,6 +532,8 @@ func (m *ManagerImpl) readCheckpoint() error {
m.unhealthyDevices[resource] = sets.New[string]()
m.endpoints[resource] = endpointInfo{e: newStoppedEndpointImpl(resource), opts: nil}
}
klog.V(4).InfoS("Read data from checkpoint file", "checkpoint", kubeletDeviceManagerCheckpoint)
return nil
}
@ -596,7 +596,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// running, then it can only be a kubelet restart. On node reboot the runtime and the containers were also shut down. Then, if the container was running, it can only be
// because it already has access to all the required devices, so we got nothing to do and we can bail out.
if !m.sourcesReady.AllReady() && m.isContainerAlreadyRunning(podUID, contName) {
klog.V(3).InfoS("container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
klog.V(3).InfoS("Container detected running, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
return nil, nil
}
@ -627,7 +627,7 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
// We handled the known error paths in scenario 3 (node reboot), so from now on we can fall back in a common path.
// We cover container restart on kubelet steady state with the same flow.
if needed == 0 {
klog.V(3).InfoS("no devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
klog.V(3).InfoS("No devices needed, nothing to do", "deviceNumber", needed, "resourceName", resource, "podUID", podUID, "containerName", contName)
// No change, no work.
return nil, nil
}
@ -836,7 +836,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
for k, v := range container.Resources.Limits {
resource := string(k)
needed := int(v.Value())
klog.V(3).InfoS("Looking for needed resources", "needed", needed, "resourceName", resource)
klog.V(3).InfoS("Looking for needed resources", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "needed", needed)
if !m.isDevicePluginResource(resource) {
continue
}
@ -882,7 +882,7 @@ func (m *ManagerImpl) allocateContainerResources(pod *v1.Pod, container *v1.Cont
devs := allocDevices.UnsortedList()
// TODO: refactor this part of code to just append a ContainerAllocationRequest
// in a passed in AllocateRequest pointer, and issues a single Allocate call per pod.
klog.V(3).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource)
klog.V(4).InfoS("Making allocation request for device plugin", "devices", devs, "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
resp, err := eI.e.allocate(devs)
metrics.DevicePluginAllocationDuration.WithLabelValues(resource).Observe(metrics.SinceInSeconds(startRPCTime))
if err != nil {
@ -952,7 +952,7 @@ func (m *ManagerImpl) GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Co
}
if !m.checkPodActive(pod) {
klog.ErrorS(nil, "pod deleted from activePods, skip to reAllocate", "podUID", podUID)
klog.V(5).InfoS("Pod deleted from activePods, skip to reAllocate", "pod", klog.KObj(pod), "podUID", podUID, "containerName", container.Name)
continue
}
@ -984,7 +984,7 @@ func (m *ManagerImpl) callPreStartContainerIfNeeded(podUID, contName, resource s
if eI.opts == nil || !eI.opts.PreStartRequired {
m.mutex.Unlock()
klog.V(4).InfoS("Plugin options indicate to skip PreStartContainer for resource", "resourceName", resource)
klog.V(5).InfoS("Plugin options indicate to skip PreStartContainer for resource", "podUID", podUID, "resourceName", resource, "containerName", contName)
return nil
}
@ -1014,12 +1014,12 @@ func (m *ManagerImpl) callGetPreferredAllocationIfAvailable(podUID, contName, re
}
if eI.opts == nil || !eI.opts.GetPreferredAllocationAvailable {
klog.V(4).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource)
klog.V(5).InfoS("Plugin options indicate to skip GetPreferredAllocation for resource", "resourceName", resource, "podUID", podUID, "containerName", contName)
return nil, nil
}
m.mutex.Unlock()
klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "containerName", contName, "podUID", podUID)
klog.V(4).InfoS("Issuing a GetPreferredAllocation call for container", "resourceName", resource, "containerName", contName, "podUID", podUID)
resp, err := eI.e.getPreferredAllocation(available.UnsortedList(), mustInclude.UnsortedList(), size)
m.mutex.Lock()
if err != nil {
@ -1167,7 +1167,7 @@ func (m *ManagerImpl) ShouldResetExtendedResourceCapacity() bool {
func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
cntID, err := m.containerMap.GetContainerID(podUID, cntName)
if err != nil {
klog.V(4).InfoS("container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName, "err", err)
klog.ErrorS(err, "Container not found in the initial map, assumed NOT running", "podUID", podUID, "containerName", cntName)
return false
}
@ -1175,11 +1175,11 @@ func (m *ManagerImpl) isContainerAlreadyRunning(podUID, cntName string) bool {
// so on kubelet restart containers will again fail admission, hitting https://github.com/kubernetes/kubernetes/issues/118559 again.
// This scenario should however be rare enough.
if !m.containerRunningSet.Has(cntID) {
klog.V(4).InfoS("container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
klog.V(4).InfoS("Container not present in the initial running set", "podUID", podUID, "containerName", cntName, "containerID", cntID)
return false
}
// Once we make it here we know we have a running container.
klog.V(4).InfoS("container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
klog.V(4).InfoS("Container found in the initial set, assumed running", "podUID", podUID, "containerName", cntName, "containerID", cntID)
return true
}

View File

@ -106,6 +106,8 @@ func (c *client) Disconnect() error {
}
c.mutex.Unlock()
c.handler.PluginDisconnected(c.resource)
klog.V(2).InfoS("Device plugin disconnected", "resource", c.resource)
return nil
}

View File

@ -43,8 +43,8 @@ func (s *server) RegisterPlugin(pluginName string, endpoint string, versions []s
return s.connectClient(pluginName, endpoint)
}
func (s *server) DeRegisterPlugin(pluginName string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName)
func (s *server) DeRegisterPlugin(pluginName, endpoint string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName, "endpoint", endpoint)
client := s.getClient(pluginName)
if client != nil {
s.disconnectClient(pluginName, client)
@ -62,6 +62,7 @@ func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []s
return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
}
klog.V(2).InfoS("Device plugin validated", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
return nil
}
@ -75,6 +76,7 @@ func (s *server) connectClient(name string, socketPath string) error {
return err
}
klog.V(2).InfoS("Connected to new client", "resource", name)
go func() {
s.runClient(name, c)
}()
@ -86,7 +88,6 @@ func (s *server) disconnectClient(name string, c Client) error {
s.deregisterClient(name)
return c.Disconnect()
}
func (s *server) registerClient(name string, c Client) {
s.mutex.Lock()
defer s.mutex.Unlock()
@ -112,7 +113,7 @@ func (s *server) runClient(name string, c Client) {
}
if err := s.disconnectClient(name, c); err != nil {
klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
klog.ErrorS(err, "Unable to disconnect client", "resource", name, "client", c)
}
}

View File

@ -91,7 +91,7 @@ func (s *server) Start() error {
if selinux.GetEnabled() {
if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
klog.ErrorS(err, "Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir)
}
}
@ -128,7 +128,7 @@ func (s *server) Start() error {
func (s *server) Stop() error {
s.visitClients(func(r string, c Client) {
if err := s.disconnectClient(r, c); err != nil {
klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
klog.ErrorS(err, "Failed to disconnect device plugin client", "resourceName", r)
}
})
@ -145,6 +145,7 @@ func (s *server) Stop() error {
// During kubelet termination, we do not need the registration server,
// and we consider the kubelet to be healthy even when it is down.
s.setHealthy()
klog.V(2).InfoS("Stopping device plugin registration server")
return nil
}
@ -159,18 +160,18 @@ func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Emp
if !s.isVersionCompatibleWithPlugin(r.Version) {
err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
klog.ErrorS(err, "Bad registration request from device plugin with resource", "resourceName", r.ResourceName)
return &api.Empty{}, err
}
if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
klog.InfoS("Bad registration request from device plugin", "err", err)
klog.ErrorS(err, "Bad registration request from device plugin")
return &api.Empty{}, err
}
if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
klog.InfoS("Error connecting to device plugin client", "err", err)
klog.ErrorS(err, "Error connecting to device plugin client")
return &api.Empty{}, err
}

View File

@ -17,6 +17,7 @@ limitations under the License.
package devicemanager
import (
"maps"
"sync"
"k8s.io/klog/v2"
@ -429,10 +430,7 @@ func NewResourceDeviceInstances() ResourceDeviceInstances {
func (rdev ResourceDeviceInstances) Clone() ResourceDeviceInstances {
clone := NewResourceDeviceInstances()
for resourceName, resourceDevs := range rdev {
clone[resourceName] = make(map[string]pluginapi.Device)
for devID, dev := range resourceDevs {
clone[resourceName][devID] = dev
}
clone[resourceName] = maps.Clone(resourceDevs)
}
return clone
}
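For reference, maps.Clone (Go standard library, 1.21+) performs the same shallow per-entry copy as the loop it replaces here. A minimal standalone sketch, using plain string maps rather than the kubelet's device types, just to show that the clone is an independent map:

package main

import (
    "fmt"
    "maps"
)

func main() {
    devs := map[string]string{"dev0": "Healthy", "dev1": "Unhealthy"}
    clone := maps.Clone(devs)

    // Mutating the clone leaves the original untouched, which is the same
    // guarantee the hand-written copy loop gave.
    clone["dev0"] = "Unhealthy"
    fmt.Println(devs["dev0"], clone["dev0"]) // Healthy Unhealthy
}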

View File

@ -43,7 +43,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested)
deviceHints[resource] = nil
continue
}
@ -54,11 +54,11 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name)
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
@ -67,7 +67,7 @@ func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map
available := m.getAvailableDevices(resource)
reusable := m.devicesToReuse[string(pod.UID)][resource]
if available.Union(reusable).Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "available", available.Union(reusable).Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
@ -94,7 +94,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
klog.InfoS("Resource does not have a topology preference", "resourceName", resource, "pod", klog.KObj(pod), "request", requested)
deviceHints[resource] = nil
continue
}
@ -105,11 +105,11 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
allocated := m.podDevices.podDevices(string(pod.UID), resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
klog.InfoS("Resource already allocated to pod with different number than request", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resourceName", resource, "pod", klog.KObj(pod), "allocated", allocated.Len())
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
@ -117,7 +117,7 @@ func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymana
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
if available.Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
klog.InfoS("Unable to generate topology hints: requested number of devices unavailable", "resourceName", resource, "pod", klog.KObj(pod), "request", requested, "available", available.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}

View File

@ -18,4 +18,4 @@ limitations under the License.
// to manage containers. For example, they contain functions to configure containers' cgroups,
// ensure containers run with the desired QoS, and allocate compute resources like cpus, memory,
// devices...
package cm // import "k8s.io/kubernetes/pkg/kubelet/cm"
package cm

View File

@ -98,7 +98,20 @@ func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, n
}
func (m *ManagerImpl) GetWatcherHandler() cache.PluginHandler {
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode))
// The time that DRA drivers have to come back after being unregistered
// before the kubelet removes their ResourceSlices.
//
// This must be long enough to actually allow stopping a pod and
// starting the replacement (otherwise ResourceSlices get deleted
// unnecessarily) and not too long (otherwise the time window where
// pods might still get scheduled to the node after removal of a
// driver is too long).
//
// 30 seconds might be long enough for a simple container restart.
// If a DRA driver wants to be sure that slices don't get wiped,
// it should use rolling updates.
wipingDelay := 30 * time.Second
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode, wipingDelay))
}
// Start starts the reconcile loop of the manager.

View File

@ -18,13 +18,16 @@ package plugin
import (
"errors"
"fmt"
"slices"
"sync"
)
// PluginsStore holds a list of DRA Plugins.
type pluginsStore struct {
sync.RWMutex
store map[string]*Plugin
// plugin name -> Plugin in the order in which they got added
store map[string][]*Plugin
}
// draPlugins map keeps track of all registered DRA plugins on the node
@ -37,43 +40,57 @@ func (s *pluginsStore) get(pluginName string) *Plugin {
s.RLock()
defer s.RUnlock()
return s.store[pluginName]
instances := s.store[pluginName]
if len(instances) == 0 {
return nil
}
// Heuristic: pick the most recent one. It's most likely
// the newest, except when kubelet got restarted and registered
// all running plugins in random order.
return instances[len(instances)-1]
}
// Set lets you save a DRA Plugin to the list and give it a specific name.
// This method is protected by a mutex.
func (s *pluginsStore) add(p *Plugin) (replacedPlugin *Plugin, replaced bool) {
func (s *pluginsStore) add(p *Plugin) error {
s.Lock()
defer s.Unlock()
if s.store == nil {
s.store = make(map[string]*Plugin)
s.store = make(map[string][]*Plugin)
}
replacedPlugin, exists := s.store[p.name]
s.store[p.name] = p
if replacedPlugin != nil && replacedPlugin.cancel != nil {
replacedPlugin.cancel(errors.New("plugin got replaced"))
for _, oldP := range s.store[p.name] {
if oldP.endpoint == p.endpoint {
// One plugin instance cannot hijack the endpoint of another instance.
return fmt.Errorf("endpoint %s already registered for plugin %s", p.endpoint, p.name)
}
}
return replacedPlugin, exists
s.store[p.name] = append(s.store[p.name], p)
return nil
}
// Delete lets you delete a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) delete(pluginName string) *Plugin {
// remove lets you remove one endpoint for a DRA Plugin.
// This method is protected by a mutex. It returns the
// plugin if found and true if that was the last instance
func (s *pluginsStore) remove(pluginName, endpoint string) (*Plugin, bool) {
s.Lock()
defer s.Unlock()
p, exists := s.store[pluginName]
if !exists {
return nil
instances := s.store[pluginName]
i := slices.IndexFunc(instances, func(p *Plugin) bool { return p.endpoint == endpoint })
if i == -1 {
return nil, false
}
p := instances[i]
last := len(instances) == 1
if last {
delete(s.store, pluginName)
} else {
s.store[pluginName] = slices.Delete(instances, i, i+1)
}
if p.cancel != nil {
p.cancel(errors.New("plugin got removed"))
}
delete(s.store, pluginName)
return p
return p, last
}
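A condensed, self-contained sketch of the per-name instance bookkeeping above: one ordered slice of instances per plugin name, the newest instance preferred by get, and remove reporting whether the last instance went away. The plugin struct is trimmed to a name and an endpoint and locking is elided, so this illustrates the idea rather than the real kubelet type:

package main

import (
    "fmt"
    "slices"
)

// plugin is a stand-in for the kubelet's *Plugin, reduced to the two fields
// that matter for the store semantics; locking is omitted for brevity.
type plugin struct{ name, endpoint string }

// store maps a plugin name to its registered instances in arrival order.
var store = map[string][]*plugin{}

// add rejects a second registration for the same (name, endpoint) pair.
func add(p *plugin) error {
    for _, old := range store[p.name] {
        if old.endpoint == p.endpoint {
            return fmt.Errorf("endpoint %s already registered for plugin %s", p.endpoint, p.name)
        }
    }
    store[p.name] = append(store[p.name], p)
    return nil
}

// get prefers the most recently added instance, mirroring the heuristic above.
func get(name string) *plugin {
    instances := store[name]
    if len(instances) == 0 {
        return nil
    }
    return instances[len(instances)-1]
}

// remove drops one endpoint and reports whether it was the last instance.
func remove(name, endpoint string) (*plugin, bool) {
    instances := store[name]
    i := slices.IndexFunc(instances, func(p *plugin) bool { return p.endpoint == endpoint })
    if i == -1 {
        return nil, false
    }
    p := instances[i]
    last := len(instances) == 1
    if last {
        delete(store, name)
    } else {
        store[name] = slices.Delete(instances, i, i+1)
    }
    return p, last
}

func main() {
    _ = add(&plugin{"dra.example.com", "/run/a.sock"})
    _ = add(&plugin{"dra.example.com", "/run/b.sock"})
    fmt.Println(get("dra.example.com").endpoint) // /run/b.sock: newest wins
    _, last := remove("dra.example.com", "/run/b.sock")
    fmt.Println(last) // false: /run/a.sock is still registered
    _, last = remove("dra.example.com", "/run/a.sock")
    fmt.Println(last) // true: ResourceSlice cleanup may now be scheduled
}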

View File

@ -21,6 +21,7 @@ import (
"errors"
"fmt"
"slices"
"sync"
"time"
v1 "k8s.io/api/core/v1"
@ -51,8 +52,22 @@ type RegistrationHandler struct {
// This is necessary because it implements APIs which don't
// provide a context.
backgroundCtx context.Context
cancel func(err error)
kubeClient kubernetes.Interface
getNode func() (*v1.Node, error)
wipingDelay time.Duration
wg sync.WaitGroup
mutex sync.Mutex
// pendingWipes maps a plugin name to a cancel function for
// wiping of that plugin's ResourceSlices. Entries get added
// in DeRegisterPlugin and checked in RegisterPlugin. If
// wiping is pending during RegisterPlugin, it gets canceled.
//
// Must use pointers to functions because the entries have to
// be comparable.
pendingWipes map[string]*context.CancelCauseFunc
}
var _ cache.PluginHandler = &RegistrationHandler{}
@ -62,12 +77,20 @@ var _ cache.PluginHandler = &RegistrationHandler{}
// Must only be called once per process because it manages global state.
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins.
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *RegistrationHandler {
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error), wipingDelay time.Duration) *RegistrationHandler {
// The context and thus logger should come from the caller.
return newRegistrationHandler(context.TODO(), kubeClient, getNode, wipingDelay)
}
func newRegistrationHandler(ctx context.Context, kubeClient kubernetes.Interface, getNode func() (*v1.Node, error), wipingDelay time.Duration) *RegistrationHandler {
ctx, cancel := context.WithCancelCause(ctx)
handler := &RegistrationHandler{
// The context and thus logger should come from the caller.
backgroundCtx: klog.NewContext(context.TODO(), klog.LoggerWithName(klog.TODO(), "DRA registration handler")),
backgroundCtx: klog.NewContext(ctx, klog.LoggerWithName(klog.FromContext(ctx), "DRA registration handler")),
cancel: cancel,
kubeClient: kubeClient,
getNode: getNode,
wipingDelay: wipingDelay,
pendingWipes: make(map[string]*context.CancelCauseFunc),
}
// When kubelet starts up, no DRA driver has registered yet. None of
@ -77,19 +100,45 @@ func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1
// to start up.
//
// This has to run in the background.
go handler.wipeResourceSlices("")
handler.wg.Add(1)
go func() {
defer handler.wg.Done()
logger := klog.LoggerWithName(klog.FromContext(handler.backgroundCtx), "startup")
ctx := klog.NewContext(handler.backgroundCtx, logger)
handler.wipeResourceSlices(ctx, 0 /* no delay */, "" /* all drivers */)
}()
return handler
}
// Stop cancels any remaining background activities and blocks until all goroutines have stopped.
func (h *RegistrationHandler) Stop() {
h.cancel(errors.New("Stop was called"))
h.wg.Wait()
}
// wipeResourceSlices deletes ResourceSlices of the node, optionally just for a specific driver.
func (h *RegistrationHandler) wipeResourceSlices(driver string) {
// Wiping will delay for a while and can be canceled by canceling the context.
func (h *RegistrationHandler) wipeResourceSlices(ctx context.Context, delay time.Duration, driver string) {
if h.kubeClient == nil {
return
}
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
if delay != 0 {
// Before we start deleting, give the driver time to bounce back.
// Perhaps it got removed as part of a DaemonSet update and the
// replacement pod is about to start.
logger.V(4).Info("Starting to wait before wiping ResourceSlices", "delay", delay)
select {
case <-ctx.Done():
logger.V(4).Info("Aborting wiping of ResourceSlices", "reason", context.Cause(ctx))
return
case <-time.After(delay):
logger.V(4).Info("Starting to wipe ResourceSlices after waiting", "delay", delay)
}
}
backoff := wait.Backoff{
Duration: time.Second,
Factor: 2,
@ -148,10 +197,10 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
// into all log output related to the plugin.
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName, "endpoint", endpoint)
ctx = klog.NewContext(ctx, logger)
logger.V(3).Info("Register new DRA plugin", "endpoint", endpoint)
logger.V(3).Info("Register new DRA plugin")
chosenService, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
@ -179,9 +228,19 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
// Storing endpoint of newly registered DRA Plugin into the map, where plugin name will be the key
// all other DRA components will be able to get the actual socket of DRA plugins by its name.
if err := draPlugins.add(pluginInstance); err != nil {
cancel(err)
// No wrapping, the error already contains details.
return err
}
if oldPlugin, replaced := draPlugins.add(pluginInstance); replaced {
logger.V(1).Info("DRA plugin already registered, the old plugin was replaced and will be forgotten by the kubelet till the next kubelet restart", "oldEndpoint", oldPlugin.endpoint)
// Now cancel any pending ResourceSlice wiping for this plugin.
// Only needs to be done once.
h.mutex.Lock()
defer h.mutex.Unlock()
if cancel := h.pendingWipes[pluginName]; cancel != nil {
(*cancel)(errors.New("new plugin instance registered"))
delete(h.pendingWipes, pluginName)
}
return nil
@ -220,16 +279,53 @@ func (h *RegistrationHandler) validateSupportedServices(pluginName string, suppo
// DeRegisterPlugin is called when a plugin has removed its socket,
// signaling it is no longer available.
func (h *RegistrationHandler) DeRegisterPlugin(pluginName string) {
if p := draPlugins.delete(pluginName); p != nil {
func (h *RegistrationHandler) DeRegisterPlugin(pluginName, endpoint string) {
if p, last := draPlugins.remove(pluginName, endpoint); p != nil {
// This logger includes endpoint and pluginName.
logger := klog.FromContext(p.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin", "endpoint", p.endpoint)
logger.V(3).Info("Deregister DRA plugin", "lastInstance", last)
if !last {
return
}
// Prepare for canceling the background wiping. This needs to run
// in the context of the registration handler, because the plugin's
// own context is already canceled.
logger = klog.FromContext(h.backgroundCtx)
logger = klog.LoggerWithName(logger, "driver-cleanup")
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
ctx, cancel := context.WithCancelCause(h.backgroundCtx)
ctx = klog.NewContext(ctx, logger)
// Clean up the ResourceSlices for the deleted Plugin since it
// may have died without doing so itself and might never come
// back.
go h.wipeResourceSlices(pluginName)
//
// May get canceled if the plugin comes back quickly enough
// (see RegisterPlugin).
h.mutex.Lock()
defer h.mutex.Unlock()
if cancel := h.pendingWipes[pluginName]; cancel != nil {
(*cancel)(errors.New("plugin deregistered a second time"))
}
h.pendingWipes[pluginName] = &cancel
h.wg.Add(1)
go func() {
defer h.wg.Done()
defer func() {
h.mutex.Lock()
defer h.mutex.Unlock()
// Cancel our own context, but remove it from the map only if it
// is the current entry. Perhaps it already got replaced.
cancel(errors.New("wiping done"))
if h.pendingWipes[pluginName] == &cancel {
delete(h.pendingWipes, pluginName)
}
}()
h.wipeResourceSlices(ctx, h.wipingDelay, pluginName)
}()
return
}
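The registration/deregistration dance above reduces to a small reusable pattern: schedule delayed cleanup under a cancel function keyed by driver name, and let a later registration cancel it before the timer fires. The sketch below shows only that pattern; the names (scheduleWipe, cancelWipe) and the printed stand-in for ResourceSlice deletion are invented for illustration, not kubelet APIs:

package main

import (
    "context"
    "errors"
    "fmt"
    "sync"
    "time"
)

// wiper schedules delayed cleanup per driver name and lets a re-registration
// cancel it before it fires, mirroring the pendingWipes bookkeeping above.
type wiper struct {
    mu      sync.Mutex
    pending map[string]*context.CancelCauseFunc
    wg      sync.WaitGroup
}

func newWiper() *wiper {
    return &wiper{pending: map[string]*context.CancelCauseFunc{}}
}

// scheduleWipe runs the cleanup after delay unless it gets canceled first.
func (w *wiper) scheduleWipe(driver string, delay time.Duration) {
    ctx, cancel := context.WithCancelCause(context.Background())

    w.mu.Lock()
    if old := w.pending[driver]; old != nil {
        (*old)(errors.New("superseded by a newer deregistration"))
    }
    w.pending[driver] = &cancel
    w.mu.Unlock()

    w.wg.Add(1)
    go func() {
        defer w.wg.Done()
        defer func() {
            w.mu.Lock()
            defer w.mu.Unlock()
            cancel(errors.New("wiping done"))
            if w.pending[driver] == &cancel { // only remove our own entry
                delete(w.pending, driver)
            }
        }()
        select {
        case <-ctx.Done():
            fmt.Println(driver, "wipe canceled:", context.Cause(ctx))
            return
        case <-time.After(delay):
            fmt.Println(driver, "wiping resources now")
        }
    }()
}

// cancelWipe is what a new registration would call to keep resources intact.
func (w *wiper) cancelWipe(driver string) {
    w.mu.Lock()
    defer w.mu.Unlock()
    if cancel := w.pending[driver]; cancel != nil {
        (*cancel)(errors.New("driver registered again"))
        delete(w.pending, driver)
    }
}

func main() {
    w := newWiper()
    w.scheduleWipe("dra.example.com", 200*time.Millisecond)
    time.Sleep(50 * time.Millisecond)
    w.cancelWipe("dra.example.com") // driver came back quickly: no wipe
    w.scheduleWipe("other.example.com", 100*time.Millisecond)
    w.wg.Wait() // other.example.com wipes after its delay
}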

View File

@ -268,3 +268,11 @@ func (cm *FakeContainerManager) UpdateAllocatedResourcesStatus(pod *v1.Pod, stat
func (cm *FakeContainerManager) Updates() <-chan resourceupdates.Update {
return nil
}
func (cm *FakeContainerManager) PodHasExclusiveCPUs(pod *v1.Pod) bool {
return false
}
func (cm *FakeContainerManager) ContainerHasExclusiveCPUs(pod *v1.Pod, container *v1.Container) bool {
return false
}

View File

@ -23,7 +23,7 @@ import (
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"

View File

@ -205,6 +205,7 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
m.allocatableMemory = m.policy.GetAllocatableMemory(m.state)
klog.V(4).InfoS("memorymanager started", "policy", m.policy.Name())
return nil
}
@ -248,7 +249,7 @@ func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.
}
if numaNodes.Len() == 0 {
klog.V(5).InfoS("No allocation is available", "pod", klog.KObj(pod), "containerName", container.Name)
klog.V(5).InfoS("NUMA nodes not available for allocation", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -266,7 +267,7 @@ func (m *manager) Allocate(pod *v1.Pod, container *v1.Container) error {
// Call down into the policy to assign this container memory if required.
if err := m.policy.Allocate(m.state, pod, container); err != nil {
klog.ErrorS(err, "Allocate error")
klog.ErrorS(err, "Allocate error", "pod", klog.KObj(pod), "containerName", container.Name)
return err
}
return nil
@ -280,7 +281,7 @@ func (m *manager) RemoveContainer(containerID string) error {
// if error appears it means container entry already does not exist under the container map
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
klog.InfoS("Failed to get container from container map", "containerID", containerID, "err", err)
klog.ErrorS(err, "Failed to get container from container map", "containerID", containerID)
return nil
}
@ -344,7 +345,7 @@ func (m *manager) removeStaleState() {
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
}
@ -352,7 +353,7 @@ func (m *manager) removeStaleState() {
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
klog.V(2).InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
})

View File

@ -96,7 +96,9 @@ func (p *staticPolicy) Start(s state.State) error {
// Allocate call is idempotent
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
// allocate the memory only for guaranteed pods
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
qos := v1qos.GetPodQOS(pod)
if qos != v1.PodQOSGuaranteed {
klog.V(5).InfoS("Exclusive memory allocation skipped, pod QoS is not guaranteed", "pod", klog.KObj(pod), "containerName", container.Name, "qos", qos)
return nil
}
@ -196,6 +198,7 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
// TODO: we should refactor our state structs to reflect the amount of the re-used memory
p.updateInitContainersMemoryBlocks(s, pod, container, containerBlocks)
klog.V(4).InfoS("Allocated exclusive memory", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
@ -304,24 +307,24 @@ func regenerateHints(pod *v1.Pod, ctn *v1.Container, ctnBlocks []state.Block, re
}
if len(ctnBlocks) != len(reqRsrc) {
klog.ErrorS(nil, "The number of requested resources by the container differs from the number of memory blocks", "containerName", ctn.Name)
klog.InfoS("The number of requested resources by the container differs from the number of memory blocks", "pod", klog.KObj(pod), "containerName", ctn.Name)
return nil
}
for _, b := range ctnBlocks {
if _, ok := reqRsrc[b.Type]; !ok {
klog.ErrorS(nil, "Container requested resources do not have resource of this type", "containerName", ctn.Name, "type", b.Type)
klog.InfoS("Container requested resources but none available of this type", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
if b.Size != reqRsrc[b.Type] {
klog.ErrorS(nil, "Memory already allocated with different numbers than requested", "podUID", pod.UID, "type", b.Type, "containerName", ctn.Name, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
klog.InfoS("Memory already allocated with different numbers than requested", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type, "requestedResource", reqRsrc[b.Type], "allocatedSize", b.Size)
return nil
}
containerNUMAAffinity, err := bitmask.NewBitMask(b.NUMAAffinity...)
if err != nil {
klog.ErrorS(err, "Failed to generate NUMA bitmask")
klog.ErrorS(err, "Failed to generate NUMA bitmask", "pod", klog.KObj(pod), "containerName", ctn.Name, "type", b.Type)
return nil
}
@ -447,7 +450,13 @@ func getRequestedResources(pod *v1.Pod, container *v1.Container) (map[v1.Resourc
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
containerStatuses := pod.Status.ContainerStatuses
if podutil.IsRestartableInitContainer(container) {
if len(pod.Status.InitContainerStatuses) != 0 {
containerStatuses = append(containerStatuses, pod.Status.InitContainerStatuses...)
}
}
if cs, ok := podutil.GetContainerStatus(containerStatuses, container.Name); ok {
resources = cs.AllocatedResources
}
}
@ -654,36 +663,36 @@ func (p *staticPolicy) validateState(s state.State) error {
func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
if len(ms1) != len(ms2) {
klog.ErrorS(nil, "Node states are different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
klog.InfoS("Node states were different", "lengthNode1", len(ms1), "lengthNode2", len(ms2))
return false
}
for nodeID, nodeState1 := range ms1 {
nodeState2, ok := ms2[nodeID]
if !ok {
klog.ErrorS(nil, "Node state does not have node ID", "nodeID", nodeID)
klog.InfoS("Node state didn't have node ID", "nodeID", nodeID)
return false
}
if nodeState1.NumberOfAssignments != nodeState2.NumberOfAssignments {
klog.ErrorS(nil, "Node states number of assignments are different", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
klog.InfoS("Node state had a different number of memory assignments.", "assignment1", nodeState1.NumberOfAssignments, "assignment2", nodeState2.NumberOfAssignments)
return false
}
if !areGroupsEqual(nodeState1.Cells, nodeState2.Cells) {
klog.ErrorS(nil, "Node states groups are different", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
klog.InfoS("Node states had different groups", "stateNode1", nodeState1.Cells, "stateNode2", nodeState2.Cells)
return false
}
if len(nodeState1.MemoryMap) != len(nodeState2.MemoryMap) {
klog.ErrorS(nil, "Node states memory map have different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
klog.InfoS("Node state had memory maps of different lengths", "lengthNode1", len(nodeState1.MemoryMap), "lengthNode2", len(nodeState2.MemoryMap))
return false
}
for resourceName, memoryState1 := range nodeState1.MemoryMap {
memoryState2, ok := nodeState2.MemoryMap[resourceName]
if !ok {
klog.ErrorS(nil, "Memory state does not have resource", "resource", resourceName)
klog.InfoS("Memory state didn't have resource", "resource", resourceName)
return false
}
@ -701,11 +710,11 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
}
if tmpState1.Free != tmpState2.Free {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "free", "free1", tmpState1.Free, "free2", tmpState2.Free, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if tmpState1.Reserved != tmpState2.Reserved {
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("NUMA node and resource had different memory states", "node", nodeID, "resource", resourceName, "field", "reserved", "reserved1", tmpState1.Reserved, "reserved2", tmpState2.Reserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
}
@ -715,17 +724,17 @@ func areMachineStatesEqual(ms1, ms2 state.NUMANodeMap) bool {
func areMemoryStatesEqual(memoryState1, memoryState2 *state.MemoryTable, nodeID int, resourceName v1.ResourceName) bool {
if memoryState1.TotalMemSize != memoryState2.TotalMemSize {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "TotalMemSize", "TotalMemSize1", memoryState1.TotalMemSize, "TotalMemSize2", memoryState2.TotalMemSize, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.SystemReserved != memoryState2.SystemReserved {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "SystemReserved", "SystemReserved1", memoryState1.SystemReserved, "SystemReserved2", memoryState2.SystemReserved, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
if memoryState1.Allocatable != memoryState2.Allocatable {
klog.ErrorS(nil, "Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
klog.InfoS("Memory states for the NUMA node and resource are different", "node", nodeID, "resource", resourceName, "field", "Allocatable", "Allocatable1", memoryState1.Allocatable, "Allocatable2", memoryState2.Allocatable, "memoryState1", *memoryState1, "memoryState2", *memoryState2)
return false
}
return true

View File

@ -131,7 +131,7 @@ func (sc *stateCheckpoint) SetMachineState(memoryMap NUMANodeMap) {
sc.cache.SetMachineState(memoryMap)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -143,7 +143,7 @@ func (sc *stateCheckpoint) SetMemoryBlocks(podUID string, containerName string,
sc.cache.SetMemoryBlocks(podUID, containerName, blocks)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -155,7 +155,7 @@ func (sc *stateCheckpoint) SetMemoryAssignments(assignments ContainerMemoryAssig
sc.cache.SetMemoryAssignments(assignments)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}
@ -167,7 +167,7 @@ func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint", "podUID", podUID, "containerName", containerName)
}
}
@ -179,6 +179,6 @@ func (sc *stateCheckpoint) ClearState() {
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
klog.ErrorS(err, "Failed to store state to checkpoint")
}
}

View File

@ -94,6 +94,7 @@ func (s *stateMemory) SetMemoryAssignments(assignments ContainerMemoryAssignment
defer s.Unlock()
s.assignments = assignments.Clone()
klog.V(5).InfoS("Updated Memory assignments", "assignments", assignments)
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments

View File

@ -23,7 +23,7 @@ import (
"path"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
@ -55,6 +55,8 @@ type podContainerManagerImpl struct {
// cpuCFSQuotaPeriod is the cfs period value, cfs_period_us, setting per
// node for all containers in usec
cpuCFSQuotaPeriod uint64
// podContainerManager is the ContainerManager running on the machine
podContainerManager ContainerManager
}
// Make sure that podContainerManagerImpl implements the PodContainerManager interface
@ -73,6 +75,11 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
// check if container already exist
alreadyExists := m.Exists(pod)
if !alreadyExists {
enforceCPULimits := m.enforceCPULimits
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DisableCPUQuotaWithExclusiveCPUs) && m.podContainerManager.PodHasExclusiveCPUs(pod) {
klog.V(2).InfoS("Disabled CFS quota", "pod", klog.KObj(pod))
enforceCPULimits = false
}
enforceMemoryQoS := false
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) &&
libcontainercgroups.IsCgroup2UnifiedMode() {
@ -82,7 +89,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
podContainerName, _ := m.GetPodContainerName(pod)
containerConfig := &CgroupConfig{
Name: podContainerName,
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
ResourceParameters: ResourceConfigForPod(pod, enforceCPULimits, m.cpuCFSQuotaPeriod, enforceMemoryQoS),
}
if m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit

View File

@ -29,7 +29,7 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
units "github.com/docker/go-units"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"

View File

@ -45,7 +45,7 @@ func NewFakeManagerWithHint(hint *TopologyHint) Manager {
// NewFakeManagerWithPolicy returns an instance of fake topology manager with specified policy
func NewFakeManagerWithPolicy(policy Policy) Manager {
klog.InfoS("NewFakeManagerWithPolicy")
klog.InfoS("NewFakeManagerWithPolicy", "policy", policy.Name())
return &fakeManager{
policy: policy,
}

View File

@ -47,11 +47,11 @@ func CheckPolicyOptionAvailable(option string) error {
}
if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyAlphaOptions) {
return fmt.Errorf("Topology Manager Policy Alpha-level Options not enabled, but option %q provided", option)
return fmt.Errorf("topology manager policy alpha-level options not enabled, but option %q provided", option)
}
if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.TopologyManagerPolicyBetaOptions) {
return fmt.Errorf("Topology Manager Policy Beta-level Options not enabled, but option %q provided", option)
return fmt.Errorf("topology manager policy beta-level options not enabled, but option %q provided", option)
}
return nil

View File

@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
if !admit {
if IsAlignmentGuaranteed(s.policy) {
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
}
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
@ -63,6 +66,7 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
}
if IsAlignmentGuaranteed(s.policy) {
klog.V(4).InfoS("Resource alignment at container scope guaranteed", "pod", klog.KObj(pod))
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
}
}
@ -84,6 +88,6 @@ func (s *containerScope) accumulateProvidersHints(pod *v1.Pod, container *v1.Con
func (s *containerScope) calculateAffinity(pod *v1.Pod, container *v1.Container) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod, container)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("ContainerTopologyHint", "bestHint", bestHint)
klog.InfoS("ContainerTopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
return bestHint, admit
}

View File

@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
bestHint, admit := s.calculateAffinity(pod)
klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
if !admit {
if IsAlignmentGuaranteed(s.policy) {
// increment only if we know we allocate aligned resources.
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
}
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
return admission.GetPodAdmitResult(&TopologyAffinityError{})
}
@ -64,6 +68,7 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
}
if IsAlignmentGuaranteed(s.policy) {
// increment only if we know we allocate aligned resources.
klog.V(4).InfoS("Resource alignment at pod scope guaranteed", "pod", klog.KObj(pod))
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
}
return admission.GetPodAdmitResult(nil)
@ -84,6 +89,6 @@ func (s *podScope) accumulateProvidersHints(pod *v1.Pod) []map[string][]Topology
func (s *podScope) calculateAffinity(pod *v1.Pod) (TopologyHint, bool) {
providersHints := s.accumulateProvidersHints(pod)
bestHint, admit := s.policy.Merge(providersHints)
klog.InfoS("PodTopologyHint", "bestHint", bestHint)
klog.InfoS("PodTopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
return bestHint, admit
}

View File

@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
scope: scope,
}
manager.initializeMetrics()
return manager, nil
}
func (m *manager) initializeMetrics() {
// ensure the values exist
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
}
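Seeding every expected label combination with Add(0), as initializeMetrics does above, makes the zero-valued series exist from startup, so dashboards and rate() queries see an explicit 0 instead of an absent metric. A minimal sketch of the same idiom; it uses the plain client_golang API instead of the component-base wrappers purely to stay self-contained:

package main

import (
    "fmt"

    "github.com/prometheus/client_golang/prometheus"
)

func main() {
    aligned := prometheus.NewCounterVec(prometheus.CounterOpts{
        Name: "container_aligned_compute_resources_count",
        Help: "Aligned allocations by scope and boundary.",
    }, []string{"scope", "boundary"})

    reg := prometheus.NewRegistry()
    reg.MustRegister(aligned)

    // Touch every known label combination once so the zero-valued series
    // is exported immediately, before the first real increment happens.
    for _, scope := range []string{"container", "pod"} {
        aligned.WithLabelValues(scope, "numa_node").Add(0)
    }

    mfs, _ := reg.Gather()
    for _, mf := range mfs {
        fmt.Println(mf.GetName(), "series:", len(mf.GetMetric())) // 2 zero-valued series
    }
}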
func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
return m.scope.GetAffinity(podUID, containerName)
}
@ -212,11 +222,13 @@ func (m *manager) RemoveContainer(containerID string) error {
}
func (m *manager) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
klog.V(4).InfoS("Topology manager admission check", "pod", klog.KObj(attrs.Pod))
metrics.TopologyManagerAdmissionRequestsTotal.Inc()
startTime := time.Now()
podAdmitResult := m.scope.Admit(attrs.Pod)
metrics.TopologyManagerAdmissionDuration.Observe(float64(time.Since(startTime).Milliseconds()))
klog.V(4).InfoS("Pod Admit Result", "Message", podAdmitResult.Message, "pod", klog.KObj(attrs.Pod))
return podAdmitResult
}

View File

@ -21,7 +21,7 @@ import (
libcontainerutils "k8s.io/kubernetes/third_party/forked/libcontainer/utils"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
)
const (

View File

@ -456,6 +456,7 @@ func checkAndUpdatePod(existing, ref *v1.Pod) (needUpdate, needReconcile, needGr
existing.Labels = ref.Labels
existing.DeletionTimestamp = ref.DeletionTimestamp
existing.DeletionGracePeriodSeconds = ref.DeletionGracePeriodSeconds
existing.Generation = ref.Generation
existing.Status = ref.Status
updateAnnotations(existing, ref)

View File

@ -30,4 +30,5 @@ const (
KubeletPluginsDirSELinuxLabel = "system_u:object_r:container_file_t:s0"
KubeletContainersSharedSELinuxLabel = "system_u:object_r:container_file_t:s0"
DefaultKubeletCheckpointsDirName = "checkpoints"
DefaultKubeletUserNamespacesIDsPerPod = 65536
)

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package config implements the pod configuration readers.
package config // import "k8s.io/kubernetes/pkg/kubelet/config"
package config

View File

@ -396,6 +396,8 @@ func MakePortMappings(container *v1.Container) (ports []PortMapping) {
// HasAnyRegularContainerStarted returns true if any regular container has
// started, which indicates all init containers have been initialized.
// Deprecated: This function is not accurate when its pod sandbox is recreated.
// Use HasAnyActiveRegularContainerStarted instead.
func HasAnyRegularContainerStarted(spec *v1.PodSpec, statuses []v1.ContainerStatus) bool {
if len(statuses) == 0 {
return false
@ -417,3 +419,26 @@ func HasAnyRegularContainerStarted(spec *v1.PodSpec, statuses []v1.ContainerStat
return false
}
// HasAnyActiveRegularContainerStarted returns true if any regular container of
// the current pod sandbox has started, which indicates all init containers
// have been initialized.
func HasAnyActiveRegularContainerStarted(spec *v1.PodSpec, podStatus *PodStatus) bool {
if podStatus == nil {
return false
}
containerNames := sets.New[string]()
for _, c := range spec.Containers {
containerNames.Insert(c.Name)
}
for _, status := range podStatus.ActiveContainerStatuses {
if !containerNames.Has(status.Name) {
continue
}
return true
}
return false
}

View File

@ -33,6 +33,8 @@ import (
"k8s.io/client-go/util/flowcontrol"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/credentialprovider"
kubelettypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/volume"
)
@ -137,6 +139,9 @@ type Runtime interface {
ListPodSandboxMetrics(ctx context.Context) ([]*runtimeapi.PodSandboxMetrics, error)
// GetContainerStatus returns the status for the container.
GetContainerStatus(ctx context.Context, id ContainerID) (*Status, error)
// GetContainerSwapBehavior reports whether a container could be swappable.
// This is used to decide whether to handle InPlacePodVerticalScaling for containers.
GetContainerSwapBehavior(pod *v1.Pod, container *v1.Container) kubelettypes.SwapBehavior
}
// StreamingRuntime is the interface implemented by runtimes that handle the serving of the
@ -151,8 +156,11 @@ type StreamingRuntime interface {
// ImageService interfaces allows to work with image service.
type ImageService interface {
// PullImage pulls an image from the network to local storage using the supplied
// secrets if necessary. It returns a reference (digest or ID) to the pulled image.
PullImage(ctx context.Context, image ImageSpec, pullSecrets []v1.Secret, podSandboxConfig *runtimeapi.PodSandboxConfig) (string, error)
// secrets if necessary.
// It returns a reference (digest or ID) to the pulled image and the credentials
// that were used to pull the image. If the returned credentials are nil, the
// pull was anonymous.
PullImage(ctx context.Context, image ImageSpec, credentials []credentialprovider.TrackedAuthConfig, podSandboxConfig *runtimeapi.PodSandboxConfig) (string, *credentialprovider.TrackedAuthConfig, error)
// GetImageRef gets the reference (digest or ID) of the image which has already been in
// the local storage. It returns ("", nil) if the image isn't in the local storage.
GetImageRef(ctx context.Context, image ImageSpec) (string, error)
@ -317,6 +325,8 @@ type PodStatus struct {
IPs []string
// Status of containers in the pod.
ContainerStatuses []*Status
// Statuses of containers of the active sandbox in the pod.
ActiveContainerStatuses []*Status
// Status of the pod sandbox.
// Only for kuberuntime now, other runtime may keep it nil.
SandboxStatuses []*runtimeapi.PodSandboxStatus
@ -378,6 +388,8 @@ type Status struct {
User *ContainerUser
// Mounts are the volume mounts of the container
Mounts []Mount
// StopSignal is used to show the container's effective stop signal in the Status
StopSignal *v1.Signal
}
// ContainerUser represents user identity information
@ -472,6 +484,9 @@ type Mount struct {
Propagation runtimeapi.MountPropagation
// Image is set if an OCI volume as image ID or digest should get mounted (special case).
Image *runtimeapi.ImageSpec
// ImageSubPath is set if an image volume sub path should get mounted. This
// field is only required if the above Image is set.
ImageSubPath string
}
// ImageVolumes is a map of image specs by volume name.

View File

@ -45,6 +45,8 @@ var (
ErrConfigPodSandbox = errors.New("ConfigPodSandboxError")
// ErrKillPodSandbox returned when runtime failed to stop pod's sandbox.
ErrKillPodSandbox = errors.New("KillPodSandboxError")
// ErrResizePodInPlace returned when runtime failed to resize a pod.
ErrResizePodInPlace = errors.New("ResizePodInPlaceError")
)
// SyncAction indicates different kind of actions in SyncPod() and KillPod(). Now there are only actions
@ -68,6 +70,8 @@ const (
ConfigPodSandbox SyncAction = "ConfigPodSandbox"
// KillPodSandbox action
KillPodSandbox SyncAction = "KillPodSandbox"
// ResizePodInPlace action is included whenever any containers in the pod are resized without restart
ResizePodInPlace SyncAction = "ResizePodInPlace"
)
// SyncResult is the result of sync action.

View File

@ -61,6 +61,7 @@ const (
VolumeResizeFailed = "VolumeResizeFailed"
VolumeResizeSuccess = "VolumeResizeSuccessful"
FileSystemResizeFailed = "FileSystemResizeFailed"
VolumePermissionChangeInProgress = "VolumePermissionChangeInProgress"
FileSystemResizeSuccess = "FileSystemResizeSuccessful"
FailedMapVolume = "FailedMapVolume"
WarnAlreadyMountedVolume = "AlreadyMountedVolume"

View File

@ -16,4 +16,4 @@ limitations under the License.
// Package lifecycle contains handlers for pod lifecycle events and interfaces
// to integrate with kubelet admission, synchronization, and eviction of pods.
package lifecycle // import "k8s.io/kubernetes/pkg/kubelet/lifecycle"
package lifecycle

View File

@ -22,15 +22,16 @@ import (
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/featuregate"
"k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/scheduler"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
"k8s.io/utils/ptr"
)
const (
@ -52,6 +53,11 @@ const (
// than Always for some of its init containers.
InitContainerRestartPolicyForbidden = "InitContainerRestartPolicyForbidden"
// SupplementalGroupsPolicyNotSupported is used to denote that the pod was
// rejected admission to the node because the node does not support
// the pod's SupplementalGroupsPolicy.
SupplementalGroupsPolicyNotSupported = "SupplementalGroupsPolicyNotSupported"
// UnexpectedAdmissionError is used to denote that the pod was rejected
// admission to the node because of an error during admission that could not
// be categorized.
@ -135,25 +141,20 @@ func (w *predicateAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult
}
}
if rejectPodAdmissionBasedOnSupplementalGroupsPolicy(admitPod, node) {
message := fmt.Sprintf("SupplementalGroupsPolicy=%s is not supported in this node", v1.SupplementalGroupsPolicyStrict)
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
return PodAdmitResult{
Admit: false,
Reason: SupplementalGroupsPolicyNotSupported,
Message: message,
}
}
pods := attrs.OtherPods
nodeInfo := schedulerframework.NewNodeInfo(pods...)
nodeInfo.SetNode(node)
// TODO: Remove this after the SidecarContainers feature gate graduates to GA.
if !utilfeature.DefaultFeatureGate.Enabled(features.SidecarContainers) {
for _, c := range admitPod.Spec.InitContainers {
if podutil.IsRestartableInitContainer(&c) {
message := fmt.Sprintf("Init container %q may not have a non-default restartPolicy", c.Name)
klog.InfoS("Failed to admit pod", "pod", klog.KObj(admitPod), "message", message)
return PodAdmitResult{
Admit: false,
Reason: InitContainerRestartPolicyForbidden,
Message: message,
}
}
}
}
// ensure the node has enough plugin resources for that required in pods
if err = w.pluginResourceUpdateFunc(nodeInfo, attrs); err != nil {
message := fmt.Sprintf("Update plugin resources failed due to %v, which is unexpected.", err)
@ -272,6 +273,45 @@ func rejectPodAdmissionBasedOnOSField(pod *v1.Pod) bool {
return string(pod.Spec.OS.Name) != runtime.GOOS
}
// rejectPodAdmissionBasedOnSupplementalGroupsPolicy rejects pod only if
// - the feature is beta or above, and SupplementalGroupsPolicy=Strict is set in the pod
// - but, the node does not support the feature
//
// Note: while the feature is alpha or not yet released at the emulated version,
// the pod should be admitted for backward compatibility
func rejectPodAdmissionBasedOnSupplementalGroupsPolicy(pod *v1.Pod, node *v1.Node) bool {
admit, reject := false, true // just for readability
inUse := (pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SupplementalGroupsPolicy != nil)
if !inUse {
return admit
}
isBetaOrAbove := false
if featureSpec, ok := utilfeature.DefaultMutableFeatureGate.GetAll()[features.SupplementalGroupsPolicy]; ok {
isBetaOrAbove = (featureSpec.PreRelease == featuregate.Beta) || (featureSpec.PreRelease == featuregate.GA)
}
if !isBetaOrAbove {
return admit
}
featureSupportedOnNode := ptr.Deref(
ptr.Deref(node.Status.Features, v1.NodeFeatures{SupplementalGroupsPolicy: ptr.To(false)}).SupplementalGroupsPolicy,
false,
)
effectivePolicy := ptr.Deref(
pod.Spec.SecurityContext.SupplementalGroupsPolicy,
v1.SupplementalGroupsPolicyMerge,
)
if effectivePolicy == v1.SupplementalGroupsPolicyStrict && !featureSupportedOnNode {
return reject
}
return admit
}
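The nested ptr.Deref calls above are the usual way to read a doubly optional field with a default at each level: if node.Status.Features is nil, fall back to a struct whose flag defaults to false, and if the flag itself is nil, fall back to false. A compact illustration with made-up struct shapes standing in for the real v1.Node types:

package main

import (
    "fmt"

    "k8s.io/utils/ptr"
)

// features mimics the shape of an optional status block whose fields are
// themselves optional booleans, like node.Status.Features above.
type features struct {
    SupplementalGroupsPolicy *bool
}

type status struct {
    Features *features
}

func supported(s status) bool {
    // Outer Deref: if Features is nil, use a struct whose flag defaults to
    // false. Inner Deref: if the flag pointer is nil, use false.
    return ptr.Deref(
        ptr.Deref(s.Features, features{SupplementalGroupsPolicy: ptr.To(false)}).SupplementalGroupsPolicy,
        false,
    )
}

func main() {
    fmt.Println(supported(status{}))                      // false: whole block missing
    fmt.Println(supported(status{Features: &features{}})) // false: flag missing
    fmt.Println(supported(status{Features: &features{SupplementalGroupsPolicy: ptr.To(true)}})) // true
}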
func removeMissingExtendedResources(pod *v1.Pod, nodeInfo *schedulerframework.NodeInfo) *v1.Pod {
filterExtendedResources := func(containers []v1.Container) {
for i, c := range containers {

View File

@ -113,6 +113,7 @@ const (
CPUManagerPinningErrorsTotalKey = "cpu_manager_pinning_errors_total"
CPUManagerSharedPoolSizeMilliCoresKey = "cpu_manager_shared_pool_size_millicores"
CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"
CPUManagerAllocationPerNUMAKey = "cpu_manager_allocation_per_numa"
// Metrics to track the Memory manager behavior
MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"
@ -132,6 +133,7 @@ const (
// Metric for tracking alignment of compute resources
ContainerAlignedComputeResourcesNameKey = "container_aligned_compute_resources_count"
ContainerAlignedComputeResourcesFailureNameKey = "container_aligned_compute_resources_failure_count"
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
@ -149,9 +151,15 @@ const (
AlignedPhysicalCPU = "physical_cpu"
AlignedNUMANode = "numa_node"
AlignedUncoreCache = "uncore_cache"
// Metrics to track kubelet admission rejections.
AdmissionRejectionsTotalKey = "admission_rejections_total"
// Image Volume metrics
ImageVolumeRequestedTotalKey = "image_volume_requested_total"
ImageVolumeMountedSucceedTotalKey = "image_volume_mounted_succeed_total"
ImageVolumeMountedErrorsTotalKey = "image_volume_mounted_errors_total"
)
type imageSizeBucket struct {
@ -808,6 +816,17 @@ var (
},
)
// CPUManagerAllocationPerNUMA tracks the count of CPUs allocated per NUMA node
CPUManagerAllocationPerNUMA = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: KubeletSubsystem,
Name: CPUManagerAllocationPerNUMAKey,
Help: "Number of CPUs allocated per NUMA node",
StabilityLevel: metrics.ALPHA,
},
[]string{AlignedNUMANode},
)
// ContainerAlignedComputeResources reports the count of resources allocation which granted aligned resources, per alignment boundary
ContainerAlignedComputeResources = metrics.NewCounterVec(
&metrics.CounterOpts{
@ -818,7 +837,18 @@ var (
},
[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
)
// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
// ContainerAlignedComputeResourcesFailure reports the count of resources allocation attempts which failed to align resources, per alignment boundary
ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: ContainerAlignedComputeResourcesFailureNameKey,
Help: "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
StabilityLevel: metrics.ALPHA,
},
[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
)
MemoryManagerPinningRequestTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
@ -1008,6 +1038,36 @@ var (
},
[]string{"reason"},
)
// ImageVolumeRequestedTotal tracks the number of requested image volumes.
ImageVolumeRequestedTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: ImageVolumeRequestedTotalKey,
Help: "Number of requested image volumes.",
StabilityLevel: metrics.ALPHA,
},
)
// ImageVolumeMountedSucceedTotal tracks the number of successful image volume mounts.
ImageVolumeMountedSucceedTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: ImageVolumeMountedSucceedTotalKey,
Help: "Number of successful image volume mounts.",
StabilityLevel: metrics.ALPHA,
},
)
// ImageVolumeMountedErrorsTotal tracks the number of failed image volume mounts.
ImageVolumeMountedErrorsTotal = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: KubeletSubsystem,
Name: ImageVolumeMountedErrorsTotalKey,
Help: "Number of failed image volume mounts.",
StabilityLevel: metrics.ALPHA,
},
)
)
var registerMetrics sync.Once
@ -1078,7 +1138,9 @@ func Register(collectors ...metrics.StableCollector) {
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
legacyregistry.MustRegister(CPUManagerAllocationPerNUMA)
legacyregistry.MustRegister(ContainerAlignedComputeResources)
legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
@ -1107,6 +1169,12 @@ func Register(collectors ...metrics.StableCollector) {
}
legacyregistry.MustRegister(AdmissionRejectionsTotal)
if utilfeature.DefaultFeatureGate.Enabled(features.ImageVolume) {
legacyregistry.MustRegister(ImageVolumeRequestedTotal)
legacyregistry.MustRegister(ImageVolumeMountedSucceedTotal)
legacyregistry.MustRegister(ImageVolumeMountedErrorsTotal)
}
})
}
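// Illustrative sketch, not part of the vendored code: how callers might record the
// metrics added above, using the k8s.io/component-base/metrics Counter and GaugeVec
// APIs. The function and its arguments are hypothetical.
func recordExampleMetrics(mountSucceeded bool, cpusPerNUMANode map[string]float64) {
	ImageVolumeRequestedTotal.Inc()
	if mountSucceeded {
		ImageVolumeMountedSucceedTotal.Inc()
	} else {
		ImageVolumeMountedErrorsTotal.Inc()
	}
	// Report the current number of exclusively allocated CPUs per NUMA node,
	// keyed by the "numa_node" label.
	for numaNode, cpus := range cpusPerNUMANode {
		CPUManagerAllocationPerNUMA.WithLabelValues(numaNode).Set(cpus)
	}
}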

View File

@ -89,6 +89,7 @@ type PluginInfo struct {
UUID types.UID
Handler PluginHandler
Name string
Endpoint string
}
func (asw *actualStateOfWorld) AddPlugin(pluginInfo PluginInfo) error {

View File

@ -56,5 +56,5 @@ type PluginHandler interface {
RegisterPlugin(pluginName, endpoint string, versions []string, pluginClientTimeout *time.Duration) error
// DeRegisterPlugin is called once the pluginwatcher observes that the socket has
// been deleted.
DeRegisterPlugin(pluginName string)
DeRegisterPlugin(pluginName, endpoint string)
}
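// Illustrative sketch, not part of the vendored code: a hypothetical handler type
// adapted to the new two-argument signature, so it can tell which socket of a
// plugin that registered several endpoints under the same name was removed.
type exampleHandler struct {
	deregistered map[string][]string // plugin name -> endpoints observed deregistering
}

func (h *exampleHandler) DeRegisterPlugin(pluginName, endpoint string) {
	if h.deregistered == nil {
		h.deregistered = map[string][]string{}
	}
	h.deregistered[pluginName] = append(h.deregistered[pluginName], endpoint)
}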

25
e2e/vendor/k8s.io/kubernetes/pkg/kubelet/qos/doc.go generated vendored Normal file
View File

@ -0,0 +1,25 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package qos contains helper functions for quality of service.
// For each resource (memory, CPU) Kubelet supports three classes of containers.
// Memory guaranteed containers will receive the highest priority and will get all the resources
// they need.
// Burstable containers will be guaranteed their request and can "burst" and use more resources
// when available.
// Best-Effort containers, which don't specify a request, can use resources only if not being used
// by other pods.
package qos

View File

@ -0,0 +1,71 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package qos contains helper functions for quality of service.
// For each resource (memory, CPU) Kubelet supports three classes of containers.
// Memory guaranteed containers will receive the highest priority and will get all the resources
// they need.
// Burstable containers will be guaranteed their request and can "burst" and use more resources
// when available.
// Best-Effort containers, which don't specify a request, can use resources only if not being used
// by other pods.
package qos
import (
v1 "k8s.io/api/core/v1"
resourcehelper "k8s.io/component-helpers/resource"
)
// minRegularContainerMemory returns the minimum memory resource quantity
// across all regular containers in pod.Spec.Containers.
// It does not include initContainers (both restartable and non-restartable).
func minRegularContainerMemory(pod v1.Pod) int64 {
memoryValue := pod.Spec.Containers[0].Resources.Requests.Memory().Value()
for _, container := range pod.Spec.Containers[1:] {
if container.Resources.Requests.Memory().Value() < memoryValue {
memoryValue = container.Resources.Requests.Memory().Value()
}
}
return memoryValue
}
// remainingPodMemReqPerContainer calculates the remaining pod memory request per
// container by:
// 1. Taking the total pod memory requests
// 2. Subtracting total container memory requests from pod memory requests
// 3. Dividing the remainder by the number of containers.
// This gives us the additional memory request that is not allocated to any
// containers in the pod. This value will be divided equally among all containers to
// calculate the oom score adjustment.
// See https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2837-pod-level-resource-spec/README.md#oom-score-adjustment
// for more details.
func remainingPodMemReqPerContainer(pod *v1.Pod) int64 {
var remainingMemory int64
if pod.Spec.Resources.Requests.Memory().IsZero() {
return remainingMemory
}
numContainers := len(pod.Spec.Containers) + len(pod.Spec.InitContainers)
// Aggregated requests of all containers.
aggrContainerReqs := resourcehelper.AggregateContainerRequests(pod, resourcehelper.PodResourcesOptions{})
remainingMemory = pod.Spec.Resources.Requests.Memory().Value() - aggrContainerReqs.Memory().Value()
remainingMemoryPerContainer := remainingMemory / int64(numContainers)
return remainingMemoryPerContainer
}
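// Illustrative sketch, not part of the vendored code: with a pod-level memory
// request of 1Gi, aggregated container requests of 768Mi, and 2 containers, the
// remainder shared by each container is (1024Mi - 768Mi) / 2 = 128Mi.
func exampleRemainingPodMemReqPerContainer() int64 {
	const mi = int64(1024 * 1024)
	podLevelMemRequest := 1024 * mi
	aggregatedContainerMemRequests := 768 * mi
	numContainers := int64(2)
	return (podLevelMemRequest - aggregatedContainerMemRequests) / numContainers // 128Mi
}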

135
e2e/vendor/k8s.io/kubernetes/pkg/kubelet/qos/policy.go generated vendored Normal file
View File

@ -0,0 +1,135 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package qos
import (
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
resourcehelper "k8s.io/component-helpers/resource"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/types"
)
const (
// KubeletOOMScoreAdj is the OOM score adjustment for Kubelet
KubeletOOMScoreAdj int = -999
// KubeProxyOOMScoreAdj is the OOM score adjustment for kube-proxy
KubeProxyOOMScoreAdj int = -999
guaranteedOOMScoreAdj int = -997
besteffortOOMScoreAdj int = 1000
)
// GetContainerOOMScoreAdjust returns the amount by which the OOM score of all processes in the
// container should be adjusted.
// The OOM score of a process is the percentage of memory it consumes
// multiplied by 10 (barring exceptional cases) + a configurable quantity which is between -1000
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
// OOMScoreAdjust should be calculated based on the allocated resources, so the pod argument should
// contain the allocated resources in the spec.
func GetContainerOOMScoreAdjust(pod *v1.Pod, container *v1.Container, memoryCapacity int64) int {
if types.IsNodeCriticalPod(pod) {
// Only node critical pod should be the last to get killed.
return guaranteedOOMScoreAdj
}
switch v1qos.GetPodQOS(pod) {
case v1.PodQOSGuaranteed:
// Guaranteed containers should be the last to get killed.
return guaranteedOOMScoreAdj
case v1.PodQOSBestEffort:
return besteffortOOMScoreAdj
}
// Burstable containers are a middle tier, between Guaranteed and Best-Effort. Ideally,
// we want to protect Burstable containers that consume less memory than requested.
// The formula below is a heuristic. A container requesting 10% of a system's
// memory will have an OOM score adjust of 900. If a process in container Y
// uses over 10% of memory, its OOM score will be 1000. The idea is that containers
// which use more than their request will have an OOM score of 1000 and will be prime
// targets for OOM kills.
// Note that this is a heuristic; it won't work if a container has many small processes.
containerMemReq := container.Resources.Requests.Memory().Value()
var oomScoreAdjust, remainingReqPerContainer int64
// When PodLevelResources feature is enabled, the OOM score adjustment formula is modified
// to account for pod-level memory requests. Any extra pod memory request that's
// not allocated to the containers is divided equally among all containers and
// added to their individual memory requests when calculating the OOM score
// adjustment. Otherwise, only container-level memory requests are used. See
// https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/2837-pod-level-resource-spec/README.md#oom-score-adjustment
// for more details.
if utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources) &&
resourcehelper.IsPodLevelRequestsSet(pod) {
// TODO(ndixita): Refactor to use this formula in all cases, as
// remainingReqPerContainer will be 0 when pod-level resources are not set.
remainingReqPerContainer = remainingPodMemReqPerContainer(pod)
oomScoreAdjust = 1000 - (1000 * (containerMemReq + remainingReqPerContainer) / memoryCapacity)
} else {
oomScoreAdjust = 1000 - (1000*containerMemReq)/memoryCapacity
}
// Adapt the sidecar container's memory request for the OOM score adjustment calculation:
// use max(sidecar container memory request, min(regular container memory requests)).
if isSidecarContainer(pod, container) {
// check min memory quantity in regular containers
minMemoryRequest := minRegularContainerMemory(*pod)
// When calculating minMemoryOomScoreAdjust for sidecar containers with PodLevelResources enabled,
// we add the per-container share of unallocated pod memory requests to the minimum memory request.
// This ensures the OOM score adjustment i.e. minMemoryOomScoreAdjust
// calculation remains consistent
// with how we handle pod-level memory requests for regular containers.
if utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources) &&
resourcehelper.IsPodLevelRequestsSet(pod) {
minMemoryRequest += remainingReqPerContainer
}
minMemoryOomScoreAdjust := 1000 - (1000*minMemoryRequest)/memoryCapacity
// the OOM adjustment for sidecar container will match
// or fall below the OOM score adjustment of regular containers in the Pod.
if oomScoreAdjust > minMemoryOomScoreAdjust {
oomScoreAdjust = minMemoryOomScoreAdjust
}
}
// A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure
// that burstable pods have a higher OOM score adjustment.
if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) {
return (1000 + guaranteedOOMScoreAdj)
}
// Give burstable pods a higher chance of survival over besteffort pods.
if int(oomScoreAdjust) == besteffortOOMScoreAdj {
return int(oomScoreAdjust - 1)
}
return int(oomScoreAdjust)
}
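// Illustrative sketch, not part of the vendored code: the burstable heuristic with
// assumed numbers. A container requesting 1Gi on a node with 10Gi of memory gets an
// OOM score adjustment of 1000 - (1000*1Gi)/10Gi = 900.
func exampleBurstableOOMScoreAdjust() int64 {
	const gi = int64(1024 * 1024 * 1024)
	memoryCapacity := 10 * gi
	containerMemReq := 1 * gi
	return 1000 - (1000*containerMemReq)/memoryCapacity // 900
}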
// isSidecarContainer returns a boolean indicating whether a container is a sidecar or not.
// Since v1.Container does not directly specify whether a container is a sidecar,
// this function uses available indicators (container.RestartPolicy == v1.ContainerRestartPolicyAlways)
// to make that determination.
func isSidecarContainer(pod *v1.Pod, container *v1.Container) bool {
if container.RestartPolicy != nil && *container.RestartPolicy == v1.ContainerRestartPolicyAlways {
for _, initContainer := range pod.Spec.InitContainers {
if initContainer.Name == container.Name {
return true
}
}
}
return false
}
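// Illustrative sketch, not part of the vendored code: an init container that the
// helper above classifies as a sidecar because its restartPolicy is Always. The
// container name is hypothetical.
func exampleSidecarInitContainer() v1.Container {
	always := v1.ContainerRestartPolicyAlways
	return v1.Container{
		Name:          "log-shipper",
		RestartPolicy: &always,
	}
}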

View File

@ -1,98 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package status
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status/state"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start() {
klog.InfoS("Start()")
return
}
func (m *fakeManager) GetPodStatus(uid types.UID) (v1.PodStatus, bool) {
klog.InfoS("GetPodStatus()")
return v1.PodStatus{}, false
}
func (m *fakeManager) SetPodStatus(pod *v1.Pod, status v1.PodStatus) {
klog.InfoS("SetPodStatus()")
return
}
func (m *fakeManager) SetContainerReadiness(podUID types.UID, containerID kubecontainer.ContainerID, ready bool) {
klog.InfoS("SetContainerReadiness()")
return
}
func (m *fakeManager) SetContainerStartup(podUID types.UID, containerID kubecontainer.ContainerID, started bool) {
klog.InfoS("SetContainerStartup()")
return
}
func (m *fakeManager) TerminatePod(pod *v1.Pod) {
klog.InfoS("TerminatePod()")
return
}
func (m *fakeManager) RemoveOrphanedStatuses(podUIDs map[types.UID]bool) {
klog.InfoS("RemoveOrphanedStatuses()")
return
}
func (m *fakeManager) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool) {
klog.InfoS("GetContainerResourceAllocation()")
return m.state.GetContainerResourceAllocation(podUID, containerName)
}
func (m *fakeManager) GetPodResizeStatus(podUID types.UID) v1.PodResizeStatus {
return m.state.GetPodResizeStatus(string(podUID))
}
func (m *fakeManager) UpdatePodFromAllocation(pod *v1.Pod) (*v1.Pod, bool) {
allocs := m.state.GetPodResourceAllocation()
return updatePodFromAllocation(pod, allocs)
}
func (m *fakeManager) SetPodAllocation(pod *v1.Pod) error {
klog.InfoS("SetPodAllocation()")
for _, container := range pod.Spec.Containers {
alloc := *container.Resources.DeepCopy()
m.state.SetContainerResourceAllocation(string(pod.UID), container.Name, alloc)
}
return nil
}
func (m *fakeManager) SetPodResizeStatus(podUID types.UID, resizeStatus v1.PodResizeStatus) {
m.state.SetPodResizeStatus(string(podUID), resizeStatus)
}
// NewFakeManager creates empty/fake memory manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewStateMemory(),
}
}

View File

@ -43,19 +43,20 @@ const (
// GenerateContainersReadyCondition returns the status of "ContainersReady" condition.
// The status of "ContainersReady" condition is true when all containers are ready.
func GenerateContainersReadyCondition(spec *v1.PodSpec, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
func GenerateContainersReadyCondition(pod *v1.Pod, oldPodStatus *v1.PodStatus, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
// Find if all containers are ready or not.
if containerStatuses == nil {
return v1.PodCondition{
Type: v1.ContainersReady,
Status: v1.ConditionFalse,
Reason: UnknownContainerStatuses,
Type: v1.ContainersReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.ContainersReady),
Status: v1.ConditionFalse,
Reason: UnknownContainerStatuses,
}
}
unknownContainers := []string{}
unreadyContainers := []string{}
for _, container := range spec.InitContainers {
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
@ -69,7 +70,7 @@ func GenerateContainersReadyCondition(spec *v1.PodSpec, containerStatuses []v1.C
}
}
for _, container := range spec.Containers {
for _, container := range pod.Spec.Containers {
if containerStatus, ok := podutil.GetContainerStatus(containerStatuses, container.Name); ok {
if !containerStatus.Ready {
unreadyContainers = append(unreadyContainers, container.Name)
@ -81,12 +82,12 @@ func GenerateContainersReadyCondition(spec *v1.PodSpec, containerStatuses []v1.C
// If all containers are known and succeeded, just return PodCompleted.
if podPhase == v1.PodSucceeded && len(unknownContainers) == 0 {
return generateContainersReadyConditionForTerminalPhase(podPhase)
return generateContainersReadyConditionForTerminalPhase(pod, oldPodStatus, podPhase)
}
// If the pod phase is failed, explicitly set the ready condition to false for containers since they may be in progress of terminating.
if podPhase == v1.PodFailed {
return generateContainersReadyConditionForTerminalPhase(podPhase)
return generateContainersReadyConditionForTerminalPhase(pod, oldPodStatus, podPhase)
}
// Generate message for containers in unknown condition.
@ -100,38 +101,41 @@ func GenerateContainersReadyCondition(spec *v1.PodSpec, containerStatuses []v1.C
unreadyMessage := strings.Join(unreadyMessages, ", ")
if unreadyMessage != "" {
return v1.PodCondition{
Type: v1.ContainersReady,
Status: v1.ConditionFalse,
Reason: ContainersNotReady,
Message: unreadyMessage,
Type: v1.ContainersReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.ContainersReady),
Status: v1.ConditionFalse,
Reason: ContainersNotReady,
Message: unreadyMessage,
}
}
return v1.PodCondition{
Type: v1.ContainersReady,
Status: v1.ConditionTrue,
Type: v1.ContainersReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.ContainersReady),
Status: v1.ConditionTrue,
}
}
// GeneratePodReadyCondition returns "Ready" condition of a pod.
// The status of "Ready" condition is "True", if all containers in a pod are ready
// AND all matching conditions specified in the ReadinessGates have status equal to "True".
func GeneratePodReadyCondition(spec *v1.PodSpec, conditions []v1.PodCondition, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
containersReady := GenerateContainersReadyCondition(spec, containerStatuses, podPhase)
func GeneratePodReadyCondition(pod *v1.Pod, oldPodStatus *v1.PodStatus, conditions []v1.PodCondition, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
containersReady := GenerateContainersReadyCondition(pod, oldPodStatus, containerStatuses, podPhase)
// If the status of ContainersReady is not True, return the same status, reason and message as ContainersReady.
if containersReady.Status != v1.ConditionTrue {
return v1.PodCondition{
Type: v1.PodReady,
Status: containersReady.Status,
Reason: containersReady.Reason,
Message: containersReady.Message,
Type: v1.PodReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReady),
Status: containersReady.Status,
Reason: containersReady.Reason,
Message: containersReady.Message,
}
}
// Evaluate corresponding conditions specified in readiness gate
// Generate message if any readiness gate is not satisfied.
unreadyMessages := []string{}
for _, rg := range spec.ReadinessGates {
for _, rg := range pod.Spec.ReadinessGates {
_, c := podutil.GetPodConditionFromList(conditions, rg.ConditionType)
if c == nil {
unreadyMessages = append(unreadyMessages, fmt.Sprintf("corresponding condition of pod readiness gate %q does not exist.", string(rg.ConditionType)))
@ -144,16 +148,18 @@ func GeneratePodReadyCondition(spec *v1.PodSpec, conditions []v1.PodCondition, c
if len(unreadyMessages) != 0 {
unreadyMessage := strings.Join(unreadyMessages, ", ")
return v1.PodCondition{
Type: v1.PodReady,
Status: v1.ConditionFalse,
Reason: ReadinessGatesNotReady,
Message: unreadyMessage,
Type: v1.PodReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReady),
Status: v1.ConditionFalse,
Reason: ReadinessGatesNotReady,
Message: unreadyMessage,
}
}
return v1.PodCondition{
Type: v1.PodReady,
Status: v1.ConditionTrue,
Type: v1.PodReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReady),
Status: v1.ConditionTrue,
}
}
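// Illustrative sketch, not part of the vendored code: evaluating readiness against a
// pod's currently reported status, reusing that status as the "old" status so the
// observedGeneration handling stays consistent (this mirrors how
// NeedToReconcilePodReadiness calls the helper). The function name is hypothetical.
func examplePodReadyCondition(pod *v1.Pod) v1.PodCondition {
	return GeneratePodReadyCondition(pod, &pod.Status, pod.Status.Conditions, pod.Status.ContainerStatuses, pod.Status.Phase)
}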
@ -172,19 +178,20 @@ func isInitContainerInitialized(initContainer *v1.Container, containerStatus *v1
// GeneratePodInitializedCondition returns initialized condition if all init containers in a pod are ready, else it
// returns an uninitialized condition.
func GeneratePodInitializedCondition(spec *v1.PodSpec, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
func GeneratePodInitializedCondition(pod *v1.Pod, oldPodStatus *v1.PodStatus, containerStatuses []v1.ContainerStatus, podPhase v1.PodPhase) v1.PodCondition {
// Find if all containers are ready or not.
if containerStatuses == nil && len(spec.InitContainers) > 0 {
if containerStatuses == nil && len(pod.Spec.InitContainers) > 0 {
return v1.PodCondition{
Type: v1.PodInitialized,
Status: v1.ConditionFalse,
Reason: UnknownContainerStatuses,
Type: v1.PodInitialized,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodInitialized),
Status: v1.ConditionFalse,
Reason: UnknownContainerStatuses,
}
}
unknownContainers := []string{}
incompleteContainers := []string{}
for _, container := range spec.InitContainers {
for _, container := range pod.Spec.InitContainers {
containerStatus, ok := podutil.GetContainerStatus(containerStatuses, container.Name)
if !ok {
unknownContainers = append(unknownContainers, container.Name)
@ -198,9 +205,10 @@ func GeneratePodInitializedCondition(spec *v1.PodSpec, containerStatuses []v1.Co
// If all init containers are known and succeeded, just return PodCompleted.
if podPhase == v1.PodSucceeded && len(unknownContainers) == 0 {
return v1.PodCondition{
Type: v1.PodInitialized,
Status: v1.ConditionTrue,
Reason: PodCompleted,
Type: v1.PodInitialized,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodInitialized),
Status: v1.ConditionTrue,
Reason: PodCompleted,
}
}
@ -208,10 +216,11 @@ func GeneratePodInitializedCondition(spec *v1.PodSpec, containerStatuses []v1.Co
// been initialized before.
// This is needed to handle the case where the pod has been initialized but
// the restartable init containers are restarting.
if kubecontainer.HasAnyRegularContainerStarted(spec, containerStatuses) {
if kubecontainer.HasAnyRegularContainerStarted(&pod.Spec, containerStatuses) {
return v1.PodCondition{
Type: v1.PodInitialized,
Status: v1.ConditionTrue,
Type: v1.PodInitialized,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodInitialized),
Status: v1.ConditionTrue,
}
}
@ -225,20 +234,22 @@ func GeneratePodInitializedCondition(spec *v1.PodSpec, containerStatuses []v1.Co
unreadyMessage := strings.Join(unreadyMessages, ", ")
if unreadyMessage != "" {
return v1.PodCondition{
Type: v1.PodInitialized,
Status: v1.ConditionFalse,
Reason: ContainersNotInitialized,
Message: unreadyMessage,
Type: v1.PodInitialized,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodInitialized),
Status: v1.ConditionFalse,
Reason: ContainersNotInitialized,
Message: unreadyMessage,
}
}
return v1.PodCondition{
Type: v1.PodInitialized,
Status: v1.ConditionTrue,
Type: v1.PodInitialized,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodInitialized),
Status: v1.ConditionTrue,
}
}
func GeneratePodReadyToStartContainersCondition(pod *v1.Pod, podStatus *kubecontainer.PodStatus) v1.PodCondition {
func GeneratePodReadyToStartContainersCondition(pod *v1.Pod, oldPodStatus *v1.PodStatus, podStatus *kubecontainer.PodStatus) v1.PodCondition {
newSandboxNeeded, _, _ := runtimeutil.PodSandboxChanged(pod, podStatus)
// if a new sandbox does not need to be created for a pod, it indicates that
// a sandbox for the pod with networking configured already exists.
@ -246,20 +257,23 @@ func GeneratePodReadyToStartContainersCondition(pod *v1.Pod, podStatus *kubecont
// fresh sandbox and configure networking for the sandbox.
if !newSandboxNeeded {
return v1.PodCondition{
Type: v1.PodReadyToStartContainers,
Status: v1.ConditionTrue,
Type: v1.PodReadyToStartContainers,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReadyToStartContainers),
Status: v1.ConditionTrue,
}
}
return v1.PodCondition{
Type: v1.PodReadyToStartContainers,
Status: v1.ConditionFalse,
Type: v1.PodReadyToStartContainers,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReadyToStartContainers),
Status: v1.ConditionFalse,
}
}
func generateContainersReadyConditionForTerminalPhase(podPhase v1.PodPhase) v1.PodCondition {
func generateContainersReadyConditionForTerminalPhase(pod *v1.Pod, oldPodStatus *v1.PodStatus, podPhase v1.PodPhase) v1.PodCondition {
condition := v1.PodCondition{
Type: v1.ContainersReady,
Status: v1.ConditionFalse,
Type: v1.ContainersReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.ContainersReady),
Status: v1.ConditionFalse,
}
if podPhase == v1.PodFailed {
@ -271,10 +285,11 @@ func generateContainersReadyConditionForTerminalPhase(podPhase v1.PodPhase) v1.P
return condition
}
func generatePodReadyConditionForTerminalPhase(podPhase v1.PodPhase) v1.PodCondition {
func generatePodReadyConditionForTerminalPhase(pod *v1.Pod, oldPodStatus *v1.PodStatus, podPhase v1.PodPhase) v1.PodCondition {
condition := v1.PodCondition{
Type: v1.PodReady,
Status: v1.ConditionFalse,
Type: v1.PodReady,
ObservedGeneration: podutil.GetPodObservedGenerationIfEnabledOnCondition(oldPodStatus, pod.Generation, v1.PodReady),
Status: v1.ConditionFalse,
}
if podPhase == v1.PodFailed {

View File

@ -1,80 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
var _ checkpointmanager.Checkpoint = &Checkpoint{}
type PodResourceAllocationInfo struct {
AllocationEntries map[string]map[string]v1.ResourceRequirements `json:"allocationEntries,omitempty"`
}
// Checkpoint represents a structure to store pod resource allocation checkpoint data
type Checkpoint struct {
// Data is a serialized PodResourceAllocationInfo
Data string `json:"data"`
// Checksum is a checksum of Data
Checksum checksum.Checksum `json:"checksum"`
}
// NewCheckpoint creates a new checkpoint from a list of claim info states
func NewCheckpoint(allocations *PodResourceAllocationInfo) (*Checkpoint, error) {
serializedAllocations, err := json.Marshal(allocations)
if err != nil {
return nil, fmt.Errorf("failed to serialize allocations for checkpointing: %w", err)
}
cp := &Checkpoint{
Data: string(serializedAllocations),
}
cp.Checksum = checksum.New(cp.Data)
return cp, nil
}
func (cp *Checkpoint) MarshalCheckpoint() ([]byte, error) {
return json.Marshal(cp)
}
// UnmarshalCheckpoint unmarshals checkpoint from JSON
func (cp *Checkpoint) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that current checksum
// of checkpointed Data is valid
func (cp *Checkpoint) VerifyChecksum() error {
return cp.Checksum.Verify(cp.Data)
}
// GetPodResourceAllocationInfo returns Pod Resource Allocation info states from checkpoint
func (cp *Checkpoint) GetPodResourceAllocationInfo() (*PodResourceAllocationInfo, error) {
var data PodResourceAllocationInfo
if err := json.Unmarshal([]byte(cp.Data), &data); err != nil {
return nil, err
}
return &data, nil
}

View File

@ -1,60 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
v1 "k8s.io/api/core/v1"
)
// PodResourceAllocation type is used in tracking resources allocated to pod's containers
type PodResourceAllocation map[string]map[string]v1.ResourceRequirements
// PodResizeStatus type is used in tracking the last resize decision for pod
type PodResizeStatus map[string]v1.PodResizeStatus
// Clone returns a copy of PodResourceAllocation
func (pr PodResourceAllocation) Clone() PodResourceAllocation {
prCopy := make(PodResourceAllocation)
for pod := range pr {
prCopy[pod] = make(map[string]v1.ResourceRequirements)
for container, alloc := range pr[pod] {
prCopy[pod][container] = *alloc.DeepCopy()
}
}
return prCopy
}
// Reader interface used to read current pod resource allocation state
type Reader interface {
GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool)
GetPodResourceAllocation() PodResourceAllocation
GetPodResizeStatus(podUID string) v1.PodResizeStatus
}
type writer interface {
SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceRequirements) error
SetPodResourceAllocation(PodResourceAllocation) error
SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus)
Delete(podUID string, containerName string) error
ClearState() error
}
// State interface provides methods for tracking and setting pod resource allocation
type State interface {
Reader
writer
}

View File

@ -1,200 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"fmt"
"path"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ State = &stateCheckpoint{}
type stateCheckpoint struct {
mux sync.RWMutex
cache State
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
}
// NewStateCheckpoint creates new State for keeping track of pod resource allocations with checkpoint backend
func NewStateCheckpoint(stateDir, checkpointName string) (State, error) {
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager for pod allocation tracking: %v", err)
}
stateCheckpoint := &stateCheckpoint{
cache: NewStateMemory(),
checkpointManager: checkpointManager,
checkpointName: checkpointName,
}
if err := stateCheckpoint.restoreState(); err != nil {
//lint:ignore ST1005 user-facing error message
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete pod allocation checkpoint file %q before restarting Kubelet", err, path.Join(stateDir, checkpointName))
}
return stateCheckpoint, nil
}
// restores state from a checkpoint and creates it if it doesn't exist
func (sc *stateCheckpoint) restoreState() error {
sc.mux.Lock()
defer sc.mux.Unlock()
var err error
checkpoint, err := NewCheckpoint(nil)
if err != nil {
return fmt.Errorf("failed to create new checkpoint: %w", err)
}
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
if err == errors.ErrCheckpointNotFound {
return sc.storeState()
}
return err
}
praInfo, err := checkpoint.GetPodResourceAllocationInfo()
if err != nil {
return fmt.Errorf("failed to get pod resource allocation info: %w", err)
}
err = sc.cache.SetPodResourceAllocation(praInfo.AllocationEntries)
if err != nil {
return fmt.Errorf("failed to set pod resource allocation: %w", err)
}
klog.V(2).InfoS("State checkpoint: restored pod resource allocation state from checkpoint")
return nil
}
// saves state to a checkpoint, caller is responsible for locking
func (sc *stateCheckpoint) storeState() error {
podAllocation := sc.cache.GetPodResourceAllocation()
checkpoint, err := NewCheckpoint(&PodResourceAllocationInfo{
AllocationEntries: podAllocation,
})
if err != nil {
return fmt.Errorf("failed to create checkpoint: %w", err)
}
err = sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
if err != nil {
klog.ErrorS(err, "Failed to save pod allocation checkpoint")
return err
}
return nil
}
// GetContainerResourceAllocation returns current resources allocated to a pod's container
func (sc *stateCheckpoint) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool) {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetContainerResourceAllocation(podUID, containerName)
}
// GetPodResourceAllocation returns current pod resource allocation
func (sc *stateCheckpoint) GetPodResourceAllocation() PodResourceAllocation {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetPodResourceAllocation()
}
// GetPodResizeStatus returns the last resize decision for a pod
func (sc *stateCheckpoint) GetPodResizeStatus(podUID string) v1.PodResizeStatus {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetPodResizeStatus(podUID)
}
// SetContainerResourceAllocation sets resources allocated to a pod's container
func (sc *stateCheckpoint) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceRequirements) error {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetContainerResourceAllocation(podUID, containerName, alloc)
return sc.storeState()
}
// SetPodResourceAllocation sets pod resource allocation
func (sc *stateCheckpoint) SetPodResourceAllocation(a PodResourceAllocation) error {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetPodResourceAllocation(a)
return sc.storeState()
}
// SetPodResizeStatus sets the last resize decision for a pod
func (sc *stateCheckpoint) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetPodResizeStatus(podUID, resizeStatus)
}
// Delete deletes allocations for specified pod
func (sc *stateCheckpoint) Delete(podUID string, containerName string) error {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.Delete(podUID, containerName)
return sc.storeState()
}
// ClearState clears the state and saves it in a checkpoint
func (sc *stateCheckpoint) ClearState() error {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.ClearState()
return sc.storeState()
}
type noopStateCheckpoint struct{}
// NewNoopStateCheckpoint creates a dummy state checkpoint manager
func NewNoopStateCheckpoint() State {
return &noopStateCheckpoint{}
}
func (sc *noopStateCheckpoint) GetContainerResourceAllocation(_ string, _ string) (v1.ResourceRequirements, bool) {
return v1.ResourceRequirements{}, false
}
func (sc *noopStateCheckpoint) GetPodResourceAllocation() PodResourceAllocation {
return nil
}
func (sc *noopStateCheckpoint) GetPodResizeStatus(_ string) v1.PodResizeStatus {
return ""
}
func (sc *noopStateCheckpoint) SetContainerResourceAllocation(_ string, _ string, _ v1.ResourceRequirements) error {
return nil
}
func (sc *noopStateCheckpoint) SetPodResourceAllocation(_ PodResourceAllocation) error {
return nil
}
func (sc *noopStateCheckpoint) SetPodResizeStatus(_ string, _ v1.PodResizeStatus) {}
func (sc *noopStateCheckpoint) Delete(_ string, _ string) error {
return nil
}
func (sc *noopStateCheckpoint) ClearState() error {
return nil
}

View File

@ -1,128 +0,0 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
)
type stateMemory struct {
sync.RWMutex
podAllocation PodResourceAllocation
podResizeStatus PodResizeStatus
}
var _ State = &stateMemory{}
// NewStateMemory creates new State to track resources allocated to pods
func NewStateMemory() State {
klog.V(2).InfoS("Initialized new in-memory state store for pod resource allocation tracking")
return &stateMemory{
podAllocation: PodResourceAllocation{},
podResizeStatus: PodResizeStatus{},
}
}
func (s *stateMemory) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool) {
s.RLock()
defer s.RUnlock()
alloc, ok := s.podAllocation[podUID][containerName]
return *alloc.DeepCopy(), ok
}
func (s *stateMemory) GetPodResourceAllocation() PodResourceAllocation {
s.RLock()
defer s.RUnlock()
return s.podAllocation.Clone()
}
func (s *stateMemory) GetPodResizeStatus(podUID string) v1.PodResizeStatus {
s.RLock()
defer s.RUnlock()
return s.podResizeStatus[podUID]
}
func (s *stateMemory) SetContainerResourceAllocation(podUID string, containerName string, alloc v1.ResourceRequirements) error {
s.Lock()
defer s.Unlock()
if _, ok := s.podAllocation[podUID]; !ok {
s.podAllocation[podUID] = make(map[string]v1.ResourceRequirements)
}
s.podAllocation[podUID][containerName] = alloc
klog.V(3).InfoS("Updated container resource allocation", "podUID", podUID, "containerName", containerName, "alloc", alloc)
return nil
}
func (s *stateMemory) SetPodResourceAllocation(a PodResourceAllocation) error {
s.Lock()
defer s.Unlock()
s.podAllocation = a.Clone()
klog.V(3).InfoS("Updated pod resource allocation", "allocation", a)
return nil
}
func (s *stateMemory) SetPodResizeStatus(podUID string, resizeStatus v1.PodResizeStatus) {
s.Lock()
defer s.Unlock()
if resizeStatus != "" {
s.podResizeStatus[podUID] = resizeStatus
} else {
delete(s.podResizeStatus, podUID)
}
klog.V(3).InfoS("Updated pod resize state", "podUID", podUID, "resizeStatus", resizeStatus)
}
func (s *stateMemory) deleteContainer(podUID string, containerName string) {
delete(s.podAllocation[podUID], containerName)
if len(s.podAllocation[podUID]) == 0 {
delete(s.podAllocation, podUID)
delete(s.podResizeStatus, podUID)
}
klog.V(3).InfoS("Deleted pod resource allocation", "podUID", podUID, "containerName", containerName)
}
func (s *stateMemory) Delete(podUID string, containerName string) error {
s.Lock()
defer s.Unlock()
if len(containerName) == 0 {
delete(s.podAllocation, podUID)
delete(s.podResizeStatus, podUID)
klog.V(3).InfoS("Deleted pod resource allocation and resize state", "podUID", podUID)
return nil
}
s.deleteContainer(podUID, containerName)
return nil
}
func (s *stateMemory) ClearState() error {
s.Lock()
defer s.Unlock()
s.podAllocation = make(PodResourceAllocation)
s.podResizeStatus = make(PodResizeStatus)
klog.V(3).InfoS("Cleared state")
return nil
}

View File

@ -25,7 +25,7 @@ import (
"sync"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp" //nolint:depguard
clientset "k8s.io/client-go/kubernetes"
v1 "k8s.io/api/core/v1"
@ -40,15 +40,11 @@ import (
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/status/state"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
kubeutil "k8s.io/kubernetes/pkg/kubelet/util"
statusutil "k8s.io/kubernetes/pkg/util/pod"
)
// podStatusManagerStateFile is the file name where status manager stores its state
const podStatusManagerStateFile = "pod_status_manager_state"
// A wrapper around v1.PodStatus that includes a version to enforce that stale pod statuses are
// not sent to the API server.
type versionedPodStatus struct {
@ -72,19 +68,32 @@ type manager struct {
kubeClient clientset.Interface
podManager PodManager
// Map from pod UID to sync status of the corresponding pod.
podStatuses map[types.UID]versionedPodStatus
podStatusesLock sync.RWMutex
podStatusChannel chan struct{}
podStatuses map[types.UID]versionedPodStatus
podResizeConditions map[types.UID]podResizeConditions
podStatusesLock sync.RWMutex
podStatusChannel chan struct{}
// Map from (mirror) pod UID to latest status version successfully sent to the API server.
// apiStatusVersions must only be accessed from the sync thread.
apiStatusVersions map[kubetypes.MirrorPodUID]uint64
podDeletionSafety PodDeletionSafetyProvider
podStartupLatencyHelper PodStartupLatencyStateHelper
// state allows to save/restore pod resource allocation and tolerate kubelet restarts.
state state.State
// stateFileDirectory holds the directory where the state file for checkpoints is held.
stateFileDirectory string
}
type podResizeConditions struct {
PodResizePending *v1.PodCondition
PodResizeInProgress *v1.PodCondition
}
func (prc podResizeConditions) List() []*v1.PodCondition {
var conditions []*v1.PodCondition
if prc.PodResizePending != nil {
conditions = append(conditions, prc.PodResizePending)
}
if prc.PodResizeInProgress != nil {
conditions = append(conditions, prc.PodResizeInProgress)
}
return conditions
}
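// Illustrative sketch, not part of the vendored code: List returns the pending
// condition before the in-progress one and skips nil entries, so a pod with only a
// pending resize yields a single condition. The function name is hypothetical.
func examplePodResizeConditionsList() []*v1.PodCondition {
	prc := podResizeConditions{
		PodResizePending: &v1.PodCondition{Type: v1.PodResizePending, Status: v1.ConditionTrue},
	}
	return prc.List() // one element: the PodResizePending condition
}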
// PodManager is the subset of methods the manager needs to observe the actual state of the kubelet.
@ -143,42 +152,35 @@ type Manager interface {
// the provided podUIDs.
RemoveOrphanedStatuses(podUIDs map[types.UID]bool)
// GetPodResizeStatus returns cached PodStatus.Resize value
GetPodResizeStatus(podUID types.UID) v1.PodResizeStatus
// GetPodResizeConditions returns the cached resize conditions for the pod.
GetPodResizeConditions(podUID types.UID) []*v1.PodCondition
// SetPodResizeStatus caches the last resizing decision for the pod.
SetPodResizeStatus(podUID types.UID, resize v1.PodResizeStatus)
// SetPodResizePendingCondition caches the last PodResizePending condition for the pod.
SetPodResizePendingCondition(podUID types.UID, reason, message string)
allocationManager
}
// SetPodResizeInProgressCondition caches the last PodResizeInProgress condition for the pod.
SetPodResizeInProgressCondition(podUID types.UID, reason, message string, allowReasonToBeCleared bool)
// TODO(tallclair): Refactor allocation state handling out of the status manager.
type allocationManager interface {
// GetContainerResourceAllocation returns the checkpointed AllocatedResources value for the container
GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool)
// ClearPodResizePendingCondition clears the PodResizePending condition for the pod from the cache.
ClearPodResizePendingCondition(podUID types.UID)
// UpdatePodFromAllocation overwrites the pod spec with the allocation.
// This function does a deep copy only if updates are needed.
// Returns the updated (or original) pod, and whether there was an allocation stored.
UpdatePodFromAllocation(pod *v1.Pod) (*v1.Pod, bool)
// SetPodAllocation checkpoints the resources allocated to a pod's containers.
SetPodAllocation(pod *v1.Pod) error
// ClearPodResizeInProgressCondition clears the PodResizeInProgress condition for the pod from the cache.
ClearPodResizeInProgressCondition(podUID types.UID)
}
const syncPeriod = 10 * time.Second
// NewManager returns a functional Manager.
func NewManager(kubeClient clientset.Interface, podManager PodManager, podDeletionSafety PodDeletionSafetyProvider, podStartupLatencyHelper PodStartupLatencyStateHelper, stateFileDirectory string) Manager {
func NewManager(kubeClient clientset.Interface, podManager PodManager, podDeletionSafety PodDeletionSafetyProvider, podStartupLatencyHelper PodStartupLatencyStateHelper) Manager {
return &manager{
kubeClient: kubeClient,
podManager: podManager,
podStatuses: make(map[types.UID]versionedPodStatus),
podResizeConditions: make(map[types.UID]podResizeConditions),
podStatusChannel: make(chan struct{}, 1),
apiStatusVersions: make(map[kubetypes.MirrorPodUID]uint64),
podDeletionSafety: podDeletionSafety,
podStartupLatencyHelper: podStartupLatencyHelper,
stateFileDirectory: stateFileDirectory,
}
}
@ -188,34 +190,35 @@ func NewManager(kubeClient clientset.Interface, podManager PodManager, podDeleti
// changes will be ignored.
func isPodStatusByKubeletEqual(oldStatus, status *v1.PodStatus) bool {
oldCopy := oldStatus.DeepCopy()
newConditions := make(map[v1.PodConditionType]*v1.PodCondition, len(status.Conditions))
oldConditions := make(map[v1.PodConditionType]*v1.PodCondition, len(oldStatus.Conditions))
for _, c := range status.Conditions {
// both owned and shared conditions are used for kubelet status equality
if kubetypes.PodConditionByKubelet(c.Type) || kubetypes.PodConditionSharedByKubelet(c.Type) {
_, oc := podutil.GetPodCondition(oldCopy, c.Type)
if oc == nil || oc.Status != c.Status || oc.Message != c.Message || oc.Reason != c.Reason {
return false
}
newConditions[c.Type] = &c
}
}
for _, c := range oldStatus.Conditions {
if kubetypes.PodConditionByKubelet(c.Type) || kubetypes.PodConditionSharedByKubelet(c.Type) {
oldConditions[c.Type] = &c
}
}
if len(newConditions) != len(oldConditions) {
return false
}
for _, newCondition := range newConditions {
oldCondition := oldConditions[newCondition.Type]
if oldCondition == nil || oldCondition.Status != newCondition.Status || oldCondition.Message != newCondition.Message || oldCondition.Reason != newCondition.Reason {
return false
}
}
oldCopy.Conditions = status.Conditions
return apiequality.Semantic.DeepEqual(oldCopy, status)
}
func (m *manager) Start() {
// Initialize m.state to no-op state checkpoint manager
m.state = state.NewNoopStateCheckpoint()
// Create pod allocation checkpoint manager even if client is nil so as to allow local get/set of AllocatedResources & Resize
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
stateImpl, err := state.NewStateCheckpoint(m.stateFileDirectory, podStatusManagerStateFile)
if err != nil {
// This is a critical, non-recoverable failure.
klog.ErrorS(err, "Could not initialize pod allocation checkpoint manager, please drain node and remove policy state file")
panic(err)
}
m.state = stateImpl
}
// Don't start the status manager if we don't have a client. This will happen
// on the master, where the kubelet is responsible for bootstrapping the pods
// of the master components.
@ -244,72 +247,65 @@ func (m *manager) Start() {
}, 0)
}
// GetContainerResourceAllocation returns the last checkpointed AllocatedResources values
// If checkpoint manager has not been initialized, it returns nil, false
func (m *manager) GetContainerResourceAllocation(podUID string, containerName string) (v1.ResourceRequirements, bool) {
// GetPodResizeConditions returns the last cached resize conditions for the pod.
func (m *manager) GetPodResizeConditions(podUID types.UID) []*v1.PodCondition {
m.podStatusesLock.RLock()
defer m.podStatusesLock.RUnlock()
return m.state.GetContainerResourceAllocation(podUID, containerName)
return m.podResizeConditions[podUID].List()
}
// UpdatePodFromAllocation overwrites the pod spec with the allocation.
// This function does a deep copy only if updates are needed.
func (m *manager) UpdatePodFromAllocation(pod *v1.Pod) (*v1.Pod, bool) {
m.podStatusesLock.RLock()
defer m.podStatusesLock.RUnlock()
// TODO(tallclair): This clones the whole cache, but we only need 1 pod.
allocs := m.state.GetPodResourceAllocation()
return updatePodFromAllocation(pod, allocs)
}
// SetPodResizePendingCondition caches the last PodResizePending condition for the pod.
func (m *manager) SetPodResizePendingCondition(podUID types.UID, reason, message string) {
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()
func updatePodFromAllocation(pod *v1.Pod, allocs state.PodResourceAllocation) (*v1.Pod, bool) {
allocated, found := allocs[string(pod.UID)]
if !found {
return pod, false
m.podResizeConditions[podUID] = podResizeConditions{
PodResizePending: updatedPodResizeCondition(v1.PodResizePending, m.podResizeConditions[podUID].PodResizePending, reason, message),
PodResizeInProgress: m.podResizeConditions[podUID].PodResizeInProgress,
}
}
updated := false
for i, c := range pod.Spec.Containers {
if cAlloc, ok := allocated[c.Name]; ok {
if !apiequality.Semantic.DeepEqual(c.Resources, cAlloc) {
// Allocation differs from pod spec, update
if !updated {
// If this is the first update, copy the pod
pod = pod.DeepCopy()
updated = true
}
pod.Spec.Containers[i].Resources = cAlloc
// SetPodResizeInProgressCondition caches the last PodResizeInProgress condition for the pod.
func (m *manager) SetPodResizeInProgressCondition(podUID types.UID, reason, message string, allowReasonToBeCleared bool) {
oldConditions := m.GetPodResizeConditions(podUID)
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()
if !allowReasonToBeCleared && reason == "" && message == "" {
// Preserve the old reason and message if there is one.
for _, c := range oldConditions {
if c.Type == v1.PodResizeInProgress {
reason = c.Reason
message = c.Message
}
}
}
return pod, updated
}
// GetPodResizeStatus returns the last cached ResizeStatus value.
func (m *manager) GetPodResizeStatus(podUID types.UID) v1.PodResizeStatus {
m.podStatusesLock.RLock()
defer m.podStatusesLock.RUnlock()
return m.state.GetPodResizeStatus(string(podUID))
}
// SetPodAllocation checkpoints the resources allocated to a pod's containers
func (m *manager) SetPodAllocation(pod *v1.Pod) error {
m.podStatusesLock.RLock()
defer m.podStatusesLock.RUnlock()
for _, container := range pod.Spec.Containers {
alloc := *container.Resources.DeepCopy()
if err := m.state.SetContainerResourceAllocation(string(pod.UID), container.Name, alloc); err != nil {
return err
}
m.podResizeConditions[podUID] = podResizeConditions{
PodResizeInProgress: updatedPodResizeCondition(v1.PodResizeInProgress, m.podResizeConditions[podUID].PodResizeInProgress, reason, message),
PodResizePending: m.podResizeConditions[podUID].PodResizePending,
}
return nil
}
// SetPodResizeStatus checkpoints the last resizing decision for the pod.
func (m *manager) SetPodResizeStatus(podUID types.UID, resizeStatus v1.PodResizeStatus) {
m.podStatusesLock.RLock()
defer m.podStatusesLock.RUnlock()
m.state.SetPodResizeStatus(string(podUID), resizeStatus)
// ClearPodResizePendingCondition clears the PodResizePending condition for the pod from the cache.
func (m *manager) ClearPodResizePendingCondition(podUID types.UID) {
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()
m.podResizeConditions[podUID] = podResizeConditions{
PodResizePending: nil,
PodResizeInProgress: m.podResizeConditions[podUID].PodResizeInProgress,
}
}
// ClearPodResizeInProgressCondition clears the PodResizeInProgress condition for the pod from the cache.
func (m *manager) ClearPodResizeInProgressCondition(podUID types.UID) {
m.podStatusesLock.Lock()
defer m.podStatusesLock.Unlock()
m.podResizeConditions[podUID] = podResizeConditions{
PodResizePending: m.podResizeConditions[podUID].PodResizePending,
PodResizeInProgress: nil,
}
}
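// Illustrative sketch, not part of the vendored code: the expected lifecycle of the
// cached resize conditions for a single pod. The function name and the reason and
// message strings are hypothetical.
func exampleResizeConditionLifecycle(m Manager, uid types.UID) {
	m.SetPodResizePendingCondition(uid, "Deferred", "waiting for node capacity")
	m.ClearPodResizePendingCondition(uid)
	m.SetPodResizeInProgressCondition(uid, "", "", false)
	m.ClearPodResizeInProgressCondition(uid)
}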
func (m *manager) GetPodStatus(uid types.UID) (v1.PodStatus, bool) {
@ -326,6 +322,9 @@ func (m *manager) SetPodStatus(pod *v1.Pod, status v1.PodStatus) {
// Make sure we're caching a deep copy.
status = *status.DeepCopy()
// Set the observedGeneration for this pod status.
status.ObservedGeneration = podutil.GetPodObservedGenerationIfEnabled(pod)
// Force a status update if deletion timestamp is set. This is necessary
// because if the pod is in the non-running state, the pod worker still
// needs to be able to trigger an update and/or deletion.
@ -388,9 +387,10 @@ func (m *manager) SetContainerReadiness(podUID types.UID, containerID kubecontai
status.Conditions = append(status.Conditions, condition)
}
}
allContainerStatuses := append(status.InitContainerStatuses, status.ContainerStatuses...)
updateConditionFunc(v1.PodReady, GeneratePodReadyCondition(&pod.Spec, status.Conditions, allContainerStatuses, status.Phase))
updateConditionFunc(v1.ContainersReady, GenerateContainersReadyCondition(&pod.Spec, allContainerStatuses, status.Phase))
updateConditionFunc(v1.PodReady, GeneratePodReadyCondition(pod, &oldStatus.status, status.Conditions, allContainerStatuses, status.Phase))
updateConditionFunc(v1.ContainersReady, GenerateContainersReadyCondition(pod, &oldStatus.status, allContainerStatuses, status.Phase))
m.updateStatusInternal(pod, status, false, false)
}
@ -690,6 +690,11 @@ func (m *manager) updateStatusInternal(pod *v1.Pod, status v1.PodStatus, forceUp
status.StartTime = &now
}
// prevent sending unnecessary patches
if oldStatus.ObservedGeneration > status.ObservedGeneration {
status.ObservedGeneration = oldStatus.ObservedGeneration
}
normalizeStatus(pod, &status)
// Perform some more extensive logging of container termination state to assist in
@ -779,7 +784,7 @@ func (m *manager) deletePodStatus(uid types.UID) {
delete(m.podStatuses, uid)
m.podStartupLatencyHelper.DeletePodStartupState(uid)
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
m.state.Delete(string(uid), "")
delete(m.podResizeConditions, uid)
}
}
@ -792,7 +797,7 @@ func (m *manager) RemoveOrphanedStatuses(podUIDs map[types.UID]bool) {
klog.V(5).InfoS("Removing pod from status map.", "podUID", key)
delete(m.podStatuses, key)
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
m.state.Delete(string(key), "")
delete(m.podResizeConditions, key)
}
}
}
@ -905,7 +910,7 @@ func (m *manager) syncPod(uid types.UID, status versionedPodStatus) {
return
}
mergedStatus := mergePodStatus(pod.Status, status.status, m.podDeletionSafety.PodCouldHaveRunningContainers(pod))
mergedStatus := mergePodStatus(pod, pod.Status, status.status, m.podDeletionSafety.PodCouldHaveRunningContainers(pod))
newPod, patchBytes, unchanged, err := statusutil.PatchPodStatus(context.TODO(), m.kubeClient, pod.Namespace, pod.Name, pod.UID, pod.Status, mergedStatus)
klog.V(3).InfoS("Patch status for pod", "pod", klog.KObj(pod), "podUID", uid, "patch", string(patchBytes))
@ -1083,7 +1088,7 @@ func normalizeStatus(pod *v1.Pod, status *v1.PodStatus) *v1.PodStatus {
// mergePodStatus merges oldPodStatus and newPodStatus, preserving pod conditions
// not owned by the kubelet and ensuring that the transition to a terminal phase only
// happens after all running containers have terminated. This method does not modify the old status.
func mergePodStatus(oldPodStatus, newPodStatus v1.PodStatus, couldHaveRunningContainers bool) v1.PodStatus {
func mergePodStatus(pod *v1.Pod, oldPodStatus, newPodStatus v1.PodStatus, couldHaveRunningContainers bool) v1.PodStatus {
podConditions := make([]v1.PodCondition, 0, len(oldPodStatus.Conditions)+len(newPodStatus.Conditions))
for _, c := range oldPodStatus.Conditions {
@ -1145,10 +1150,10 @@ func mergePodStatus(oldPodStatus, newPodStatus v1.PodStatus, couldHaveRunningCon
// See https://issues.k8s.io/108594 for more details.
if podutil.IsPodPhaseTerminal(newPodStatus.Phase) {
if podutil.IsPodReadyConditionTrue(newPodStatus) || podutil.IsContainersReadyConditionTrue(newPodStatus) {
containersReadyCondition := generateContainersReadyConditionForTerminalPhase(newPodStatus.Phase)
containersReadyCondition := generateContainersReadyConditionForTerminalPhase(pod, &oldPodStatus, newPodStatus.Phase)
podutil.UpdatePodCondition(&newPodStatus, &containersReadyCondition)
podReadyCondition := generatePodReadyConditionForTerminalPhase(newPodStatus.Phase)
podReadyCondition := generatePodReadyConditionForTerminalPhase(pod, &oldPodStatus, newPodStatus.Phase)
podutil.UpdatePodCondition(&newPodStatus, &podReadyCondition)
}
}
@ -1161,7 +1166,7 @@ func NeedToReconcilePodReadiness(pod *v1.Pod) bool {
if len(pod.Spec.ReadinessGates) == 0 {
return false
}
podReadyCondition := GeneratePodReadyCondition(&pod.Spec, pod.Status.Conditions, pod.Status.ContainerStatuses, pod.Status.Phase)
podReadyCondition := GeneratePodReadyCondition(pod, &pod.Status, pod.Status.Conditions, pod.Status.ContainerStatuses, pod.Status.Phase)
i, curCondition := podutil.GetPodConditionFromList(pod.Status.Conditions, v1.PodReady)
// Only reconcile if "Ready" condition is present and Status or Message is not expected
if i >= 0 && (curCondition.Status != podReadyCondition.Status || curCondition.Message != podReadyCondition.Message) {
@ -1169,3 +1174,22 @@ func NeedToReconcilePodReadiness(pod *v1.Pod) bool {
}
return false
}
func updatedPodResizeCondition(conditionType v1.PodConditionType, oldCondition *v1.PodCondition, reason, message string) *v1.PodCondition {
now := metav1.NewTime(time.Now())
var lastTransitionTime metav1.Time
if oldCondition == nil || oldCondition.Reason != reason {
lastTransitionTime = now
} else {
lastTransitionTime = oldCondition.LastTransitionTime
}
return &v1.PodCondition{
Type: conditionType,
Status: v1.ConditionTrue,
LastProbeTime: now,
LastTransitionTime: lastTransitionTime,
Reason: reason,
Message: message,
}
}
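Aside, not part of this diff: the helper above refreshes LastProbeTime on every call but only moves LastTransitionTime when the reason changes. A small standalone sketch of that behaviour, with a simplified stand-in for v1.PodCondition:
package main

import (
    "fmt"
    "time"
)

// condition is a simplified stand-in for v1.PodCondition.
type condition struct {
    Reason             string
    LastProbeTime      time.Time
    LastTransitionTime time.Time
}

// updated mirrors the helper above: the transition time only moves when the
// reason changes, while the probe time is refreshed on every call.
func updated(old *condition, reason string, now time.Time) *condition {
    lastTransition := now
    if old != nil && old.Reason == reason {
        lastTransition = old.LastTransitionTime
    }
    return &condition{Reason: reason, LastProbeTime: now, LastTransitionTime: lastTransition}
}

func main() {
    t0 := time.Now()
    c := updated(nil, "InProgress", t0)
    c = updated(c, "InProgress", t0.Add(time.Second)) // same reason: transition time kept
    fmt.Println(c.LastTransitionTime.Equal(t0))       // true
    c = updated(c, "Error", t0.Add(2*time.Second)) // new reason: transition time resets
    fmt.Println(c.LastTransitionTime.Equal(t0))    // false
}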

View File

@ -32,7 +32,9 @@ const (
)
// SwapBehavior types
type SwapBehavior string
const (
LimitedSwap = "LimitedSwap"
NoSwap = "NoSwap"
LimitedSwap SwapBehavior = "LimitedSwap"
NoSwap SwapBehavior = "NoSwap"
)
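Aside, not part of this diff: switching these constants from untyped strings to the typed SwapBehavior lets callers accept and compare the type instead of raw strings. A short sketch with a hypothetical isValidSwapBehavior helper (not a kubelet API):
package main

import "fmt"

type SwapBehavior string

const (
    LimitedSwap SwapBehavior = "LimitedSwap"
    NoSwap      SwapBehavior = "NoSwap"
)

// isValidSwapBehavior is a hypothetical helper, not part of the kubelet; it
// shows how a typed constant lets callers switch over known values instead of
// comparing raw strings scattered through the code.
func isValidSwapBehavior(b SwapBehavior) bool {
    switch b {
    case LimitedSwap, NoSwap:
        return true
    default:
        return false
    }
}

func main() {
    fmt.Println(isValidSwapBehavior(LimitedSwap))         // true
    fmt.Println(isValidSwapBehavior(SwapBehavior("foo"))) // false
}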

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package types contains common types in the Kubelet.
package types // import "k8s.io/kubernetes/pkg/kubelet/types"
package types

View File

@ -28,6 +28,8 @@ var PodConditionsByKubelet = []v1.PodConditionType{
v1.PodReady,
v1.PodInitialized,
v1.ContainersReady,
v1.PodResizeInProgress,
v1.PodResizePending,
}
// PodConditionByKubelet returns if the pod condition type is owned by kubelet
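Aside, not part of this diff: the body of PodConditionByKubelet sits outside this hunk; presumably it scans the slice above. A standalone sketch of such a lookup, using local stand-in types rather than the real k8s.io/api ones:
package main

import "fmt"

type PodConditionType string

const (
    PodReady            PodConditionType = "Ready"
    PodResizeInProgress PodConditionType = "PodResizeInProgress"
    PodScheduled        PodConditionType = "PodScheduled"
)

var podConditionsByKubelet = []PodConditionType{PodReady, PodResizeInProgress}

// podConditionByKubelet reports whether the kubelet owns the condition type by
// scanning the slice, which is what the documented function is expected to do.
func podConditionByKubelet(t PodConditionType) bool {
    for _, c := range podConditionsByKubelet {
        if c == t {
            return true
        }
    }
    return false
}

func main() {
    fmt.Println(podConditionByKubelet(PodResizeInProgress)) // true
    fmt.Println(podConditionByKubelet(PodScheduled))        // false
}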

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package util holds utility functions.
package util // import "k8s.io/kubernetes/pkg/kubelet/util"
package util

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package store hosts a Store interface and its implementations.
package store // import "k8s.io/kubernetes/pkg/kubelet/util/store"
package store

View File

@ -20,7 +20,7 @@ limitations under the License.
package util
import (
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
libcontainercgroups "github.com/opencontainers/cgroups"
)
// IsCgroup2UnifiedMode returns true if the cgroup v2 unified mode is enabled
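Aside, not part of this diff: only the module path changes here, since the cgroups packages were split out of runc into github.com/opencontainers/cgroups. A minimal caller, assuming the split-out module keeps the IsCgroup2UnifiedMode helper it inherited from runc:
package main

import (
    "fmt"

    cgroups "github.com/opencontainers/cgroups"
)

func main() {
    // Same helper the kubelet wraps; only the import path differs from the
    // old github.com/opencontainers/runc/libcontainer/cgroups location.
    fmt.Println("cgroup v2 unified mode:", cgroups.IsCgroup2UnifiedMode())
}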

View File

@ -15,4 +15,4 @@ limitations under the License.
*/
// Package winstats provides a client to get node and pod level stats on windows
package winstats // import "k8s.io/kubernetes/pkg/kubelet/winstats"
package winstats