rebase: update K8s packages to v0.32.1

Update K8s packages in go.mod to v0.32.1

Signed-off-by: Praveen M <m.praveen@ibm.com>
Praveen M
2025-01-16 09:41:46 +05:30
committed by mergify[bot]
parent 5aef21ea4e
commit 7eb99fc6c9
2442 changed files with 273386 additions and 47788 deletions

View File

@@ -0,0 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
# Disable inheritance as this is an api owners file
options:
no_parent_owners: true
approvers:
- api-approvers
reviewers:
- sig-node-api-reviewers

View File

@@ -0,0 +1,20 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// +k8s:deepcopy-gen=package
// +groupName=kubelet.config.k8s.io
package config // import "k8s.io/kubernetes/pkg/kubelet/apis/config"

View File

@@ -0,0 +1,32 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
// KubeletConfigurationPathRefs returns pointers to all of the KubeletConfiguration fields that contain filepaths.
// You might use this, for example, to resolve all relative paths against some common root before
// passing the configuration to the application. This method must be kept up to date as new fields are added.
func KubeletConfigurationPathRefs(kc *KubeletConfiguration) []*string {
paths := []*string{}
paths = append(paths, &kc.StaticPodPath)
paths = append(paths, &kc.Authentication.X509.ClientCAFile)
paths = append(paths, &kc.TLSCertFile)
paths = append(paths, &kc.TLSPrivateKeyFile)
paths = append(paths, &kc.ResolverConfig)
paths = append(paths, &kc.VolumePluginDir)
paths = append(paths, &kc.PodLogsDir)
return paths
}
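A minimal usage sketch (not part of this change) showing how a caller might use KubeletConfigurationPathRefs to resolve all relative paths against a common root, as the comment above suggests; resolvePathsAgainst and rootDir are hypothetical names, and the snippet assumes the standard path/filepath package.

import "path/filepath"

// resolvePathsAgainst rewrites every relative path referenced by the
// configuration so that it is anchored at rootDir. Empty and absolute
// paths are left untouched.
func resolvePathsAgainst(kc *KubeletConfiguration, rootDir string) {
	for _, p := range KubeletConfigurationPathRefs(kc) {
		if *p != "" && !filepath.IsAbs(*p) {
			*p = filepath.Join(rootDir, *p)
		}
	}
}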

View File

@@ -0,0 +1,45 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)
// GroupName is the group name used in this package
const GroupName = "kubelet.config.k8s.io"
// SchemeGroupVersion is group version used to register these objects
var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: runtime.APIVersionInternal}
var (
// SchemeBuilder is the scheme builder with scheme init functions to run for this API package
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
// AddToScheme is a global function that registers this API group & version to a scheme
AddToScheme = SchemeBuilder.AddToScheme
)
// addKnownTypes registers known types to the given scheme
func addKnownTypes(scheme *runtime.Scheme) error {
scheme.AddKnownTypes(SchemeGroupVersion,
&KubeletConfiguration{},
&SerializedNodeConfigSource{},
&CredentialProviderConfig{},
)
return nil
}
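A brief sketch, assuming a separate consumer package, of how AddToScheme is typically used to make these internal types known to a runtime.Scheme before encoding, decoding, or conversion; newKubeletConfigScheme is a hypothetical helper name.

import (
	"k8s.io/apimachinery/pkg/runtime"

	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)

// newKubeletConfigScheme returns a scheme that knows about KubeletConfiguration,
// SerializedNodeConfigSource and CredentialProviderConfig.
func newKubeletConfigScheme() (*runtime.Scheme, error) {
	scheme := runtime.NewScheme()
	if err := kubeletconfig.AddToScheme(scheme); err != nil {
		return nil, err
	}
	return scheme, nil
}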

View File

@@ -0,0 +1,704 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
logsapi "k8s.io/component-base/logs/api/v1"
tracingapi "k8s.io/component-base/tracing/api/v1"
)
// HairpinMode denotes how the kubelet should configure networking to handle
// hairpin packets.
type HairpinMode string
// Enum settings for different ways to handle hairpin packets.
const (
// Set the hairpin flag on the veth of containers in the respective
// container runtime.
HairpinVeth = "hairpin-veth"
// Make the container bridge promiscuous. This will force it to accept
// hairpin packets, even if the flag isn't set on ports of the bridge.
PromiscuousBridge = "promiscuous-bridge"
// Neither of the above. If the kubelet is started in this hairpin mode
// and kube-proxy is running in iptables mode, hairpin packets will be
// dropped by the container bridge.
HairpinNone = "none"
)
// ResourceChangeDetectionStrategy denotes a mode in which internal
// managers (secret, configmap) are discovering object changes.
type ResourceChangeDetectionStrategy string
// Enum settings for different strategies of kubelet managers.
const (
// GetChangeDetectionStrategy is a mode in which kubelet fetches
// necessary objects directly from apiserver.
GetChangeDetectionStrategy ResourceChangeDetectionStrategy = "Get"
// TTLCacheChangeDetectionStrategy is a mode in which kubelet uses
// a TTL cache for objects directly fetched from the apiserver.
TTLCacheChangeDetectionStrategy ResourceChangeDetectionStrategy = "Cache"
// WatchChangeDetectionStrategy is a mode in which kubelet uses
// watches to observe changes to objects that are in its interest.
WatchChangeDetectionStrategy ResourceChangeDetectionStrategy = "Watch"
// RestrictedTopologyManagerPolicy is a mode in which kubelet only allows
// pods with optimal NUMA node alignment for requested resources
RestrictedTopologyManagerPolicy = "restricted"
// BestEffortTopologyManagerPolicy is a mode in which kubelet will favour
// pods with NUMA alignment of CPU and device resources.
BestEffortTopologyManagerPolicy = "best-effort"
// NoneTopologyManagerPolicy is a mode in which kubelet has no knowledge
// of NUMA alignment of a pod's CPU and device resources.
NoneTopologyManagerPolicy = "none"
// SingleNumaNodeTopologyManagerPolicy is a mode in which kubelet only allows
// pods with a single NUMA alignment of CPU and device resources.
SingleNumaNodeTopologyManagerPolicy = "single-numa-node"
// ContainerTopologyManagerScope represents that
// topology policy is applied on a per-container basis.
ContainerTopologyManagerScope = "container"
// PodTopologyManagerScope represents that
// topology policy is applied on a per-pod basis.
PodTopologyManagerScope = "pod"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// KubeletConfiguration contains the configuration for the Kubelet
type KubeletConfiguration struct {
metav1.TypeMeta
// enableServer enables Kubelet's secured server.
// Note: Kubelet's insecure port is controlled by the readOnlyPort option.
EnableServer bool
// staticPodPath is the path to the directory containing local (static) pods to
// run, or the path to a single static pod file.
StaticPodPath string
// podLogsDir is a custom root directory path kubelet will use to place pod's log files.
// Default: "/var/log/pods/"
// Note: it is not recommended to use the temp folder as a log directory as it may cause
// unexpected behavior in many places.
PodLogsDir string
// syncFrequency is the max period between synchronizing running
// containers and config
SyncFrequency metav1.Duration
// fileCheckFrequency is the duration between checking config files for
// new data
FileCheckFrequency metav1.Duration
// httpCheckFrequency is the duration between checking http for new data
HTTPCheckFrequency metav1.Duration
// staticPodURL is the URL for accessing static pods to run
StaticPodURL string
// staticPodURLHeader is a map of slices with HTTP headers to use when accessing the staticPodURL
StaticPodURLHeader map[string][]string `datapolicy:"token"`
// address is the IP address for the Kubelet to serve on (set to 0.0.0.0
// for all interfaces)
Address string
// port is the port for the Kubelet to serve on.
Port int32
// readOnlyPort is the read-only port for the Kubelet to serve on with
// no authentication/authorization (set to 0 to disable)
ReadOnlyPort int32
// volumePluginDir is the full path of the directory in which to search
// for additional third party volume plugins.
VolumePluginDir string
// providerID, if set, sets the unique id of the instance that an external provider (i.e. cloudprovider)
// can use to identify a specific node
ProviderID string
// tlsCertFile is the file containing x509 Certificate for HTTPS. (CA cert,
// if any, concatenated after server cert). If tlsCertFile and
// tlsPrivateKeyFile are not provided, a self-signed certificate
// and key are generated for the public address and saved to the directory
// passed to the Kubelet's --cert-dir flag.
TLSCertFile string
// tlsPrivateKeyFile is the file containing x509 private key matching tlsCertFile
TLSPrivateKeyFile string
// TLSCipherSuites is the list of allowed cipher suites for the server.
// Note that TLS 1.3 ciphersuites are not configurable.
// Values are from tls package constants (https://golang.org/pkg/crypto/tls/#pkg-constants).
TLSCipherSuites []string
// TLSMinVersion is the minimum TLS version supported.
// Values are from tls package constants (https://golang.org/pkg/crypto/tls/#pkg-constants).
TLSMinVersion string
// rotateCertificates enables client certificate rotation. The Kubelet will request a
// new certificate from the certificates.k8s.io API. This requires an approver to approve the
// certificate signing requests.
RotateCertificates bool
// serverTLSBootstrap enables server certificate bootstrap. Instead of self
// signing a serving certificate, the Kubelet will request a certificate from
// the certificates.k8s.io API. This requires an approver to approve the
// certificate signing requests. The RotateKubeletServerCertificate feature
// must be enabled.
ServerTLSBootstrap bool
// authentication specifies how requests to the Kubelet's server are authenticated
Authentication KubeletAuthentication
// authorization specifies how requests to the Kubelet's server are authorized
Authorization KubeletAuthorization
// registryPullQPS is the limit of registry pulls per second.
// Set to 0 for no limit.
RegistryPullQPS int32
// registryBurst is the maximum size of bursty pulls, temporarily allows
// pulls to burst to this number, while still not exceeding registryPullQPS.
// Only used if registryPullQPS > 0.
RegistryBurst int32
// eventRecordQPS is the maximum event creations per second. If 0, there
// is no limit enforced.
EventRecordQPS int32
// eventBurst is the maximum size of a burst of event creations, temporarily
// allows event creations to burst to this number, while still not exceeding
// eventRecordQPS. Only used if eventRecordQPS > 0.
EventBurst int32
// enableDebuggingHandlers enables server endpoints for log collection
// and local running of containers and commands
EnableDebuggingHandlers bool
// enableContentionProfiling enables block profiling, if enableDebuggingHandlers is true.
EnableContentionProfiling bool
// healthzPort is the port of the localhost healthz endpoint (set to 0 to disable)
HealthzPort int32
// healthzBindAddress is the IP address for the healthz server to serve on
HealthzBindAddress string
// oomScoreAdj is the oom-score-adj value for the kubelet process. Values
// must be within the range [-1000, 1000].
OOMScoreAdj int32
// clusterDomain is the DNS domain for this cluster. If set, kubelet will
// configure all containers to search this domain in addition to the
// host's search domains.
ClusterDomain string
// clusterDNS is a list of IP addresses for a cluster DNS server. If set,
// kubelet will configure all containers to use this for DNS resolution
// instead of the host's DNS servers.
ClusterDNS []string
// streamingConnectionIdleTimeout is the maximum time a streaming connection
// can be idle before the connection is automatically closed.
StreamingConnectionIdleTimeout metav1.Duration
// nodeStatusUpdateFrequency is the frequency that kubelet computes node
// status. If node lease feature is not enabled, it is also the frequency that
// kubelet posts node status to master. In that case, be cautious when
// changing it, because it must work with nodeMonitorGracePeriod in nodecontroller.
NodeStatusUpdateFrequency metav1.Duration
// nodeStatusReportFrequency is the frequency that kubelet posts node
// status to master if node status does not change. Kubelet will ignore this
// frequency and post node status immediately if any change is detected. It is
// only used when node lease feature is enabled.
NodeStatusReportFrequency metav1.Duration
// nodeLeaseDurationSeconds is the duration the Kubelet will set on its corresponding Lease.
NodeLeaseDurationSeconds int32
// ImageMinimumGCAge is the minimum age for an unused image before it is
// garbage collected.
ImageMinimumGCAge metav1.Duration
// ImageMaximumGCAge is the maximum age an image can be unused before it is garbage collected.
// The default of this field is "0s", which disables this field--meaning images won't be garbage
// collected based on being unused for too long.
ImageMaximumGCAge metav1.Duration
// imageGCHighThresholdPercent is the percent of disk usage after which
// image garbage collection is always run. The percent is calculated as
// this field value out of 100.
ImageGCHighThresholdPercent int32
// imageGCLowThresholdPercent is the percent of disk usage before which
// image garbage collection is never run. Lowest disk usage to garbage
// collect to. The percent is calculated as this field value out of 100.
ImageGCLowThresholdPercent int32
// How frequently to calculate and cache volume disk usage for all pods
VolumeStatsAggPeriod metav1.Duration
// KubeletCgroups is the absolute name of cgroups to isolate the kubelet in
KubeletCgroups string
// SystemCgroups is the absolute name of the cgroup in which to place
// all non-kernel processes that are not already in a container. Empty
// for no container. Rolling back the flag requires a reboot.
SystemCgroups string
// CgroupRoot is the root cgroup to use for pods.
// If CgroupsPerQOS is enabled, this is the root of the QoS cgroup hierarchy.
CgroupRoot string
// Enable QoS based Cgroup hierarchy: top level cgroups for QoS Classes
// And all Burstable and BestEffort pods are brought up under their
// specific top level QoS cgroup.
CgroupsPerQOS bool
// driver that the kubelet uses to manipulate cgroups on the host (cgroupfs or systemd)
CgroupDriver string
// SingleProcessOOMKill, if true, will prevent the `memory.oom.group` flag from being set for container
// cgroups in cgroups v2. This causes processes in the container to be OOM killed individually instead of as
// a group. It means that if true, the behavior aligns with the behavior of cgroups v1.
SingleProcessOOMKill *bool
// CPUManagerPolicy is the name of the policy to use.
// Requires the CPUManager feature gate to be enabled.
CPUManagerPolicy string
// CPUManagerPolicyOptions is a set of key=value pairs that allows setting extra options
// to fine-tune the behaviour of the CPU manager policies.
// Requires both the "CPUManager" and "CPUManagerPolicyOptions" feature gates to be enabled.
CPUManagerPolicyOptions map[string]string
// CPU Manager reconciliation period.
// Requires the CPUManager feature gate to be enabled.
CPUManagerReconcilePeriod metav1.Duration
// MemoryManagerPolicy is the name of the policy to use.
// Requires the MemoryManager feature gate to be enabled.
MemoryManagerPolicy string
// TopologyManagerPolicy is the name of the policy to use.
TopologyManagerPolicy string
// TopologyManagerScope represents the scope of topology hint generation
// that topology manager requests and hint providers generate.
// Default: "container"
// +optional
TopologyManagerScope string
// TopologyManagerPolicyOptions is a set of key=value pairs that allows setting extra options
// to fine-tune the behaviour of the topology manager policies.
// Requires both the "TopologyManager" and "TopologyManagerPolicyOptions" feature gates to be enabled.
TopologyManagerPolicyOptions map[string]string
// Map of QoS resource reservation percentages (memory only for now).
// Requires the QOSReserved feature gate to be enabled.
QOSReserved map[string]string
// runtimeRequestTimeout is the timeout for all runtime requests except long running
// requests - pull, logs, exec and attach.
RuntimeRequestTimeout metav1.Duration
// hairpinMode specifies how the Kubelet should configure the container
// bridge for hairpin packets.
// Setting this flag allows endpoints in a Service to loadbalance back to
// themselves if they should try to access their own Service. Values:
// "promiscuous-bridge": make the container bridge promiscuous.
// "hairpin-veth": set the hairpin flag on container veth interfaces.
// "none": do nothing.
// Generally, one must set --hairpin-mode=hairpin-veth to achieve hairpin NAT,
// because promiscuous-bridge assumes the existence of a container bridge named cbr0.
HairpinMode string
// maxPods is the number of pods that can run on this Kubelet.
MaxPods int32
// The CIDR to use for pod IP addresses, only used in standalone mode.
// In cluster mode, this is obtained from the master.
PodCIDR string
// The maximum number of processes per pod. If -1, the kubelet defaults to the node allocatable pid capacity.
PodPidsLimit int64
// ResolverConfig is the resolver configuration file used as the basis
// for the container DNS resolution configuration.
ResolverConfig string
// RunOnce causes the Kubelet to check the API server once for pods,
// run those in addition to the pods specified by static pod files, and exit.
// Deprecated: no longer has any effect.
RunOnce bool
// cpuCFSQuota enables CPU CFS quota enforcement for containers that
// specify CPU limits
CPUCFSQuota bool
// CPUCFSQuotaPeriod sets the CPU CFS quota period value, cpu.cfs_period_us, defaults to 100ms
CPUCFSQuotaPeriod metav1.Duration
// maxOpenFiles is the number of files that can be opened by the Kubelet process.
MaxOpenFiles int64
// nodeStatusMaxImages caps the number of images reported in Node.Status.Images.
NodeStatusMaxImages int32
// contentType is contentType of requests sent to apiserver.
ContentType string
// kubeAPIQPS is the QPS to use while talking with kubernetes apiserver
KubeAPIQPS int32
// kubeAPIBurst is the burst to allow while talking with kubernetes
// apiserver
KubeAPIBurst int32
// serializeImagePulls when enabled, tells the Kubelet to pull images one at a time.
SerializeImagePulls bool
// MaxParallelImagePulls sets the maximum number of image pulls in parallel.
MaxParallelImagePulls *int32
// Map of signal names to quantities that defines hard eviction thresholds. For example: {"memory.available": "300Mi"}.
// Some default signals are Linux only: nodefs.inodesFree
EvictionHard map[string]string
// Map of signal names to quantities that defines soft eviction thresholds. For example: {"memory.available": "300Mi"}.
EvictionSoft map[string]string
// Map of signal names to quantities that defines grace periods for each soft eviction signal. For example: {"memory.available": "30s"}.
EvictionSoftGracePeriod map[string]string
// Duration for which the kubelet has to wait before transitioning out of an eviction pressure condition.
EvictionPressureTransitionPeriod metav1.Duration
// Maximum allowed grace period (in seconds) to use when terminating pods in response to a soft eviction threshold being met.
EvictionMaxPodGracePeriod int32
// Map of signal names to quantities that defines minimum reclaims, which describe the minimum
// amount of a given resource the kubelet will reclaim when performing a pod eviction while
// that resource is under pressure. For example: {"imagefs.available": "2Gi"}
EvictionMinimumReclaim map[string]string
// podsPerCore is the maximum number of pods per core. Cannot exceed MaxPods.
// If 0, this field is ignored.
PodsPerCore int32
// enableControllerAttachDetach enables the Attach/Detach controller to
// manage attachment/detachment of volumes scheduled to this node, and
// prevents the kubelet from executing any attach/detach operations
EnableControllerAttachDetach bool
// protectKernelDefaults, if true, causes the Kubelet to error if kernel
// flags are not as it expects. Otherwise the Kubelet will attempt to modify
// kernel flags to match its expectation.
ProtectKernelDefaults bool
// If true, Kubelet creates the KUBE-IPTABLES-HINT chain in iptables as a hint to
// other components about the configuration of iptables on the system.
MakeIPTablesUtilChains bool
// iptablesMasqueradeBit formerly controlled the creation of the KUBE-MARK-MASQ
// chain.
// Deprecated: no longer has any effect.
IPTablesMasqueradeBit int32
// iptablesDropBit formerly controlled the creation of the KUBE-MARK-DROP chain.
// Deprecated: no longer has any effect.
IPTablesDropBit int32
// featureGates is a map of feature names to bools that enable or disable alpha/experimental
// features. This field modifies piecemeal the built-in default values from
// "k8s.io/kubernetes/pkg/features/kube_features.go".
FeatureGates map[string]bool
// Tells the Kubelet to fail to start if swap is enabled on the node.
FailSwapOn bool
// memorySwap configures swap memory available to container workloads.
// +featureGate=NodeSwap
// +optional
MemorySwap MemorySwapConfiguration
// A quantity defines the maximum size of the container log file before it is rotated. For example: "5Mi" or "256Ki".
ContainerLogMaxSize string
// Maximum number of container log files that can be present for a container.
ContainerLogMaxFiles int32
// Maximum number of concurrent log rotation workers to spawn for processing the log rotation
// requests
ContainerLogMaxWorkers int32
// Interval at which the container logs are monitored for rotation
ContainerLogMonitorInterval metav1.Duration
// ConfigMapAndSecretChangeDetectionStrategy is a mode in which config map and secret managers are running.
ConfigMapAndSecretChangeDetectionStrategy ResourceChangeDetectionStrategy
// A comma separated allowlist of unsafe sysctls or sysctl patterns (ending in `*`).
// Unsafe sysctl groups are `kernel.shm*`, `kernel.msg*`, `kernel.sem`, `fs.mqueue.*`, and `net.*`.
// These sysctls are namespaced but not allowed by default.
// For example: "`kernel.msg*,net.ipv4.route.min_pmtu`"
// +optional
AllowedUnsafeSysctls []string
// kernelMemcgNotification, if enabled, causes the kubelet to integrate with the kernel memcg
// notification to determine whether memory eviction thresholds are crossed, rather than polling.
KernelMemcgNotification bool
/* the following fields are meant for Node Allocatable */
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,ephemeral-storage=1G,pid=100) pairs
// that describe resources reserved for non-kubernetes components.
// Currently only cpu, memory and local ephemeral storage for root file system are supported.
// See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources for more detail.
SystemReserved map[string]string
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,ephemeral-storage=1G,pid=100) pairs
// that describe resources reserved for kubernetes system components.
// Currently only cpu, memory and local ephemeral storage for root file system are supported.
// See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources for more detail.
KubeReserved map[string]string
// This flag helps the kubelet identify the absolute name of the top-level cgroup used to enforce the `SystemReserved` compute resource reservation for OS system daemons.
// Refer to [Node Allocatable](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable) doc for more information.
SystemReservedCgroup string
// This flag helps the kubelet identify the absolute name of the top-level cgroup used to enforce the `KubeReserved` compute resource reservation for Kubernetes node system daemons.
// Refer to [Node Allocatable](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable) doc for more information.
KubeReservedCgroup string
// This flag specifies the various Node Allocatable enforcements that Kubelet needs to perform.
// This flag accepts a list of options. Acceptable options are `pods`, `system-reserved` & `kube-reserved`.
// Refer to [Node Allocatable](https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable) doc for more information.
EnforceNodeAllocatable []string
// This option specifies the CPU list reserved for host-level system threads and Kubernetes-related threads.
// This provides a "static" CPU list rather than the "dynamic" list provided by system-reserved and kube-reserved.
// This option overwrites CPUs provided by system-reserved and kube-reserved.
ReservedSystemCPUs string
// The previous version for which you want to show hidden metrics.
// Only the previous minor version is meaningful, other values will not be allowed.
// The format is <major>.<minor>, e.g.: '1.16'.
// The purpose of this format is to make sure you have the opportunity to notice if the next release hides additional metrics,
// rather than being surprised when they are permanently removed in the release after that.
ShowHiddenMetricsForVersion string
// Logging specifies the options of logging.
// Refer [Logs Options](https://github.com/kubernetes/component-base/blob/master/logs/options.go) for more information.
Logging logsapi.LoggingConfiguration
// EnableSystemLogHandler enables /logs handler.
EnableSystemLogHandler bool
// EnableSystemLogQuery enables the node log query feature on the /logs endpoint.
// EnableSystemLogHandler has to be enabled in addition for this feature to work.
// Enabling this feature has security implications. The recommendation is to enable it on an as-needed basis for
// debugging purposes and to disable it otherwise.
// +featureGate=NodeLogQuery
// +optional
EnableSystemLogQuery bool
// ShutdownGracePeriod specifies the total duration that the node should delay the shutdown and total grace period for pod termination during a node shutdown.
// Defaults to 0 seconds.
// +featureGate=GracefulNodeShutdown
// +optional
ShutdownGracePeriod metav1.Duration
// ShutdownGracePeriodCriticalPods specifies the duration used to terminate critical pods during a node shutdown. This should be less than ShutdownGracePeriod.
// Defaults to 0 seconds.
// For example, if ShutdownGracePeriod=30s, and ShutdownGracePeriodCriticalPods=10s, during a node shutdown the first 20 seconds would be reserved for gracefully terminating normal pods, and the last 10 seconds would be reserved for terminating critical pods.
// +featureGate=GracefulNodeShutdown
// +optional
ShutdownGracePeriodCriticalPods metav1.Duration
// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based
// on their associated priority class value.
// When a shutdown request is received, the Kubelet will initiate shutdown on all pods
// running on the node with a grace period that depends on the priority of the pod,
// and then wait for all pods to exit.
// Each entry in the array defines the graceful shutdown time for pods whose priority
// class value lies in the range between that entry's value and the value of the next
// higher entry in the list when the node is shutting down.
ShutdownGracePeriodByPodPriority []ShutdownGracePeriodByPodPriority
// ReservedMemory specifies a comma-separated list of memory reservations for NUMA nodes.
// The parameter makes sense only in the context of the memory manager feature. The memory manager will not allocate reserved memory for container workloads.
// For example, if you have a NUMA0 with 10Gi of memory and the ReservedMemory was specified to reserve 1Gi of memory at NUMA0,
// the memory manager will assume that only 9Gi is available for allocation.
// You can specify reservations for multiple NUMA nodes and memory types.
// You can omit this parameter entirely, but you should be aware that the amount of reserved memory from all NUMA nodes
// should be equal to the amount of memory specified by the node allocatable features (https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#node-allocatable).
// If at least one node allocatable parameter has a non-zero value, you will need to specify at least one NUMA node.
// Also, avoid specifying:
// 1. Duplicates: the same NUMA node and memory type, but with a different value.
// 2. Zero limits for any memory type.
// 3. NUMA node IDs that do not exist on the machine.
// 4. Memory types other than memory and hugepages-<size>.
ReservedMemory []MemoryReservation
// EnableProfilingHandler enables the /debug/pprof handler.
EnableProfilingHandler bool
// EnableDebugFlagsHandler enables the /debug/flags/v handler.
EnableDebugFlagsHandler bool
// SeccompDefault enables the use of `RuntimeDefault` as the default seccomp profile for all workloads.
SeccompDefault bool
// MemoryThrottlingFactor specifies the factor multiplied by the memory limit or node allocatable memory
// when setting the cgroupv2 memory.high value to enforce MemoryQoS.
// Decreasing this factor sets a lower memory.high limit for container cgroups and applies heavier reclaim pressure,
// while increasing it applies less reclaim pressure.
// See https://kep.k8s.io/2570 for more details.
// Default: 0.9
// +featureGate=MemoryQoS
// +optional
MemoryThrottlingFactor *float64
// registerWithTaints is an array of taints to add to a node object when
// the kubelet registers itself. This only takes effect when registerNode
// is true and upon the initial registration of the node.
// +optional
RegisterWithTaints []v1.Taint
// registerNode enables automatic registration with the apiserver.
// +optional
RegisterNode bool
// Tracing specifies the versioned configuration for OpenTelemetry tracing clients.
// See https://kep.k8s.io/2832 for more details.
// +featureGate=KubeletTracing
// +optional
Tracing *tracingapi.TracingConfiguration
// LocalStorageCapacityIsolation enables local ephemeral storage isolation feature. The default setting is true.
// This feature allows users to set request/limit for container's ephemeral storage and manage it in a similar way
// as cpu and memory. It also allows setting sizeLimit for emptyDir volume, which will trigger pod eviction if disk
// usage from the volume exceeds the limit.
// This feature depends on the capability of detecting correct root file system disk usage. For certain systems,
// such as kind rootless, if this capability cannot be supported, the feature LocalStorageCapacityIsolation should be
// disabled. Once disabled, user should not set request/limit for container's ephemeral storage, or sizeLimit for emptyDir.
// +optional
LocalStorageCapacityIsolation bool
// ContainerRuntimeEndpoint is the endpoint of container runtime.
// Unix domain sockets are supported on Linux, while npipe and TCP endpoints are supported on Windows.
// Examples: 'unix:///path/to/runtime.sock', 'npipe:////./pipe/runtime'
ContainerRuntimeEndpoint string
// ImageServiceEndpoint is the endpoint of container image service.
// If not specified, the default value is ContainerRuntimeEndpoint.
// +optional
ImageServiceEndpoint string
// FailCgroupV1 prevents the kubelet from starting on hosts
// that use cgroup v1. By default, this is set to 'false', meaning
// the kubelet is allowed to start on cgroup v1 hosts unless this
// option is explicitly enabled.
// +optional
FailCgroupV1 bool
// CrashLoopBackOff contains config to modify node-level parameters for
// container restart behavior
// +featureGate=KubeletCrashLoopBackoffMax
// +optional
CrashLoopBackOff CrashLoopBackOffConfig
}
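An illustrative sketch of populating a few of the fields above in Go; the values are arbitrary examples, not defaults introduced by this change, and the snippet assumes the metav1 and time imports.

kc := &KubeletConfiguration{
	Address:       "0.0.0.0",
	Port:          10250,
	MaxPods:       110,
	SyncFrequency: metav1.Duration{Duration: 1 * time.Minute},
	EvictionHard: map[string]string{
		"memory.available": "300Mi",
	},
}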
// KubeletAuthorizationMode denotes the authorization mode for the kubelet
type KubeletAuthorizationMode string
const (
// KubeletAuthorizationModeAlwaysAllow authorizes all authenticated requests
KubeletAuthorizationModeAlwaysAllow KubeletAuthorizationMode = "AlwaysAllow"
// KubeletAuthorizationModeWebhook uses the SubjectAccessReview API to determine authorization
KubeletAuthorizationModeWebhook KubeletAuthorizationMode = "Webhook"
)
// KubeletAuthorization holds the state related to authorization in the kubelet.
type KubeletAuthorization struct {
// mode is the authorization mode to apply to requests to the kubelet server.
// Valid values are AlwaysAllow and Webhook.
// Webhook mode uses the SubjectAccessReview API to determine authorization.
Mode KubeletAuthorizationMode
// webhook contains settings related to Webhook authorization.
Webhook KubeletWebhookAuthorization
}
// KubeletWebhookAuthorization holds the state related to the Webhook
// Authorization in the Kubelet.
type KubeletWebhookAuthorization struct {
// cacheAuthorizedTTL is the duration to cache 'authorized' responses from the webhook authorizer.
CacheAuthorizedTTL metav1.Duration
// cacheUnauthorizedTTL is the duration to cache 'unauthorized' responses from the webhook authorizer.
CacheUnauthorizedTTL metav1.Duration
}
// KubeletAuthentication holds the Kubelet authentication settings.
type KubeletAuthentication struct {
// x509 contains settings related to x509 client certificate authentication
X509 KubeletX509Authentication
// webhook contains settings related to webhook bearer token authentication
Webhook KubeletWebhookAuthentication
// anonymous contains settings related to anonymous authentication
Anonymous KubeletAnonymousAuthentication
}
// KubeletX509Authentication contains settings related to x509 client certificate authentication
type KubeletX509Authentication struct {
// clientCAFile is the path to a PEM-encoded certificate bundle. If set, any request presenting a client certificate
// signed by one of the authorities in the bundle is authenticated with a username corresponding to the CommonName,
// and groups corresponding to the Organization in the client certificate.
ClientCAFile string
}
// KubeletWebhookAuthentication contains settings related to webhook authentication
type KubeletWebhookAuthentication struct {
// enabled allows bearer token authentication backed by the tokenreviews.authentication.k8s.io API
Enabled bool
// cacheTTL enables caching of authentication results
CacheTTL metav1.Duration
}
// KubeletAnonymousAuthentication enables anonymous requests to the kubelet server.
type KubeletAnonymousAuthentication struct {
// enabled allows anonymous requests to the kubelet server.
// Requests that are not rejected by another authentication method are treated as anonymous requests.
// Anonymous requests have a username of system:anonymous, and a group name of system:unauthenticated.
Enabled bool
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// SerializedNodeConfigSource allows us to serialize NodeConfigSource
// This type is used internally by the Kubelet for tracking checkpointed dynamic configs.
// It exists in the kubeletconfig API group because it is classified as a versioned input to the Kubelet.
type SerializedNodeConfigSource struct {
metav1.TypeMeta
// Source is the source that we are serializing
// +optional
Source v1.NodeConfigSource
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// CredentialProviderConfig is the configuration containing information about
// each exec credential provider. Kubelet reads this configuration from disk and enables
// each provider as specified by the CredentialProvider type.
type CredentialProviderConfig struct {
metav1.TypeMeta
// providers is a list of credential provider plugins that will be enabled by the kubelet.
// Multiple providers may match against a single image, in which case credentials
// from all providers will be returned to the kubelet. If multiple providers are called
// for a single image, the results are combined. If providers return overlapping
// auth keys, the value from the provider earlier in this list is used.
Providers []CredentialProvider
}
// CredentialProvider represents an exec plugin to be invoked by the kubelet. The plugin is only
// invoked when an image being pulled matches the images handled by the plugin (see matchImages).
type CredentialProvider struct {
// name is the required name of the credential provider. It must match the name of the
// provider executable as seen by the kubelet. The executable must be in the kubelet's
// bin directory (set by the --credential-provider-bin-dir flag).
Name string
// matchImages is a required list of strings used to match against images in order to
// determine if this provider should be invoked. If one of the strings matches the
// requested image from the kubelet, the plugin will be invoked and given a chance
// to provide credentials. Images are expected to contain the registry domain
// and URL path.
//
// Each entry in matchImages is a pattern which can optionally contain a port and a path.
// Globs can be used in the domain, but not in the port or the path. Globs are supported
// as subdomains like `*.k8s.io` or `k8s.*.io`, and top-level-domains such as `k8s.*`.
// Matching partial subdomains like `app*.k8s.io` is also supported. Each glob can only match
// a single subdomain segment, so `*.io` does not match `*.k8s.io`.
//
// A match exists between an image and a matchImage when all of the below are true:
// - Both contain the same number of domain parts and each part matches.
// - The URL path of an imageMatch must be a prefix of the target image URL path.
// - If the imageMatch contains a port, then the port must match in the image as well.
//
// Example values of matchImages:
// - `123456789.dkr.ecr.us-east-1.amazonaws.com`
// - `*.azurecr.io`
// - `gcr.io`
// - `*.*.registry.io`
// - `registry.io:8080/path`
MatchImages []string
// defaultCacheDuration is the default duration the plugin will cache credentials in-memory
// if a cache duration is not provided in the plugin response. This field is required.
DefaultCacheDuration *metav1.Duration
// Required input version of the exec CredentialProviderRequest. The returned CredentialProviderResponse
// MUST use the same encoding version as the input. Current supported values are:
// - credentialprovider.kubelet.k8s.io/v1alpha1
// - credentialprovider.kubelet.k8s.io/v1beta1
// - credentialprovider.kubelet.k8s.io/v1
APIVersion string
// Arguments to pass to the command when executing it.
// +optional
Args []string
// Env defines additional environment variables to expose to the process. These
// are unioned with the host's environment, as well as variables client-go uses
// to pass arguments to the plugin.
// +optional
Env []ExecEnvVar
}
// ExecEnvVar is used for setting environment variables when executing an exec-based
// credential plugin.
type ExecEnvVar struct {
Name string
Value string
}
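An illustrative construction of a CredentialProviderConfig showing the matchImages patterns described above; the provider name, patterns, cache duration, and environment variable are example values only, and the snippet assumes the metav1 and time imports.

cfg := &CredentialProviderConfig{
	Providers: []CredentialProvider{{
		Name:                 "example-credential-provider", // hypothetical plugin binary name
		MatchImages:          []string{"*.dkr.ecr.*.amazonaws.com", "registry.io:8080/path"},
		DefaultCacheDuration: &metav1.Duration{Duration: 10 * time.Minute},
		APIVersion:           "credentialprovider.kubelet.k8s.io/v1",
		Env:                  []ExecEnvVar{{Name: "EXAMPLE_PROFILE", Value: "default"}},
	}},
}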
// MemoryReservation specifies the memory reservation of different types for each NUMA node
type MemoryReservation struct {
NumaNode int32
Limits v1.ResourceList
}
// ShutdownGracePeriodByPodPriority specifies the shutdown grace period for Pods based on their associated priority class value
type ShutdownGracePeriodByPodPriority struct {
// priority is the priority value associated with the shutdown grace period
Priority int32
// shutdownGracePeriodSeconds is the shutdown grace period in seconds
ShutdownGracePeriodSeconds int64
}
type MemorySwapConfiguration struct {
// swapBehavior configures swap memory available to container workloads. May be one of:
// "" or "NoSwap": workloads cannot use swap; this is the default.
// "LimitedSwap": workload swap usage is limited. The swap limit is proportionate to the container's memory request.
// +featureGate=NodeSwap
// +optional
SwapBehavior string
}
// CrashLoopBackOffConfig is used for setting configuration for this kubelet's
// container restart behavior
type CrashLoopBackOffConfig struct {
// MaxContainerRestartPeriod is the maximum duration the backoff delay can accrue
// to for container restarts, minimum 1 second, maximum 300 seconds.
// +featureGate=KubeletCrashLoopBackOffMax
// +optional
MaxContainerRestartPeriod *metav1.Duration
}

View File

@@ -0,0 +1,508 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package config
import (
corev1 "k8s.io/api/core/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
apiv1 "k8s.io/component-base/tracing/api/v1"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *CrashLoopBackOffConfig) DeepCopyInto(out *CrashLoopBackOffConfig) {
*out = *in
if in.MaxContainerRestartPeriod != nil {
in, out := &in.MaxContainerRestartPeriod, &out.MaxContainerRestartPeriod
*out = new(v1.Duration)
**out = **in
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CrashLoopBackOffConfig.
func (in *CrashLoopBackOffConfig) DeepCopy() *CrashLoopBackOffConfig {
if in == nil {
return nil
}
out := new(CrashLoopBackOffConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *CredentialProvider) DeepCopyInto(out *CredentialProvider) {
*out = *in
if in.MatchImages != nil {
in, out := &in.MatchImages, &out.MatchImages
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.DefaultCacheDuration != nil {
in, out := &in.DefaultCacheDuration, &out.DefaultCacheDuration
*out = new(v1.Duration)
**out = **in
}
if in.Args != nil {
in, out := &in.Args, &out.Args
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.Env != nil {
in, out := &in.Env, &out.Env
*out = make([]ExecEnvVar, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CredentialProvider.
func (in *CredentialProvider) DeepCopy() *CredentialProvider {
if in == nil {
return nil
}
out := new(CredentialProvider)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *CredentialProviderConfig) DeepCopyInto(out *CredentialProviderConfig) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.Providers != nil {
in, out := &in.Providers, &out.Providers
*out = make([]CredentialProvider, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CredentialProviderConfig.
func (in *CredentialProviderConfig) DeepCopy() *CredentialProviderConfig {
if in == nil {
return nil
}
out := new(CredentialProviderConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *CredentialProviderConfig) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ExecEnvVar) DeepCopyInto(out *ExecEnvVar) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExecEnvVar.
func (in *ExecEnvVar) DeepCopy() *ExecEnvVar {
if in == nil {
return nil
}
out := new(ExecEnvVar)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletAnonymousAuthentication) DeepCopyInto(out *KubeletAnonymousAuthentication) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletAnonymousAuthentication.
func (in *KubeletAnonymousAuthentication) DeepCopy() *KubeletAnonymousAuthentication {
if in == nil {
return nil
}
out := new(KubeletAnonymousAuthentication)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletAuthentication) DeepCopyInto(out *KubeletAuthentication) {
*out = *in
out.X509 = in.X509
out.Webhook = in.Webhook
out.Anonymous = in.Anonymous
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletAuthentication.
func (in *KubeletAuthentication) DeepCopy() *KubeletAuthentication {
if in == nil {
return nil
}
out := new(KubeletAuthentication)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletAuthorization) DeepCopyInto(out *KubeletAuthorization) {
*out = *in
out.Webhook = in.Webhook
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletAuthorization.
func (in *KubeletAuthorization) DeepCopy() *KubeletAuthorization {
if in == nil {
return nil
}
out := new(KubeletAuthorization)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletConfiguration) DeepCopyInto(out *KubeletConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.SyncFrequency = in.SyncFrequency
out.FileCheckFrequency = in.FileCheckFrequency
out.HTTPCheckFrequency = in.HTTPCheckFrequency
if in.StaticPodURLHeader != nil {
in, out := &in.StaticPodURLHeader, &out.StaticPodURLHeader
*out = make(map[string][]string, len(*in))
for key, val := range *in {
var outVal []string
if val == nil {
(*out)[key] = nil
} else {
in, out := &val, &outVal
*out = make([]string, len(*in))
copy(*out, *in)
}
(*out)[key] = outVal
}
}
if in.TLSCipherSuites != nil {
in, out := &in.TLSCipherSuites, &out.TLSCipherSuites
*out = make([]string, len(*in))
copy(*out, *in)
}
out.Authentication = in.Authentication
out.Authorization = in.Authorization
if in.ClusterDNS != nil {
in, out := &in.ClusterDNS, &out.ClusterDNS
*out = make([]string, len(*in))
copy(*out, *in)
}
out.StreamingConnectionIdleTimeout = in.StreamingConnectionIdleTimeout
out.NodeStatusUpdateFrequency = in.NodeStatusUpdateFrequency
out.NodeStatusReportFrequency = in.NodeStatusReportFrequency
out.ImageMinimumGCAge = in.ImageMinimumGCAge
out.ImageMaximumGCAge = in.ImageMaximumGCAge
out.VolumeStatsAggPeriod = in.VolumeStatsAggPeriod
if in.SingleProcessOOMKill != nil {
in, out := &in.SingleProcessOOMKill, &out.SingleProcessOOMKill
*out = new(bool)
**out = **in
}
if in.CPUManagerPolicyOptions != nil {
in, out := &in.CPUManagerPolicyOptions, &out.CPUManagerPolicyOptions
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
out.CPUManagerReconcilePeriod = in.CPUManagerReconcilePeriod
if in.TopologyManagerPolicyOptions != nil {
in, out := &in.TopologyManagerPolicyOptions, &out.TopologyManagerPolicyOptions
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.QOSReserved != nil {
in, out := &in.QOSReserved, &out.QOSReserved
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
out.RuntimeRequestTimeout = in.RuntimeRequestTimeout
out.CPUCFSQuotaPeriod = in.CPUCFSQuotaPeriod
if in.MaxParallelImagePulls != nil {
in, out := &in.MaxParallelImagePulls, &out.MaxParallelImagePulls
*out = new(int32)
**out = **in
}
if in.EvictionHard != nil {
in, out := &in.EvictionHard, &out.EvictionHard
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.EvictionSoft != nil {
in, out := &in.EvictionSoft, &out.EvictionSoft
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.EvictionSoftGracePeriod != nil {
in, out := &in.EvictionSoftGracePeriod, &out.EvictionSoftGracePeriod
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
out.EvictionPressureTransitionPeriod = in.EvictionPressureTransitionPeriod
if in.EvictionMinimumReclaim != nil {
in, out := &in.EvictionMinimumReclaim, &out.EvictionMinimumReclaim
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.FeatureGates != nil {
in, out := &in.FeatureGates, &out.FeatureGates
*out = make(map[string]bool, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
out.MemorySwap = in.MemorySwap
out.ContainerLogMonitorInterval = in.ContainerLogMonitorInterval
if in.AllowedUnsafeSysctls != nil {
in, out := &in.AllowedUnsafeSysctls, &out.AllowedUnsafeSysctls
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.SystemReserved != nil {
in, out := &in.SystemReserved, &out.SystemReserved
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.KubeReserved != nil {
in, out := &in.KubeReserved, &out.KubeReserved
*out = make(map[string]string, len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.EnforceNodeAllocatable != nil {
in, out := &in.EnforceNodeAllocatable, &out.EnforceNodeAllocatable
*out = make([]string, len(*in))
copy(*out, *in)
}
in.Logging.DeepCopyInto(&out.Logging)
out.ShutdownGracePeriod = in.ShutdownGracePeriod
out.ShutdownGracePeriodCriticalPods = in.ShutdownGracePeriodCriticalPods
if in.ShutdownGracePeriodByPodPriority != nil {
in, out := &in.ShutdownGracePeriodByPodPriority, &out.ShutdownGracePeriodByPodPriority
*out = make([]ShutdownGracePeriodByPodPriority, len(*in))
copy(*out, *in)
}
if in.ReservedMemory != nil {
in, out := &in.ReservedMemory, &out.ReservedMemory
*out = make([]MemoryReservation, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.MemoryThrottlingFactor != nil {
in, out := &in.MemoryThrottlingFactor, &out.MemoryThrottlingFactor
*out = new(float64)
**out = **in
}
if in.RegisterWithTaints != nil {
in, out := &in.RegisterWithTaints, &out.RegisterWithTaints
*out = make([]corev1.Taint, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.Tracing != nil {
in, out := &in.Tracing, &out.Tracing
*out = new(apiv1.TracingConfiguration)
(*in).DeepCopyInto(*out)
}
in.CrashLoopBackOff.DeepCopyInto(&out.CrashLoopBackOff)
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletConfiguration.
func (in *KubeletConfiguration) DeepCopy() *KubeletConfiguration {
if in == nil {
return nil
}
out := new(KubeletConfiguration)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *KubeletConfiguration) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletWebhookAuthentication) DeepCopyInto(out *KubeletWebhookAuthentication) {
*out = *in
out.CacheTTL = in.CacheTTL
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletWebhookAuthentication.
func (in *KubeletWebhookAuthentication) DeepCopy() *KubeletWebhookAuthentication {
if in == nil {
return nil
}
out := new(KubeletWebhookAuthentication)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletWebhookAuthorization) DeepCopyInto(out *KubeletWebhookAuthorization) {
*out = *in
out.CacheAuthorizedTTL = in.CacheAuthorizedTTL
out.CacheUnauthorizedTTL = in.CacheUnauthorizedTTL
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletWebhookAuthorization.
func (in *KubeletWebhookAuthorization) DeepCopy() *KubeletWebhookAuthorization {
if in == nil {
return nil
}
out := new(KubeletWebhookAuthorization)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeletX509Authentication) DeepCopyInto(out *KubeletX509Authentication) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeletX509Authentication.
func (in *KubeletX509Authentication) DeepCopy() *KubeletX509Authentication {
if in == nil {
return nil
}
out := new(KubeletX509Authentication)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemoryReservation) DeepCopyInto(out *MemoryReservation) {
*out = *in
if in.Limits != nil {
in, out := &in.Limits, &out.Limits
*out = make(corev1.ResourceList, len(*in))
for key, val := range *in {
(*out)[key] = val.DeepCopy()
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemoryReservation.
func (in *MemoryReservation) DeepCopy() *MemoryReservation {
if in == nil {
return nil
}
out := new(MemoryReservation)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *MemorySwapConfiguration) DeepCopyInto(out *MemorySwapConfiguration) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MemorySwapConfiguration.
func (in *MemorySwapConfiguration) DeepCopy() *MemorySwapConfiguration {
if in == nil {
return nil
}
out := new(MemorySwapConfiguration)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *SerializedNodeConfigSource) DeepCopyInto(out *SerializedNodeConfigSource) {
*out = *in
out.TypeMeta = in.TypeMeta
in.Source.DeepCopyInto(&out.Source)
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SerializedNodeConfigSource.
func (in *SerializedNodeConfigSource) DeepCopy() *SerializedNodeConfigSource {
if in == nil {
return nil
}
out := new(SerializedNodeConfigSource)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *SerializedNodeConfigSource) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ShutdownGracePeriodByPodPriority) DeepCopyInto(out *ShutdownGracePeriodByPodPriority) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ShutdownGracePeriodByPodPriority.
func (in *ShutdownGracePeriodByPodPriority) DeepCopy() *ShutdownGracePeriodByPodPriority {
if in == nil {
return nil
}
out := new(ShutdownGracePeriodByPodPriority)
in.DeepCopyInto(out)
return out
}

View File

@@ -0,0 +1,16 @@
---
dir: testing
filename: "{{.InterfaceName | snakecase}}.go"
boilerplate-file: ../../../../hack/boilerplate/boilerplate.generatego.txt
outpkg: testing
with-expecter: true
packages:
  k8s.io/kubernetes/pkg/kubelet/apis/podresources:
    interfaces:
      CPUsProvider:
        config:
          filename: cpus_provider.go
      DevicesProvider:
      DynamicResourcesProvider:
      MemoryProvider:
      PodsProvider:

View File

@@ -0,0 +1,72 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podresources
import (
"context"
"fmt"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/cri-client/pkg/util"
"k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)
// Note: Consumers of the pod resources API should not be importing this package.
// They should copy-paste these functions into their own project instead.
// GetV1alpha1Client returns a client for the PodResourcesLister grpc service
// Note: This is deprecated
func GetV1alpha1Client(socket string, connectionTimeout time.Duration, maxMsgSize int) (v1alpha1.PodResourcesListerClient, *grpc.ClientConn, error) {
addr, dialer, err := util.GetAddressAndDialer(socket)
if err != nil {
return nil, nil, err
}
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
defer cancel()
conn, err := grpc.DialContext(ctx, addr,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithContextDialer(dialer),
grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize)))
if err != nil {
return nil, nil, fmt.Errorf("error dialing socket %s: %v", socket, err)
}
return v1alpha1.NewPodResourcesListerClient(conn), conn, nil
}
// GetV1Client returns a client for the PodResourcesLister grpc service
func GetV1Client(socket string, connectionTimeout time.Duration, maxMsgSize int) (v1.PodResourcesListerClient, *grpc.ClientConn, error) {
addr, dialer, err := util.GetAddressAndDialer(socket)
if err != nil {
return nil, nil, err
}
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
defer cancel()
conn, err := grpc.DialContext(ctx, addr,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithContextDialer(dialer),
grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(maxMsgSize)))
if err != nil {
return nil, nil, fmt.Errorf("error dialing socket %s: %v", socket, err)
}
return v1.NewPodResourcesListerClient(conn), conn, nil
}
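// Illustrative usage sketch (not part of the upstream file; the socket path and
// sizes below are assumptions): a consumer that has copied this helper could
// list pod resources roughly as follows.
//
//	client, conn, err := GetV1Client("unix:///var/lib/kubelet/pod-resources/kubelet.sock", 10*time.Second, 16*1024*1024)
//	if err != nil {
//		return err
//	}
//	defer conn.Close()
//	resp, err := client.List(context.Background(), &v1.ListPodResourcesRequest{})
//	if err != nil {
//		return err
//	}
//	for _, pod := range resp.GetPodResources() {
//		fmt.Printf("%s/%s: %d containers\n", pod.GetNamespace(), pod.GetName(), len(pod.GetContainers()))
//	}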

View File

@ -0,0 +1,32 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podresources
const (
// Socket is the name of the podresources server socket
Socket = "kubelet"
// DefaultQPS is determined by empirically reviewing known consumers of the API.
// It is unlikely that there is a legitimate need to query podresources
// more than 100 times per second; the other subsystems are not guaranteed to react
// that fast in the first place.
DefaultQPS = 100
// DefaultBurstTokens is determined by empirically reviewing known consumers of the API.
// See the documentation of DefaultQPS, same caveats apply.
DefaultBurstTokens = 10
)
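// Illustrative sketch (an assumption, not part of the upstream file): a consumer
// could honor these defaults with a client-side limiter from golang.org/x/time/rate:
//
//	limiter := rate.NewLimiter(rate.Limit(DefaultQPS), DefaultBurstTokens)
//	if err := limiter.Wait(ctx); err != nil {
//		return err
//	}
//	resp, err := client.List(ctx, &v1.ListPodResourcesRequest{})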

View File

@ -0,0 +1,163 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podresources
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/metrics"
podresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
)
// v1PodResourcesServer implements PodResourcesListerServer
type v1PodResourcesServer struct {
podsProvider PodsProvider
devicesProvider DevicesProvider
cpusProvider CPUsProvider
memoryProvider MemoryProvider
dynamicResourcesProvider DynamicResourcesProvider
}
// NewV1PodResourcesServer returns a PodResourcesListerServer which lists pods provided by the PodsProvider
// with device information provided by the DevicesProvider
func NewV1PodResourcesServer(providers PodResourcesProviders) podresourcesv1.PodResourcesListerServer {
return &v1PodResourcesServer{
podsProvider: providers.Pods,
devicesProvider: providers.Devices,
cpusProvider: providers.Cpus,
memoryProvider: providers.Memory,
dynamicResourcesProvider: providers.DynamicResources,
}
}
// List returns information about the resources assigned to pods on the node
func (p *v1PodResourcesServer) List(ctx context.Context, req *podresourcesv1.ListPodResourcesRequest) (*podresourcesv1.ListPodResourcesResponse, error) {
metrics.PodResourcesEndpointRequestsTotalCount.WithLabelValues("v1").Inc()
metrics.PodResourcesEndpointRequestsListCount.WithLabelValues("v1").Inc()
pods := p.podsProvider.GetPods()
podResources := make([]*podresourcesv1.PodResources, len(pods))
p.devicesProvider.UpdateAllocatedDevices()
for i, pod := range pods {
pRes := podresourcesv1.PodResources{
Name: pod.Name,
Namespace: pod.Namespace,
Containers: make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.Containers)),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SidecarContainers) {
pRes.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
pRes.Containers = append(pRes.Containers, p.getContainerResources(pod, &container))
}
}
for _, container := range pod.Spec.Containers {
pRes.Containers = append(pRes.Containers, p.getContainerResources(pod, &container))
}
podResources[i] = &pRes
}
response := &podresourcesv1.ListPodResourcesResponse{
PodResources: podResources,
}
return response, nil
}
// GetAllocatableResources returns information about all the resources known to the server - this is more like the capacity, not the current amount of free resources.
func (p *v1PodResourcesServer) GetAllocatableResources(ctx context.Context, req *podresourcesv1.AllocatableResourcesRequest) (*podresourcesv1.AllocatableResourcesResponse, error) {
metrics.PodResourcesEndpointRequestsTotalCount.WithLabelValues("v1").Inc()
metrics.PodResourcesEndpointRequestsGetAllocatableCount.WithLabelValues("v1").Inc()
response := &podresourcesv1.AllocatableResourcesResponse{
Devices: p.devicesProvider.GetAllocatableDevices(),
CpuIds: p.cpusProvider.GetAllocatableCPUs(),
Memory: p.memoryProvider.GetAllocatableMemory(),
}
return response, nil
}
// Get returns information about the resources assigned to a specific pod
func (p *v1PodResourcesServer) Get(ctx context.Context, req *podresourcesv1.GetPodResourcesRequest) (*podresourcesv1.GetPodResourcesResponse, error) {
metrics.PodResourcesEndpointRequestsTotalCount.WithLabelValues("v1").Inc()
metrics.PodResourcesEndpointRequestsGetCount.WithLabelValues("v1").Inc()
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletPodResourcesGet) {
metrics.PodResourcesEndpointErrorsGetCount.WithLabelValues("v1").Inc()
return nil, fmt.Errorf("PodResources API Get method disabled")
}
pod, exist := p.podsProvider.GetPodByName(req.PodNamespace, req.PodName)
if !exist {
metrics.PodResourcesEndpointErrorsGetCount.WithLabelValues("v1").Inc()
return nil, fmt.Errorf("pod %s in namespace %s not found", req.PodName, req.PodNamespace)
}
podResources := &podresourcesv1.PodResources{
Name: pod.Name,
Namespace: pod.Namespace,
Containers: make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.Containers)),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SidecarContainers) {
podResources.Containers = make([]*podresourcesv1.ContainerResources, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
for _, container := range pod.Spec.InitContainers {
if !podutil.IsRestartableInitContainer(&container) {
continue
}
podResources.Containers = append(podResources.Containers, p.getContainerResources(pod, &container))
}
}
for _, container := range pod.Spec.Containers {
podResources.Containers = append(podResources.Containers, p.getContainerResources(pod, &container))
}
response := &podresourcesv1.GetPodResourcesResponse{
PodResources: podResources,
}
return response, nil
}
func (p *v1PodResourcesServer) getContainerResources(pod *v1.Pod, container *v1.Container) *podresourcesv1.ContainerResources {
containerResources := &podresourcesv1.ContainerResources{
Name: container.Name,
Devices: p.devicesProvider.GetDevices(string(pod.UID), container.Name),
CpuIds: p.cpusProvider.GetCPUs(string(pod.UID), container.Name),
Memory: p.memoryProvider.GetMemory(string(pod.UID), container.Name),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.KubeletPodResourcesDynamicResources) {
containerResources.DynamicResources = p.dynamicResourcesProvider.GetDynamicResources(pod, container)
}
return containerResources
}

View File

@ -0,0 +1,82 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podresources
import (
"context"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)
// v1alpha1PodResourcesServer implements PodResourcesListerServer
type v1alpha1PodResourcesServer struct {
podsProvider PodsProvider
devicesProvider DevicesProvider
}
// NewV1alpha1PodResourcesServer returns a PodResourcesListerServer which lists pods provided by the PodsProvider
// with device information provided by the DevicesProvider
func NewV1alpha1PodResourcesServer(providers PodResourcesProviders) v1alpha1.PodResourcesListerServer {
return &v1alpha1PodResourcesServer{
podsProvider: providers.Pods,
devicesProvider: providers.Devices,
}
}
func v1DevicesToAlphaV1(alphaDevs []*v1.ContainerDevices) []*v1alpha1.ContainerDevices {
var devs []*v1alpha1.ContainerDevices
for _, alphaDev := range alphaDevs {
dev := v1alpha1.ContainerDevices{
ResourceName: alphaDev.ResourceName,
DeviceIds: alphaDev.DeviceIds,
}
devs = append(devs, &dev)
}
return devs
}
// List returns information about the resources assigned to pods on the node
func (p *v1alpha1PodResourcesServer) List(ctx context.Context, req *v1alpha1.ListPodResourcesRequest) (*v1alpha1.ListPodResourcesResponse, error) {
metrics.PodResourcesEndpointRequestsTotalCount.WithLabelValues("v1alpha1").Inc()
pods := p.podsProvider.GetPods()
podResources := make([]*v1alpha1.PodResources, len(pods))
p.devicesProvider.UpdateAllocatedDevices()
for i, pod := range pods {
pRes := v1alpha1.PodResources{
Name: pod.Name,
Namespace: pod.Namespace,
Containers: make([]*v1alpha1.ContainerResources, len(pod.Spec.Containers)),
}
for j, container := range pod.Spec.Containers {
pRes.Containers[j] = &v1alpha1.ContainerResources{
Name: container.Name,
Devices: v1DevicesToAlphaV1(p.devicesProvider.GetDevices(string(pod.UID), container.Name)),
}
}
podResources[i] = &pRes
}
return &v1alpha1.ListPodResourcesResponse{
PodResources: podResources,
}, nil
}

View File

@ -0,0 +1,67 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate mockery
package podresources
import (
v1 "k8s.io/api/core/v1"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
)
// DevicesProvider knows how to provide the devices used by the given container
type DevicesProvider interface {
// UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
UpdateAllocatedDevices()
// GetDevices returns information about the devices assigned to pods and containers
GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices
// GetAllocatableDevices returns information about all the devices known to the manager
GetAllocatableDevices() []*podresourcesapi.ContainerDevices
}
// PodsProvider knows how to provide the pods admitted by the node
type PodsProvider interface {
GetPods() []*v1.Pod
GetPodByName(namespace, name string) (*v1.Pod, bool)
}
// CPUsProvider knows how to provide the cpus used by the given container
type CPUsProvider interface {
// GetCPUs returns information about the cpus assigned to pods and containers
GetCPUs(podUID, containerName string) []int64
// GetAllocatableCPUs returns the allocatable (not allocated) CPUs
GetAllocatableCPUs() []int64
}
type MemoryProvider interface {
// GetMemory returns information about the memory assigned to containers
GetMemory(podUID, containerName string) []*podresourcesapi.ContainerMemory
// GetAllocatableMemory returns the allocatable memory from the node
GetAllocatableMemory() []*podresourcesapi.ContainerMemory
}
type DynamicResourcesProvider interface {
// GetDynamicResources returns information about dynamic resources assigned to pods and containers
GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource
}
type PodResourcesProviders struct {
Pods PodsProvider
Devices DevicesProvider
Cpus CPUsProvider
Memory MemoryProvider
DynamicResources DynamicResourcesProvider
}
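// Illustrative wiring sketch (hypothetical provider values, not part of the
// upstream file): the kubelet fills this struct with components that implement
// the interfaces above and passes it to the server constructors in this package,
// roughly:
//
//	providers := PodResourcesProviders{
//		Pods:    podManager,       // hypothetical PodsProvider
//		Devices: containerManager, // hypothetical DevicesProvider
//		Cpus:    containerManager, // hypothetical CPUsProvider
//		Memory:  containerManager, // hypothetical MemoryProvider
//	}
//	server := NewV1PodResourcesServer(providers)
//	podresourcesapi.RegisterPodResourcesListerServer(grpcServer, server)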

View File

@ -0,0 +1,10 @@
---
dir: testing
filename: cadvisor_mock.go
boilerplate-file: ../../../hack/boilerplate/boilerplate.generatego.txt
outpkg: testing
with-expecter: true
packages:
k8s.io/kubernetes/pkg/kubelet/cadvisor:
interfaces:
Interface:

View File

@ -0,0 +1,179 @@
//go:build linux
// +build linux
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import (
"context"
"flag"
"fmt"
"net/http"
"os"
"path"
"time"
// Register supported container handlers.
_ "github.com/google/cadvisor/container/containerd/install"
_ "github.com/google/cadvisor/container/crio/install"
_ "github.com/google/cadvisor/container/systemd/install"
"github.com/google/cadvisor/cache/memory"
cadvisormetrics "github.com/google/cadvisor/container"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
"github.com/google/cadvisor/manager"
"github.com/google/cadvisor/utils/sysfs"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
)
type cadvisorClient struct {
imageFsInfoProvider ImageFsInfoProvider
rootPath string
manager.Manager
}
var _ Interface = new(cadvisorClient)
// TODO(vmarmol): Make configurable.
// The amount of time for which to keep stats in memory.
const statsCacheDuration = 2 * time.Minute
const maxHousekeepingInterval = 15 * time.Second
const defaultHousekeepingInterval = 10 * time.Second
const allowDynamicHousekeeping = true
func init() {
// Override cAdvisor flag defaults.
flagOverrides := map[string]string{
// Override the default cAdvisor housekeeping interval.
"housekeeping_interval": defaultHousekeepingInterval.String(),
// Disable event storage by default.
"event_storage_event_limit": "default=0",
"event_storage_age_limit": "default=0",
}
for name, defaultValue := range flagOverrides {
if f := flag.Lookup(name); f != nil {
f.DefValue = defaultValue
f.Value.Set(defaultValue)
} else {
ctx := context.Background()
klog.FromContext(ctx).Error(nil, "Expected cAdvisor flag not found", "flag", name)
}
}
}
// New creates a new cAdvisor Interface for linux systems.
func New(imageFsInfoProvider ImageFsInfoProvider, rootPath string, cgroupRoots []string, usingLegacyStats, localStorageCapacityIsolation bool) (Interface, error) {
sysFs := sysfs.NewRealSysFs()
includedMetrics := cadvisormetrics.MetricSet{
cadvisormetrics.CpuUsageMetrics: struct{}{},
cadvisormetrics.MemoryUsageMetrics: struct{}{},
cadvisormetrics.CpuLoadMetrics: struct{}{},
cadvisormetrics.DiskIOMetrics: struct{}{},
cadvisormetrics.NetworkUsageMetrics: struct{}{},
cadvisormetrics.AppMetrics: struct{}{},
cadvisormetrics.ProcessMetrics: struct{}{},
cadvisormetrics.OOMMetrics: struct{}{},
}
if usingLegacyStats || localStorageCapacityIsolation {
includedMetrics[cadvisormetrics.DiskUsageMetrics] = struct{}{}
}
duration := maxHousekeepingInterval
housekeepingConfig := manager.HousekeepingConfig{
Interval: &duration,
AllowDynamic: ptr.To(allowDynamicHousekeeping),
}
// Create the cAdvisor container manager.
m, err := manager.New(memory.New(statsCacheDuration, nil), sysFs, housekeepingConfig, includedMetrics, http.DefaultClient, cgroupRoots, nil /* containerEnvMetadataWhiteList */, "" /* perfEventsFile */, time.Duration(0) /*resctrlInterval*/)
if err != nil {
return nil, err
}
if _, err := os.Stat(rootPath); err != nil {
if os.IsNotExist(err) {
if err := os.MkdirAll(path.Clean(rootPath), 0750); err != nil {
return nil, fmt.Errorf("error creating root directory %q: %v", rootPath, err)
}
} else {
return nil, fmt.Errorf("failed to Stat %q: %v", rootPath, err)
}
}
return &cadvisorClient{
imageFsInfoProvider: imageFsInfoProvider,
rootPath: rootPath,
Manager: m,
}, nil
}
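// Illustrative construction sketch (hypothetical runtime endpoint, paths and
// cgroup roots, not part of the upstream file): the kubelet builds and starts
// this client roughly as
//
//	provider := NewImageFsInfoProvider("unix:///run/containerd/containerd.sock")
//	client, err := New(provider, "/var/lib/kubelet", []string{"/kubepods"}, false, false)
//	if err != nil {
//		return err
//	}
//	if err := client.Start(); err != nil {
//		return err
//	}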
func (cc *cadvisorClient) Start() error {
return cc.Manager.Start()
}
func (cc *cadvisorClient) ContainerInfoV2(name string, options cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) {
return cc.GetContainerInfoV2(name, options)
}
func (cc *cadvisorClient) VersionInfo() (*cadvisorapi.VersionInfo, error) {
return cc.GetVersionInfo()
}
func (cc *cadvisorClient) MachineInfo() (*cadvisorapi.MachineInfo, error) {
return cc.GetMachineInfo()
}
func (cc *cadvisorClient) ImagesFsInfo(ctx context.Context) (cadvisorapiv2.FsInfo, error) {
label, err := cc.imageFsInfoProvider.ImageFsInfoLabel()
if err != nil {
return cadvisorapiv2.FsInfo{}, err
}
return cc.getFsInfo(ctx, label)
}
func (cc *cadvisorClient) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
return cc.GetDirFsInfo(cc.rootPath)
}
func (cc *cadvisorClient) getFsInfo(ctx context.Context, label string) (cadvisorapiv2.FsInfo, error) {
res, err := cc.GetFsInfo(label)
if err != nil {
return cadvisorapiv2.FsInfo{}, err
}
if len(res) == 0 {
return cadvisorapiv2.FsInfo{}, fmt.Errorf("failed to find information for the filesystem labeled %q", label)
}
// TODO(vmarmol): Handle this better when a label has more than one image filesystem.
if len(res) > 1 {
klog.FromContext(ctx).Info("More than one filesystem labeled. Only using the first one", "label", label, "fileSystem", res)
}
return res[0], nil
}
func (cc *cadvisorClient) ContainerFsInfo(ctx context.Context) (cadvisorapiv2.FsInfo, error) {
label, err := cc.imageFsInfoProvider.ContainerFsInfoLabel()
if err != nil {
return cadvisorapiv2.FsInfo{}, err
}
return cc.getFsInfo(ctx, label)
}

View File

@ -0,0 +1,76 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import (
"context"
"errors"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
)
type cadvisorUnsupported struct {
}
var _ Interface = new(cadvisorUnsupported)
// New creates a new cAdvisor Interface for unsupported systems.
func New(imageFsInfoProvider ImageFsInfoProvider, rootPath string, cgroupsRoots []string, usingLegacyStats, localStorageCapacityIsolation bool) (Interface, error) {
return &cadvisorUnsupported{}, nil
}
var errUnsupported = errors.New("cAdvisor is unsupported in this build")
func (cu *cadvisorUnsupported) Start() error {
return errUnsupported
}
func (cu *cadvisorUnsupported) ContainerInfoV2(name string, options cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) {
return nil, errUnsupported
}
func (cu *cadvisorUnsupported) GetRequestedContainersInfo(containerName string, options cadvisorapiv2.RequestOptions) (map[string]*cadvisorapi.ContainerInfo, error) {
return nil, errUnsupported
}
func (cu *cadvisorUnsupported) MachineInfo() (*cadvisorapi.MachineInfo, error) {
return nil, errUnsupported
}
func (cu *cadvisorUnsupported) VersionInfo() (*cadvisorapi.VersionInfo, error) {
return nil, errUnsupported
}
func (cu *cadvisorUnsupported) ImagesFsInfo(context.Context) (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, errUnsupported
}
func (cu *cadvisorUnsupported) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, errUnsupported
}
func (cu *cadvisorUnsupported) ContainerFsInfo(context.Context) (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, errUnsupported
}
func (cu *cadvisorUnsupported) GetDirFsInfo(path string) (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, nil
}

View File

@ -0,0 +1,81 @@
//go:build windows
// +build windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import (
"context"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
"k8s.io/kubernetes/pkg/kubelet/winstats"
)
type cadvisorClient struct {
rootPath string
winStatsClient winstats.Client
}
var _ Interface = new(cadvisorClient)
// New creates a new cAdvisor Interface for Windows systems.
func New(imageFsInfoProvider ImageFsInfoProvider, rootPath string, cgroupRoots []string, usingLegacyStats, localStorageCapacityIsolation bool) (Interface, error) {
client, err := winstats.NewPerfCounterClient()
return &cadvisorClient{
rootPath: rootPath,
winStatsClient: client,
}, err
}
func (cu *cadvisorClient) Start() error {
return nil
}
// ContainerInfoV2 is only expected to be used for the root container. Returns info for all containers in the node.
func (cu *cadvisorClient) ContainerInfoV2(name string, options cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error) {
return cu.winStatsClient.WinContainerInfos()
}
func (cu *cadvisorClient) GetRequestedContainersInfo(containerName string, options cadvisorapiv2.RequestOptions) (map[string]*cadvisorapi.ContainerInfo, error) {
return nil, nil
}
func (cu *cadvisorClient) MachineInfo() (*cadvisorapi.MachineInfo, error) {
return cu.winStatsClient.WinMachineInfo()
}
func (cu *cadvisorClient) VersionInfo() (*cadvisorapi.VersionInfo, error) {
return cu.winStatsClient.WinVersionInfo()
}
func (cu *cadvisorClient) ImagesFsInfo(context.Context) (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, nil
}
func (cu *cadvisorClient) ContainerFsInfo(context.Context) (cadvisorapiv2.FsInfo, error) {
return cadvisorapiv2.FsInfo{}, nil
}
func (cu *cadvisorClient) RootFsInfo() (cadvisorapiv2.FsInfo, error) {
return cu.GetDirFsInfo(cu.rootPath)
}
func (cu *cadvisorClient) GetDirFsInfo(path string) (cadvisorapiv2.FsInfo, error) {
return cu.winStatsClient.GetDirFsInfo(path)
}

vendor/k8s.io/kubernetes/pkg/kubelet/cadvisor/doc.go generated vendored Normal file
View File

@ -0,0 +1,18 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package cadvisor provides an interface for Kubelet interactions with cAdvisor.
package cadvisor // import "k8s.io/kubernetes/pkg/kubelet/cadvisor"

View File

@ -0,0 +1,63 @@
//go:build linux
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import (
"fmt"
"strings"
cadvisorfs "github.com/google/cadvisor/fs"
)
// imageFsInfoProvider knows how to translate the configured runtime
// to its file system label for images.
type imageFsInfoProvider struct {
runtimeEndpoint string
}
// ImageFsInfoLabel returns the image fs label for the configured runtime.
// For remote runtimes, it handles additional runtimes natively understood by cAdvisor.
func (i *imageFsInfoProvider) ImageFsInfoLabel() (string, error) {
if detectCrioWorkaround(i) {
return cadvisorfs.LabelCrioImages, nil
}
return "", fmt.Errorf("no imagefs label for configured runtime")
}
// ContainerFsInfoLabel returns the container fs label for the configured runtime.
// For remote runtimes, it handles additional runtimes natively understood by cAdvisor.
func (i *imageFsInfoProvider) ContainerFsInfoLabel() (string, error) {
if detectCrioWorkaround(i) {
return cadvisorfs.LabelCrioContainers, nil
}
return "", fmt.Errorf("no containerfs label for configured runtime")
}
// This is a temporary workaround to get stats for cri-o from cadvisor
// and should be removed.
// Related to https://github.com/kubernetes/kubernetes/issues/51798
func detectCrioWorkaround(i *imageFsInfoProvider) bool {
return strings.HasSuffix(i.runtimeEndpoint, CrioSocketSuffix)
}
// NewImageFsInfoProvider returns a provider for the specified runtime configuration.
func NewImageFsInfoProvider(runtimeEndpoint string) ImageFsInfoProvider {
return &imageFsInfoProvider{runtimeEndpoint: runtimeEndpoint}
}

View File

@ -0,0 +1,39 @@
//go:build !linux
// +build !linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import "errors"
type unsupportedImageFsInfoProvider struct{}
// ImageFsInfoLabel returns the image fs label for the configured runtime.
// For remote runtimes, it handles additional runtimes natively understood by cAdvisor.
func (i *unsupportedImageFsInfoProvider) ImageFsInfoLabel() (string, error) {
return "", errors.New("unsupported")
}
func (i *unsupportedImageFsInfoProvider) ContainerFsInfoLabel() (string, error) {
return "", errors.New("unsupported")
}
// NewImageFsInfoProvider returns a provider for the specified runtime configuration.
func NewImageFsInfoProvider(runtimeEndpoint string) ImageFsInfoProvider {
return &unsupportedImageFsInfoProvider{}
}

vendor/k8s.io/kubernetes/pkg/kubelet/cadvisor/types.go generated vendored Normal file
View File

@ -0,0 +1,56 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate mockery
package cadvisor
import (
"context"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapiv2 "github.com/google/cadvisor/info/v2"
)
// Interface is an abstract interface for testability. It abstracts the interface to cAdvisor.
type Interface interface {
Start() error
ContainerInfoV2(name string, options cadvisorapiv2.RequestOptions) (map[string]cadvisorapiv2.ContainerInfo, error)
GetRequestedContainersInfo(containerName string, options cadvisorapiv2.RequestOptions) (map[string]*cadvisorapi.ContainerInfo, error)
MachineInfo() (*cadvisorapi.MachineInfo, error)
VersionInfo() (*cadvisorapi.VersionInfo, error)
// Returns usage information about the filesystem holding container images.
ImagesFsInfo(context.Context) (cadvisorapiv2.FsInfo, error)
// Returns usage information about the root filesystem.
RootFsInfo() (cadvisorapiv2.FsInfo, error)
// Returns usage information about the writeable layer.
// With KEP 4191 the image filesystem can be separated from the writeable layer.
ContainerFsInfo(context.Context) (cadvisorapiv2.FsInfo, error)
// Get filesystem information for the filesystem that contains the given file.
GetDirFsInfo(path string) (cadvisorapiv2.FsInfo, error)
}
// ImageFsInfoProvider informs cAdvisor how to find imagefs for container images.
type ImageFsInfoProvider interface {
// ImageFsInfoLabel returns the label cAdvisor should use to find the filesystem holding container images.
ImageFsInfoLabel() (string, error)
// In split image filesystem this will be different from ImageFsInfoLabel
ContainerFsInfoLabel() (string, error)
}

vendor/k8s.io/kubernetes/pkg/kubelet/cadvisor/util.go generated vendored Normal file
View File

@ -0,0 +1,85 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cadvisor
import (
"strings"
cadvisorapi "github.com/google/cadvisor/info/v1"
cadvisorapi2 "github.com/google/cadvisor/info/v2"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/features"
)
const (
// CrioSocketSuffix is the suffix of the path to the CRI-O socket.
// Please keep this in sync with the one in:
// github.com/google/cadvisor/tree/master/container/crio/client.go
// Note, however, that we only match on the suffix, as /var/run is often a
// symlink to /run, so the user can specify either path.
CrioSocketSuffix = "run/crio/crio.sock"
)
// CapacityFromMachineInfo returns the capacity of the resources from the machine info.
func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
c := v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(
int64(info.NumCores*1000),
resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(
int64(info.MemoryCapacity),
resource.BinarySI),
}
// if huge pages are enabled, we report them as a schedulable resource on the node
for _, hugepagesInfo := range info.HugePages {
pageSizeBytes := int64(hugepagesInfo.PageSize * 1024)
hugePagesBytes := pageSizeBytes * int64(hugepagesInfo.NumPages)
pageSizeQuantity := resource.NewQuantity(pageSizeBytes, resource.BinarySI)
c[v1helper.HugePageResourceName(*pageSizeQuantity)] = *resource.NewQuantity(hugePagesBytes, resource.BinarySI)
}
return c
}
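// Worked example (illustrative, not part of the upstream file): for a machine
// with 8 cores, 32 GiB of memory and no huge pages,
//
//	info := &cadvisorapi.MachineInfo{NumCores: 8, MemoryCapacity: 32 * 1024 * 1024 * 1024}
//	c := CapacityFromMachineInfo(info)
//	// c[v1.ResourceCPU]    -> "8" (8000 millicores)
//	// c[v1.ResourceMemory] -> "32Gi"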
// EphemeralStorageCapacityFromFsInfo returns the capacity of the ephemeral storage from the FsInfo.
func EphemeralStorageCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(info.Capacity),
resource.BinarySI),
}
return c
}
// UsingLegacyCadvisorStats returns true if container stats are provided by cadvisor instead of through the CRI.
// CRI integrations should get container metrics via CRI.
// TODO: cri-o relies on cadvisor as a temporary workaround. The code should
// be removed. Related issue:
// https://github.com/kubernetes/kubernetes/issues/51798
func UsingLegacyCadvisorStats(runtimeEndpoint string) bool {
// If PodAndContainerStatsFromCRI feature is enabled, then assume the user
// wants to use CRI stats, as the aforementioned workaround isn't needed
// when this feature is enabled.
if utilfeature.DefaultFeatureGate.Enabled(features.PodAndContainerStatsFromCRI) {
return false
}
return strings.HasSuffix(runtimeEndpoint, CrioSocketSuffix)
}

View File

@ -0,0 +1,25 @@
## DISCLAIMER
- The SIG Node community has reached a general consensus that, as a best practice,
introducing any new checkpointing support should be avoided. We reached this
understanding after struggling with hard-to-debug production issues caused by
checkpointing.
- Any changes to the checkpointed data structure would be considered incompatible, and a component should add its own handling if it needs to ensure backward compatibility when reading old-format checkpoint files.
## Introduction
This folder contains a framework and primitives, the Checkpointing Manager, which is
used by several other Kubelet submodules (`dockershim`, `devicemanager`, `pods`
and `cpumanager`) to implement checkpointing at the submodule level. As explained
in the `Disclaimer` section above, think twice before introducing any further
checkpointing in the Kubelet. If checkpointing is still required, this folder
provides the common APIs and the framework for implementing it. Using the same
APIs across all submodules helps maintain consistency at the Kubelet level.
Below is the history of checkpointing support in Kubelet.
| Package | First checkpointing support merged on | PR link |
| ------- | ------------------------------------- | ------- |
| kubelet/dockershim | Feb 3, 2017 | [[CRI] Implement Dockershim Checkpoint](https://github.com/kubernetes/kubernetes/pull/39903) |
| devicemanager | Sep 6, 2017 | [Deviceplugin checkpoint](https://github.com/kubernetes/kubernetes/pull/51744) |
| kubelet/pod | Nov 22, 2017 | [Initial basic bootstrap-checkpoint support](https://github.com/kubernetes/kubernetes/pull/50984) |
| cpumanager | Oct 27, 2017 | [Add file backed state to cpu manager](https://github.com/kubernetes/kubernetes/pull/54408) |

View File

@ -0,0 +1,110 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checkpointmanager
import (
"fmt"
"sync"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
utilstore "k8s.io/kubernetes/pkg/kubelet/util/store"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
)
// Checkpoint provides the process checkpoint data
type Checkpoint interface {
MarshalCheckpoint() ([]byte, error)
UnmarshalCheckpoint(blob []byte) error
VerifyChecksum() error
}
// CheckpointManager provides the interface to manage checkpoint
type CheckpointManager interface {
// CreateCheckpoint persists checkpoint in CheckpointStore. checkpointKey is the key for utilstore to locate checkpoint.
// For file backed utilstore, checkpointKey is the file name to write the checkpoint data.
CreateCheckpoint(checkpointKey string, checkpoint Checkpoint) error
// GetCheckpoint retrieves checkpoint from CheckpointStore.
GetCheckpoint(checkpointKey string, checkpoint Checkpoint) error
// WARNING: RemoveCheckpoint will not return error if checkpoint does not exist.
RemoveCheckpoint(checkpointKey string) error
// ListCheckpoint returns the list of existing checkpoints.
ListCheckpoints() ([]string, error)
}
// impl is an implementation of CheckpointManager. It persists checkpoint in CheckpointStore
type impl struct {
path string
store utilstore.Store
mutex sync.Mutex
}
// NewCheckpointManager returns a new instance of a checkpoint manager
func NewCheckpointManager(checkpointDir string) (CheckpointManager, error) {
fstore, err := utilstore.NewFileStore(checkpointDir, &utilfs.DefaultFs{})
if err != nil {
return nil, err
}
return &impl{path: checkpointDir, store: fstore}, nil
}
// CreateCheckpoint persists checkpoint in CheckpointStore.
func (manager *impl) CreateCheckpoint(checkpointKey string, checkpoint Checkpoint) error {
manager.mutex.Lock()
defer manager.mutex.Unlock()
blob, err := checkpoint.MarshalCheckpoint()
if err != nil {
return err
}
return manager.store.Write(checkpointKey, blob)
}
// GetCheckpoint retrieves checkpoint from CheckpointStore.
func (manager *impl) GetCheckpoint(checkpointKey string, checkpoint Checkpoint) error {
manager.mutex.Lock()
defer manager.mutex.Unlock()
blob, err := manager.store.Read(checkpointKey)
if err != nil {
if err == utilstore.ErrKeyNotFound {
return errors.ErrCheckpointNotFound
}
return err
}
err = checkpoint.UnmarshalCheckpoint(blob)
if err == nil {
err = checkpoint.VerifyChecksum()
}
return err
}
// RemoveCheckpoint will not return error if checkpoint does not exist.
func (manager *impl) RemoveCheckpoint(checkpointKey string) error {
manager.mutex.Lock()
defer manager.mutex.Unlock()
return manager.store.Delete(checkpointKey)
}
// ListCheckpoints returns the list of existing checkpoints.
func (manager *impl) ListCheckpoints() ([]string, error) {
manager.mutex.Lock()
defer manager.mutex.Unlock()
keys, err := manager.store.List()
if err != nil {
return []string{}, fmt.Errorf("failed to list checkpoint store: %v", err)
}
return keys, nil
}
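// Illustrative sketch (hypothetical type and paths, not part of the upstream
// file): a submodule typically implements Checkpoint on a JSON-serializable
// struct with an embedded checksum and persists it through this manager:
//
//	type myCheckpoint struct {
//		Entries  map[string]string `json:"entries"`
//		Checksum checksum.Checksum `json:"checksum"`
//	}
//
//	func (cp *myCheckpoint) MarshalCheckpoint() ([]byte, error) {
//		cp.Checksum = checksum.New(cp.Entries)
//		return json.Marshal(cp)
//	}
//	func (cp *myCheckpoint) UnmarshalCheckpoint(blob []byte) error { return json.Unmarshal(blob, cp) }
//	func (cp *myCheckpoint) VerifyChecksum() error                 { return cp.Checksum.Verify(cp.Entries) }
//
//	manager, err := NewCheckpointManager("/var/lib/kubelet/device-plugins") // directory is an assumption
//	if err == nil {
//		err = manager.CreateCheckpoint("my_checkpoint", &myCheckpoint{Entries: map[string]string{"k": "v"}})
//	}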

View File

@ -0,0 +1,48 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checksum
import (
"hash/fnv"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
hashutil "k8s.io/kubernetes/pkg/util/hash"
)
// Checksum is the data to be stored as checkpoint
type Checksum uint64
// Verify verifies that the passed checksum matches the checksum calculated from data
func (cs Checksum) Verify(data interface{}) error {
actualCS := New(data)
if cs != actualCS {
return &errors.CorruptCheckpointError{ActualCS: uint64(actualCS), ExpectedCS: uint64(cs)}
}
return nil
}
// New returns the Checksum of checkpoint data
func New(data interface{}) Checksum {
return Checksum(getChecksum(data))
}
// getChecksum returns the calculated checksum of the checkpoint data
func getChecksum(data interface{}) uint64 {
hash := fnv.New32a()
hashutil.DeepHashObject(hash, data)
return uint64(hash.Sum32())
}

View File

@ -0,0 +1,45 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package errors
import "fmt"
// CorruptCheckpointError is reported when the checksum does not match.
// Check for it with:
//
// var csErr *CorruptCheckpointError
// if errors.As(err, &csErr) { ... }
// if errors.Is(err, CorruptCheckpointError{}) { ... }
type CorruptCheckpointError struct {
ActualCS, ExpectedCS uint64
}
func (err CorruptCheckpointError) Error() string {
return "checkpoint is corrupted"
}
func (err CorruptCheckpointError) Is(target error) bool {
switch target.(type) {
case *CorruptCheckpointError, CorruptCheckpointError:
return true
default:
return false
}
}
// ErrCheckpointNotFound is reported when checkpoint is not found for a given key
var ErrCheckpointNotFound = fmt.Errorf("checkpoint is not found")

vendor/k8s.io/kubernetes/pkg/kubelet/cm/.mockery.yaml generated vendored Normal file
View File

@ -0,0 +1,11 @@
---
dir: testing
filename: "mock_{{.InterfaceName | snakecase}}.go"
boilerplate-file: ../../../hack/boilerplate/boilerplate.generatego.txt
outpkg: testing
with-expecter: true
packages:
k8s.io/kubernetes/pkg/kubelet/cm:
interfaces:
ContainerManager:
PodContainerManager:

vendor/k8s.io/kubernetes/pkg/kubelet/cm/OWNERS generated vendored Normal file
View File

@ -0,0 +1,13 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- Random-Liu
- dchen1107
- derekwaynecarr
- yujuhong
- klueska
reviewers:
- sig-node-reviewers
emeritus_approvers:
- ConnorDoyle
- vishh

View File

@ -0,0 +1,62 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package admission
import (
"errors"
"fmt"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
const (
ErrorReasonUnexpected = "UnexpectedAdmissionError"
)
type Error interface {
Error() string
Type() string
}
type unexpectedAdmissionError struct{ Err error }
var _ Error = (*unexpectedAdmissionError)(nil)
func (e *unexpectedAdmissionError) Error() string {
return fmt.Sprintf("Allocate failed due to %v, which is unexpected", e.Err)
}
func (e *unexpectedAdmissionError) Type() string {
return ErrorReasonUnexpected
}
func GetPodAdmitResult(err error) lifecycle.PodAdmitResult {
if err == nil {
return lifecycle.PodAdmitResult{Admit: true}
}
var admissionErr Error
if !errors.As(err, &admissionErr) {
admissionErr = &unexpectedAdmissionError{err}
}
return lifecycle.PodAdmitResult{
Message: admissionErr.Error(),
Reason: admissionErr.Type(),
Admit: false,
}
}
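// Illustrative sketch (hypothetical allocation helper, not part of the upstream
// file): resource managers wrap allocation failures and convert them into a
// lifecycle.PodAdmitResult through this helper, roughly:
//
//	if err := allocatePodResources(pod); err != nil { // allocatePodResources is hypothetical
//		return GetPodAdmitResult(&unexpectedAdmissionError{Err: err})
//	}
//	return GetPodAdmitResult(nil) // admit the pod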

View File

@ -0,0 +1,485 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strings"
"sync"
"time"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
libcontainercgroupmanager "github.com/opencontainers/runc/libcontainer/cgroups/manager"
cgroupsystemd "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
libcontainerconfigs "github.com/opencontainers/runc/libcontainer/configs"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
const (
// systemdSuffix is the cgroup name suffix for systemd
systemdSuffix string = ".slice"
// Cgroup2MemoryMin is memory.min for cgroup v2
Cgroup2MemoryMin string = "memory.min"
// Cgroup2MemoryHigh is memory.high for cgroup v2
Cgroup2MemoryHigh string = "memory.high"
Cgroup2MaxCpuLimit string = "max"
Cgroup2MaxSwapFilename string = "memory.swap.max"
)
var RootCgroupName = CgroupName([]string{})
// NewCgroupName composes a new cgroup name.
// Use RootCgroupName as base to start at the root.
// This function does some basic checks for invalid characters in the name.
func NewCgroupName(base CgroupName, components ...string) CgroupName {
for _, component := range components {
// Forbid using "_" in internal names. When remapping internal
// names to systemd cgroup driver, we want to remap "-" => "_",
// so we forbid "_" so that we can always reverse the mapping.
if strings.Contains(component, "/") || strings.Contains(component, "_") {
panic(fmt.Errorf("invalid character in component [%q] of CgroupName", component))
}
}
return CgroupName(append(append([]string{}, base...), components...))
}
func escapeSystemdCgroupName(part string) string {
return strings.Replace(part, "-", "_", -1)
}
func unescapeSystemdCgroupName(part string) string {
return strings.Replace(part, "_", "-", -1)
}
// cgroupName.ToSystemd converts the internal cgroup name to a systemd name.
// For example, the name {"kubepods", "burstable", "pod1234-abcd-5678-efgh"} becomes
// "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd_5678_efgh.slice"
// This function always expands the systemd name into the cgroupfs form. If only
// the last part is needed, use path.Base(...) on it to discard the rest.
func (cgroupName CgroupName) ToSystemd() string {
if len(cgroupName) == 0 || (len(cgroupName) == 1 && cgroupName[0] == "") {
return "/"
}
newparts := []string{}
for _, part := range cgroupName {
part = escapeSystemdCgroupName(part)
newparts = append(newparts, part)
}
result, err := cgroupsystemd.ExpandSlice(strings.Join(newparts, "-") + systemdSuffix)
if err != nil {
// Should never happen...
panic(fmt.Errorf("error converting cgroup name [%v] to systemd format: %v", cgroupName, err))
}
return result
}
func ParseSystemdToCgroupName(name string) CgroupName {
driverName := path.Base(name)
driverName = strings.TrimSuffix(driverName, systemdSuffix)
parts := strings.Split(driverName, "-")
result := []string{}
for _, part := range parts {
result = append(result, unescapeSystemdCgroupName(part))
}
return CgroupName(result)
}
func (cgroupName CgroupName) ToCgroupfs() string {
return "/" + path.Join(cgroupName...)
}
func ParseCgroupfsToCgroupName(name string) CgroupName {
components := strings.Split(strings.TrimPrefix(name, "/"), "/")
if len(components) == 1 && components[0] == "" {
components = []string{}
}
return CgroupName(components)
}
func IsSystemdStyleName(name string) bool {
return strings.HasSuffix(name, systemdSuffix)
}
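// Worked example (illustrative, not part of the upstream file) of the two
// driver-specific representations of one internal cgroup name:
//
//	name := NewCgroupName(RootCgroupName, "kubepods", "burstable", "pod1234-abcd")
//	name.ToCgroupfs() // "/kubepods/burstable/pod1234-abcd"
//	name.ToSystemd()  // "/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod1234_abcd.slice"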
// CgroupSubsystems holds information about the mounted cgroup subsystems
type CgroupSubsystems struct {
// Cgroup subsystem mounts.
// e.g.: "/sys/fs/cgroup/cpu" -> ["cpu", "cpuacct"]
Mounts []libcontainercgroups.Mount
// Cgroup subsystem to their mount location.
// e.g.: "cpu" -> "/sys/fs/cgroup/cpu"
MountPoints map[string]string
}
// cgroupCommon implements common tasks
// that are valid for both cgroup v1 and v2.
// This prevents duplicating the code between
// v1 and v2 specific implementations.
type cgroupCommon struct {
// subsystems holds information about all the
// mounted cgroup subsystems on the node
subsystems *CgroupSubsystems
// useSystemd tells if systemd cgroup manager should be used.
useSystemd bool
}
// Make sure that cgroupV1impl and cgroupV2impl implement the CgroupManager interface
var _ CgroupManager = &cgroupV1impl{}
var _ CgroupManager = &cgroupV2impl{}
// NewCgroupManager is a factory method that returns a CgroupManager
func NewCgroupManager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
if libcontainercgroups.IsCgroup2UnifiedMode() {
return NewCgroupV2Manager(cs, cgroupDriver)
}
return NewCgroupV1Manager(cs, cgroupDriver)
}
func newCgroupCommon(cs *CgroupSubsystems, cgroupDriver string) cgroupCommon {
return cgroupCommon{
subsystems: cs,
useSystemd: cgroupDriver == "systemd",
}
}
// Name converts the cgroup to the driver specific value in cgroupfs form.
// This always returns a valid cgroupfs path even when systemd driver is in use!
func (m *cgroupCommon) Name(name CgroupName) string {
if m.useSystemd {
return name.ToSystemd()
}
return name.ToCgroupfs()
}
// CgroupName converts the literal cgroupfs name on the host to an internal identifier.
func (m *cgroupCommon) CgroupName(name string) CgroupName {
if m.useSystemd {
return ParseSystemdToCgroupName(name)
}
return ParseCgroupfsToCgroupName(name)
}
// buildCgroupPaths builds a path to each cgroup subsystem for the specified name.
func (m *cgroupCommon) buildCgroupPaths(name CgroupName) map[string]string {
cgroupFsAdaptedName := m.Name(name)
cgroupPaths := make(map[string]string, len(m.subsystems.MountPoints))
for key, val := range m.subsystems.MountPoints {
cgroupPaths[key] = path.Join(val, cgroupFsAdaptedName)
}
return cgroupPaths
}
// libctCgroupConfig converts CgroupConfig to libcontainer's Cgroup config.
func (m *cgroupCommon) libctCgroupConfig(in *CgroupConfig, needResources bool) *libcontainerconfigs.Cgroup {
config := &libcontainerconfigs.Cgroup{
Systemd: m.useSystemd,
}
if needResources {
config.Resources = m.toResources(in.ResourceParameters)
} else {
config.Resources = &libcontainerconfigs.Resources{}
}
if !config.Systemd {
// For fs cgroup manager, we can either set Path or Name and Parent.
// Setting Path is easier.
config.Path = in.Name.ToCgroupfs()
return config
}
// For systemd, we have to set Name and Parent, as they are needed to talk to systemd.
// Setting Path is optional as it can be deduced from Name and Parent.
// TODO(filbranden): This logic belongs in libcontainer/cgroup/systemd instead.
// It should take a libcontainerconfigs.Cgroup.Path field (rather than Name and Parent)
// and split it appropriately, using essentially the logic below.
// This was done for cgroupfs in opencontainers/runc#497 but a counterpart
// for systemd was never introduced.
dir, base := path.Split(in.Name.ToSystemd())
if dir == "/" {
dir = "-.slice"
} else {
dir = path.Base(dir)
}
config.Parent = dir
config.Name = base
return config
}
// Destroy destroys the specified cgroup
func (m *cgroupCommon) Destroy(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("destroy").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, false)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return err
}
// Delete cgroups using libcontainers Managers Destroy() method
if err = manager.Destroy(); err != nil {
return fmt.Errorf("unable to destroy cgroup paths for cgroup %v : %v", cgroupConfig.Name, err)
}
return nil
}
func (m *cgroupCommon) SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error {
containerConfig := &CgroupConfig{
Name: name,
ResourceParameters: resourceConfig,
}
return m.Update(containerConfig)
}
// getCPUWeight converts from the range [2, 262144] to [1, 10000]
func getCPUWeight(cpuShares *uint64) uint64 {
if cpuShares == nil {
return 0
}
if *cpuShares >= 262144 {
return 10000
}
return 1 + ((*cpuShares-2)*9999)/262142
}
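// Worked example (illustrative, not part of the upstream file): for the common
// cgroup v1 default of 1024 CPU shares,
//
//	shares := uint64(1024)
//	getCPUWeight(&shares) // 1 + ((1024-2)*9999)/262142 = 1 + 38 = 39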
var (
availableRootControllersOnce sync.Once
availableRootControllers sets.Set[string]
)
func (m *cgroupCommon) toResources(resourceConfig *ResourceConfig) *libcontainerconfigs.Resources {
resources := &libcontainerconfigs.Resources{
SkipDevices: true,
SkipFreezeOnSet: true,
}
if resourceConfig == nil {
return resources
}
if resourceConfig.Memory != nil {
resources.Memory = *resourceConfig.Memory
}
if resourceConfig.CPUShares != nil {
if libcontainercgroups.IsCgroup2UnifiedMode() {
resources.CpuWeight = getCPUWeight(resourceConfig.CPUShares)
} else {
resources.CpuShares = *resourceConfig.CPUShares
}
}
if resourceConfig.CPUQuota != nil {
resources.CpuQuota = *resourceConfig.CPUQuota
}
if resourceConfig.CPUPeriod != nil {
resources.CpuPeriod = *resourceConfig.CPUPeriod
}
if resourceConfig.PidsLimit != nil {
resources.PidsLimit = *resourceConfig.PidsLimit
}
if !resourceConfig.CPUSet.IsEmpty() {
resources.CpusetCpus = resourceConfig.CPUSet.String()
}
m.maybeSetHugetlb(resourceConfig, resources)
// Ideally unified is used for all the resources when running on cgroup v2.
// It doesn't make a difference for the memory.max limit, but for e.g. the cpu controller
// you can specify the correct setting without relying on the conversions performed by the OCI runtime.
if resourceConfig.Unified != nil && libcontainercgroups.IsCgroup2UnifiedMode() {
resources.Unified = make(map[string]string)
for k, v := range resourceConfig.Unified {
resources.Unified[k] = v
}
}
return resources
}
func (m *cgroupCommon) maybeSetHugetlb(resourceConfig *ResourceConfig, resources *libcontainerconfigs.Resources) {
// Check if hugetlb is supported.
if libcontainercgroups.IsCgroup2UnifiedMode() {
if !getSupportedUnifiedControllers().Has("hugetlb") {
klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
return
}
} else if _, ok := m.subsystems.MountPoints["hugetlb"]; !ok {
klog.V(6).InfoS("Optional subsystem not supported: hugetlb")
return
}
// For each page size enumerated, set that value.
pageSizes := sets.New[string]()
for pageSize, limit := range resourceConfig.HugePageLimit {
sizeString, err := v1helper.HugePageUnitSizeFromByteSize(pageSize)
if err != nil {
klog.InfoS("Invalid pageSize", "err", err)
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: sizeString,
Limit: uint64(limit),
})
pageSizes.Insert(sizeString)
}
// for each page size omitted, limit to 0
for _, pageSize := range libcontainercgroups.HugePageSizes() {
if pageSizes.Has(pageSize) {
continue
}
resources.HugetlbLimit = append(resources.HugetlbLimit, &libcontainerconfigs.HugepageLimit{
Pagesize: pageSize,
Limit: uint64(0),
})
}
}
// Update updates the cgroup with the specified Cgroup Configuration
func (m *cgroupCommon) Update(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("update").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return fmt.Errorf("failed to create cgroup manager: %v", err)
}
return manager.Set(libcontainerCgroupConfig.Resources)
}
// Create creates the specified cgroup
func (m *cgroupCommon) Create(cgroupConfig *CgroupConfig) error {
start := time.Now()
defer func() {
metrics.CgroupManagerDuration.WithLabelValues("create").Observe(metrics.SinceInSeconds(start))
}()
libcontainerCgroupConfig := m.libctCgroupConfig(cgroupConfig, true)
manager, err := libcontainercgroupmanager.New(libcontainerCgroupConfig)
if err != nil {
return err
}
// Apply(-1) is a hack to create the cgroup directories for each resource
// subsystem. The function [cgroups.Manager.apply()] applies cgroup
// configuration to the process with the specified pid.
// It creates cgroup files for each subsystem and writes the pid
// into the tasks file. We use the function to create all the required
// cgroup files without attaching any "real" pid to the cgroup.
if err := manager.Apply(-1); err != nil {
return err
}
// It may seem odd to call Set right after Apply, but runc follows the same
// pattern; the extra call is needed to ensure the cpu quota is set properly.
if err := manager.Set(libcontainerCgroupConfig.Resources); err != nil {
utilruntime.HandleError(fmt.Errorf("cgroup manager.Set failed: %w", err))
}
return nil
}
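// exampleCreatePodCgroup is an editor's sketch (not part of the upstream file) of how a
// caller typically uses Create: build a driver-agnostic CgroupName plus the desired
// limits, and let the manager Apply the hierarchy and Set the resources. The cgroup name
// and memory limit below are hypothetical.
func exampleCreatePodCgroup(m CgroupManager) error {
memLimit := int64(512 * 1024 * 1024) // 512Mi pod-level memory limit
cfg := &CgroupConfig{
Name: NewCgroupName(RootCgroupName, "kubepods", "pod-example"),
ResourceParameters: &ResourceConfig{Memory: &memLimit},
}
return m.Create(cfg)
}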
// Pids scans through all subsystems to find pids associated with the specified cgroup.
func (m *cgroupCommon) Pids(name CgroupName) []int {
// we need the driver specific name
cgroupFsName := m.Name(name)
// Get a list of processes that we need to kill
pidsToKill := sets.New[int]()
var pids []int
for _, val := range m.subsystems.MountPoints {
dir := path.Join(val, cgroupFsName)
_, err := os.Stat(dir)
if os.IsNotExist(err) {
// The subsystem pod cgroup is already deleted
// do nothing, continue
continue
}
// Get a list of pids that are still charged to the pod's cgroup
pids, err = getCgroupProcs(dir)
if err != nil {
continue
}
pidsToKill.Insert(pids...)
// WalkFunc which is called for each file and directory in the pod cgroup dir
visitor := func(path string, info os.FileInfo, err error) error {
if err != nil {
klog.V(4).InfoS("Cgroup manager encountered error scanning cgroup path", "path", path, "err", err)
return filepath.SkipDir
}
if !info.IsDir() {
return nil
}
pids, err = getCgroupProcs(path)
if err != nil {
klog.V(4).InfoS("Cgroup manager encountered error getting procs for cgroup path", "path", path, "err", err)
return filepath.SkipDir
}
pidsToKill.Insert(pids...)
return nil
}
// Walk through the pod cgroup directory to find container cgroups that
// haven't been GCed yet, and collect the processes still attached to
// such leftover containers under the pod cgroup.
if err = filepath.Walk(dir, visitor); err != nil {
klog.V(4).InfoS("Cgroup manager encountered error scanning pids for directory", "path", dir, "err", err)
}
}
return sets.List(pidsToKill)
}
// ReduceCPULimits reduces the cgroup's cpu shares to the lowest possible value
func (m *cgroupCommon) ReduceCPULimits(cgroupName CgroupName) error {
// Set lowest possible CpuShares value for the cgroup
minimumCPUShares := uint64(MinShares)
resources := &ResourceConfig{
CPUShares: &minimumCPUShares,
}
containerConfig := &CgroupConfig{
Name: cgroupName,
ResourceParameters: resources,
}
return m.Update(containerConfig)
}
func readCgroupMemoryConfig(cgroupPath string, memLimitFile string) (*ResourceConfig, error) {
memLimit, err := fscommon.GetCgroupParamUint(cgroupPath, memLimitFile)
if err != nil {
return nil, fmt.Errorf("failed to read %s for cgroup %v: %v", memLimitFile, cgroupPath, err)
}
mLim := int64(memLimit)
//TODO(vinaykul,InPlacePodVerticalScaling): Add memory request support
return &ResourceConfig{Memory: &mLim}, nil
}

View File

@ -0,0 +1,120 @@
//go:build !linux
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
v1 "k8s.io/api/core/v1"
)
type unsupportedCgroupManager struct{}
var errNotSupported = errors.New("Cgroup Manager is not supported in this build")
// Make sure that unsupportedCgroupManager implements the CgroupManager interface
var _ CgroupManager = &unsupportedCgroupManager{}
type CgroupSubsystems struct {
Mounts []interface{}
MountPoints map[string]string
}
func NewCgroupManager(_ interface{}) CgroupManager {
return &unsupportedCgroupManager{}
}
func (m *unsupportedCgroupManager) Version() int {
return 0
}
func (m *unsupportedCgroupManager) Name(_ CgroupName) string {
return ""
}
func (m *unsupportedCgroupManager) Validate(_ CgroupName) error {
return errNotSupported
}
func (m *unsupportedCgroupManager) Exists(_ CgroupName) bool {
return false
}
func (m *unsupportedCgroupManager) Destroy(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Update(_ *CgroupConfig) error {
return nil
}
func (m *unsupportedCgroupManager) Create(_ *CgroupConfig) error {
return errNotSupported
}
func (m *unsupportedCgroupManager) MemoryUsage(_ CgroupName) (int64, error) {
return -1, errNotSupported
}
func (m *unsupportedCgroupManager) Pids(_ CgroupName) []int {
return nil
}
func (m *unsupportedCgroupManager) CgroupName(name string) CgroupName {
return CgroupName([]string{})
}
func (m *unsupportedCgroupManager) ReduceCPULimits(cgroupName CgroupName) error {
return nil
}
func (m *unsupportedCgroupManager) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
return nil, errNotSupported
}
func (m *unsupportedCgroupManager) SetCgroupConfig(name CgroupName, resourceConfig *ResourceConfig) error {
return errNotSupported
}
var RootCgroupName = CgroupName([]string{})
func NewCgroupName(base CgroupName, components ...string) CgroupName {
return append(append([]string{}, base...), components...)
}
func (cgroupName CgroupName) ToSystemd() string {
return ""
}
func ParseSystemdToCgroupName(name string) CgroupName {
return nil
}
func (cgroupName CgroupName) ToCgroupfs() string {
return ""
}
func ParseCgroupfsToCgroupName(name string) CgroupName {
return nil
}
func IsSystemdStyleName(name string) bool {
return false
}

View File

@ -0,0 +1,145 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
"fmt"
"strconv"
"strings"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
)
const cgroupv1MemLimitFile string = "memory.limit_in_bytes"
// cgroupV1impl implements the CgroupManager interface
// for cgroup v1.
// It's a stateless object which can be used to
// update, create or delete any number of cgroups.
// It relies on runc/libcontainer cgroup managers.
type cgroupV1impl struct {
cgroupCommon
}
func NewCgroupV1Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
return &cgroupV1impl{
cgroupCommon: newCgroupCommon(cs, cgroupDriver),
}
}
// Version of the cgroup implementation on the host
func (c *cgroupV1impl) Version() int {
return 1
}
// Validate checks if all subsystem cgroups are valid
func (c *cgroupV1impl) Validate(name CgroupName) error {
// Get map of all cgroup paths on the system for the particular cgroup
cgroupPaths := c.buildCgroupPaths(name)
// the presence of alternative control groups not known to runc confuses
// the kubelet existence checks.
// ideally, we would have a mechanism in runc to support Exists() logic
// scoped to the set control groups it understands. this is being discussed
// in https://github.com/opencontainers/runc/issues/1440
// once resolved, we can remove this code.
allowlistControllers := sets.New[string]("cpu", "cpuacct", "cpuset", "memory", "systemd", "pids")
if _, ok := c.subsystems.MountPoints["hugetlb"]; ok {
allowlistControllers.Insert("hugetlb")
}
var missingPaths []string
// If even one cgroup path doesn't exist, then the cgroup doesn't exist.
for controller, path := range cgroupPaths {
// ignore mounts we don't care about
if !allowlistControllers.Has(controller) {
continue
}
if !libcontainercgroups.PathExists(path) {
missingPaths = append(missingPaths, path)
}
}
if len(missingPaths) > 0 {
return fmt.Errorf("cgroup %q has some missing paths: %v", name, strings.Join(missingPaths, ", "))
}
return nil
}
// Exists checks if all subsystem cgroups already exist
func (c *cgroupV1impl) Exists(name CgroupName) bool {
return c.Validate(name) == nil
}
// MemoryUsage returns the current memory usage of the specified cgroup,
// as read from cgroupfs.
func (c *cgroupV1impl) MemoryUsage(name CgroupName) (int64, error) {
var path, file string
mp, ok := c.subsystems.MountPoints["memory"]
if !ok { // should not happen
return -1, errors.New("no cgroup v1 mountpoint for memory controller found")
}
path = mp + "/" + c.Name(name)
file = "memory.usage_in_bytes"
val, err := fscommon.GetCgroupParamUint(path, file)
return int64(val), err
}
// Get the resource config values applied to the cgroup for specified resource type
func (c *cgroupV1impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
cgroupPaths := c.buildCgroupPaths(name)
cgroupResourcePath, found := cgroupPaths[string(resource)]
if !found {
return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
}
switch resource {
case v1.ResourceCPU:
return c.getCgroupCPUConfig(cgroupResourcePath)
case v1.ResourceMemory:
return c.getCgroupMemoryConfig(cgroupResourcePath)
}
return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
}
func (c *cgroupV1impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
cpuQuotaStr, errQ := fscommon.GetCgroupParamString(cgroupPath, "cpu.cfs_quota_us")
if errQ != nil {
return nil, fmt.Errorf("failed to read CPU quota for cgroup %v: %w", cgroupPath, errQ)
}
cpuQuota, errInt := strconv.ParseInt(cpuQuotaStr, 10, 64)
if errInt != nil {
return nil, fmt.Errorf("failed to convert CPU quota as integer for cgroup %v: %w", cgroupPath, errInt)
}
cpuPeriod, errP := fscommon.GetCgroupParamUint(cgroupPath, "cpu.cfs_period_us")
if errP != nil {
return nil, fmt.Errorf("failed to read CPU period for cgroup %v: %w", cgroupPath, errP)
}
cpuShares, errS := fscommon.GetCgroupParamUint(cgroupPath, "cpu.shares")
if errS != nil {
return nil, fmt.Errorf("failed to read CPU shares for cgroup %v: %w", cgroupPath, errS)
}
return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuQuota, CPUPeriod: &cpuPeriod}, nil
}
func (c *cgroupV1impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
return readCgroupMemoryConfig(cgroupPath, cgroupv1MemLimitFile)
}

View File

@ -0,0 +1,177 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const cgroupv2MemLimitFile string = "memory.max"
// cgroupV2impl implements the CgroupManager interface
// for cgroup v2.
// It's a stateless object which can be used to
// update, create or delete any number of cgroups.
// It relies on runc/libcontainer cgroup managers.
type cgroupV2impl struct {
cgroupCommon
}
func NewCgroupV2Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
return &cgroupV2impl{
cgroupCommon: newCgroupCommon(cs, cgroupDriver),
}
}
// Version of the cgroup implementation on the host
func (c *cgroupV2impl) Version() int {
return 2
}
// Validate checks if all subsystem cgroups are valid
func (c *cgroupV2impl) Validate(name CgroupName) error {
cgroupPath := c.buildCgroupUnifiedPath(name)
neededControllers := getSupportedUnifiedControllers()
enabledControllers, err := readUnifiedControllers(cgroupPath)
if err != nil {
return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err)
}
difference := neededControllers.Difference(enabledControllers)
if difference.Len() > 0 {
return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(difference), ", "))
}
return nil
}
// Exists checks if all subsystem cgroups already exist
func (c *cgroupV2impl) Exists(name CgroupName) bool {
return c.Validate(name) == nil
}
// MemoryUsage returns the current memory usage of the specified cgroup,
// as read from cgroupfs.
func (c *cgroupV2impl) MemoryUsage(name CgroupName) (int64, error) {
var path, file string
path = c.buildCgroupUnifiedPath(name)
file = "memory.current"
val, err := fscommon.GetCgroupParamUint(path, file)
return int64(val), err
}
// Get the resource config values applied to the cgroup for specified resource type
func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
cgroupPaths := c.buildCgroupPaths(name)
cgroupResourcePath, found := cgroupPaths[string(resource)]
if !found {
return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
}
switch resource {
case v1.ResourceCPU:
return c.getCgroupCPUConfig(cgroupResourcePath)
case v1.ResourceMemory:
return c.getCgroupMemoryConfig(cgroupResourcePath)
}
return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
}
func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
var cpuLimitStr, cpuPeriodStr string
cpuLimitAndPeriod, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
if err != nil {
return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err)
}
numItems, errScan := fmt.Sscanf(cpuLimitAndPeriod, "%s %s", &cpuLimitStr, &cpuPeriodStr)
if errScan != nil || numItems != 2 {
return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w",
cpuLimitAndPeriod, cgroupPath, errScan)
}
cpuLimit := int64(-1)
if cpuLimitStr != Cgroup2MaxCpuLimit {
cpuLimit, err = strconv.ParseInt(cpuLimitStr, 10, 64)
if err != nil {
return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %w", cgroupPath, err)
}
}
cpuPeriod, errPeriod := strconv.ParseUint(cpuPeriodStr, 10, 64)
if errPeriod != nil {
return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, errPeriod)
}
cpuWeight, errWeight := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
if errWeight != nil {
return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, errWeight)
}
cpuShares := cpuWeightToCPUShares(cpuWeight)
return &ResourceConfig{CPUShares: &cpuShares, CPUQuota: &cpuLimit, CPUPeriod: &cpuPeriod}, nil
}
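// exampleParseCPUMax is an editor's sketch (not part of the upstream file) of the cpu.max
// format consumed above: "<quota> <period>", where the quota is either a number of
// microseconds per period or the literal Cgroup2MaxCpuLimit value meaning no limit.
func exampleParseCPUMax() {
var quotaStr, periodStr string
fmt.Sscanf("max 100000", "%s %s", &quotaStr, &periodStr) // unlimited CPU, default 100ms period
fmt.Sscanf("50000 100000", "%s %s", &quotaStr, &periodStr) // a 0.5 CPU limit
}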
func (c *cgroupV2impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
return readCgroupMemoryConfig(cgroupPath, cgroupv2MemLimitFile)
}
// getSupportedUnifiedControllers returns a set of supported controllers when running on cgroup v2
func getSupportedUnifiedControllers() sets.Set[string] {
// This is the set of controllers used by the Kubelet
supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids")
// Memoize the set of controllers that are present in the root cgroup
availableRootControllersOnce.Do(func() {
var err error
availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot)
if err != nil {
panic(fmt.Errorf("cannot read cgroup controllers at %s", cmutil.CgroupRoot))
}
})
// Return the set of controllers that are supported both by the Kubelet and by the kernel
return supportedControllers.Intersection(availableRootControllers)
}
// readUnifiedControllers reads the controllers available at the specified cgroup
func readUnifiedControllers(path string) (sets.Set[string], error) {
controllersFileContent, err := os.ReadFile(filepath.Join(path, "cgroup.controllers"))
if err != nil {
return nil, err
}
controllers := strings.Fields(string(controllersFileContent))
return sets.New(controllers...), nil
}
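// exampleRootControllers is an editor's sketch (not part of the upstream file): the
// cgroup.controllers file holds one space-separated line, so a typical root value such as
// "cpuset cpu io memory hugetlb pids" becomes the following set.
var exampleRootControllers = sets.New(strings.Fields("cpuset cpu io memory hugetlb pids")...)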
// buildCgroupUnifiedPath builds a path to the specified name.
func (c *cgroupV2impl) buildCgroupUnifiedPath(name CgroupName) string {
cgroupFsAdaptedName := c.Name(name)
return path.Join(cmutil.CgroupRoot, cgroupFsAdaptedName)
}
// Convert cgroup v1 cpu.shares value to cgroup v2 cpu.weight
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
return uint64((((cpuShares - 2) * 9999) / 262142) + 1)
}
// Convert cgroup v2 cpu.weight value to cgroup v1 cpu.shares
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
return uint64((((cpuWeight - 1) * 262142) / 9999) + 2)
}
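// exampleWeightRoundTrip is an editor's sketch (not part of the upstream file): the two
// conversions are not exact inverses because of integer division, e.g. the default 1024
// shares become weight 39, which converts back to 998 shares.
func exampleWeightRoundTrip() {
weight := cpuSharesToCPUWeight(1024) // 39
shares := cpuWeightToCPUShares(weight) // 998
_, _ = weight, shares
}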

View File

@ -0,0 +1,283 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
//go:generate mockery
package cm
import (
"context"
"fmt"
"strconv"
"strings"
"time"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
// TODO: Migrate kubelet to either use its own internal objects or client library.
v1 "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/utils/cpuset"
)
const (
// Warning message for the users still using cgroup v1
CgroupV1MaintenanceModeWarning = "cgroup v1 support is in maintenance mode, please migrate to cgroup v2"
// Warning message for users running cgroup v2 on a kernel that doesn't support the root `cpu.stat` file.
// `cpu.stat` was added to root cgroup in kernel 5.8.
// (ref: https://github.com/torvalds/linux/commit/936f2a70f2077f64fab1dcb3eca71879e82ecd3f)
CgroupV2KernelWarning = "cgroup v2 is being used on a kernel which doesn't support root `cpu.stat`. " +
"Kubelet will continue, but may experience instability or wrong behavior"
)
type ActivePodsFunc func() []*v1.Pod
type GetNodeFunc func() (*v1.Node, error)
// Manages the containers running on a machine.
type ContainerManager interface {
// Runs the container manager's housekeeping.
// - Ensures that the Docker daemon is in a container.
// - Creates the system container where all non-containerized processes run.
Start(context.Context, *v1.Node, ActivePodsFunc, GetNodeFunc, config.SourcesReady, status.PodStatusProvider, internalapi.RuntimeService, bool) error
// SystemCgroupsLimit returns resources allocated to system cgroups in the machine.
// These cgroups include the system and Kubernetes services.
SystemCgroupsLimit() v1.ResourceList
// GetNodeConfig returns a NodeConfig that is being used by the container manager.
GetNodeConfig() NodeConfig
// Status returns internal Status.
Status() Status
// NewPodContainerManager is a factory method which returns a podContainerManager object.
// Returns a noop implementation if the qos cgroup hierarchy is not enabled.
NewPodContainerManager() PodContainerManager
// GetMountedSubsystems returns the mounted cgroup subsystems on the node
GetMountedSubsystems() *CgroupSubsystems
// GetQOSContainersInfo returns the names of top level QoS containers
GetQOSContainersInfo() QOSContainersInfo
// GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling.
GetNodeAllocatableReservation() v1.ResourceList
// GetCapacity returns the amount of compute resources tracked by container manager available on the node.
GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList
// GetDevicePluginResourceCapacity returns the node capacity (amount of total device plugin resources),
// node allocatable (amount of total healthy resources reported by device plugin),
// and inactive device plugin resources previously registered on the node.
GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string)
// UpdateQOSCgroups performs housekeeping updates to ensure that the top
// level QoS containers have their desired state in a thread-safe way
UpdateQOSCgroups() error
// GetResources returns RunContainerOptions with devices, mounts, and env fields populated for
// extended resources required by container.
GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error)
// UpdatePluginResources calls Allocate of the device plugin handler for potential
// requests for device plugin resources, and returns an error if it fails.
// Otherwise, it updates allocatableResource in nodeInfo if necessary,
// to make sure it is at least equal to the pod's requested capacity for
// any registered device plugin resource.
UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error
InternalContainerLifecycle() InternalContainerLifecycle
// GetPodCgroupRoot returns the cgroup which contains all pods.
GetPodCgroupRoot() string
// GetPluginRegistrationHandlers returns a set of plugin registration handlers.
// The pluginwatcher's Handlers allow a single module to handle all
// registration.
GetPluginRegistrationHandlers() map[string]cache.PluginHandler
// GetHealthCheckers returns a set of health checkers for all plugins.
// These checkers are integrated into the systemd watchdog to monitor the service's health.
GetHealthCheckers() []healthz.HealthChecker
// ShouldResetExtendedResourceCapacity returns whether or not the extended resources should be zeroed,
// due to node recreation.
ShouldResetExtendedResourceCapacity() bool
// GetAllocateResourcesPodAdmitHandler returns an instance of a PodAdmitHandler responsible for allocating pod resources.
GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
GetNodeAllocatableAbsolute() v1.ResourceList
// PrepareDynamicResources prepares dynamic pod resources
PrepareDynamicResources(context.Context, *v1.Pod) error
// UnprepareDynamicResources unprepares dynamic pod resources
UnprepareDynamicResources(context.Context, *v1.Pod) error
// PodMightNeedToUnprepareResources returns true if the pod with the given UID
// might need to unprepare resources.
PodMightNeedToUnprepareResources(UID types.UID) bool
// UpdateAllocatedResourcesStatus updates the status of allocated resources for the pod.
UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus)
// Updates returns a channel that receives an Update when a device changes its status.
Updates() <-chan resourceupdates.Update
// Implements the PodResources Provider API
podresources.CPUsProvider
podresources.DevicesProvider
podresources.MemoryProvider
podresources.DynamicResourcesProvider
}
type NodeConfig struct {
NodeName types.NodeName
RuntimeCgroupsName string
SystemCgroupsName string
KubeletCgroupsName string
KubeletOOMScoreAdj int32
ContainerRuntime string
CgroupsPerQOS bool
CgroupRoot string
CgroupDriver string
KubeletRootDir string
ProtectKernelDefaults bool
NodeAllocatableConfig
QOSReserved map[v1.ResourceName]int64
CPUManagerPolicy string
CPUManagerPolicyOptions map[string]string
TopologyManagerScope string
CPUManagerReconcilePeriod time.Duration
ExperimentalMemoryManagerPolicy string
ExperimentalMemoryManagerReservedMemory []kubeletconfig.MemoryReservation
PodPidsLimit int64
EnforceCPULimits bool
CPUCFSQuotaPeriod time.Duration
TopologyManagerPolicy string
TopologyManagerPolicyOptions map[string]string
CgroupVersion int
}
type NodeAllocatableConfig struct {
KubeReservedCgroupName string
SystemReservedCgroupName string
ReservedSystemCPUs cpuset.CPUSet
EnforceNodeAllocatable sets.Set[string]
KubeReserved v1.ResourceList
SystemReserved v1.ResourceList
HardEvictionThresholds []evictionapi.Threshold
}
type Status struct {
// Any soft requirements that were unsatisfied.
SoftRequirements error
}
func int64Slice(in []int) []int64 {
out := make([]int64, len(in))
for i := range in {
out[i] = int64(in[i])
}
return out
}
// parsePercentage parses the percentage string to a numeric value.
func parsePercentage(v string) (int64, error) {
if !strings.HasSuffix(v, "%") {
return 0, fmt.Errorf("percentage expected, got '%s'", v)
}
percentage, err := strconv.ParseInt(strings.TrimRight(v, "%"), 10, 0)
if err != nil {
return 0, fmt.Errorf("invalid number in percentage '%s'", v)
}
if percentage < 0 || percentage > 100 {
return 0, fmt.Errorf("percentage must be between 0 and 100")
}
return percentage, nil
}
// ParseQOSReserved parses the --qos-reserved option
func ParseQOSReserved(m map[string]string) (*map[v1.ResourceName]int64, error) {
reservations := make(map[v1.ResourceName]int64)
for k, v := range m {
switch v1.ResourceName(k) {
// Only memory resources are supported.
case v1.ResourceMemory:
q, err := parsePercentage(v)
if err != nil {
return nil, fmt.Errorf("failed to parse percentage %q for %q resource: %w", v, k, err)
}
reservations[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
}
return &reservations, nil
}
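// exampleParseQOSReserved is an editor's sketch (not part of the upstream file): only the
// memory resource is accepted, and the value must be a percentage between 0 and 100.
func exampleParseQOSReserved() {
reserved, err := ParseQOSReserved(map[string]string{"memory": "50%"})
if err != nil {
return
}
_ = (*reserved)[v1.ResourceMemory] // 50
}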
func containerDevicesFromResourceDeviceInstances(devs devicemanager.ResourceDeviceInstances) []*podresourcesapi.ContainerDevices {
var respDevs []*podresourcesapi.ContainerDevices
for resourceName, resourceDevs := range devs {
for devID, dev := range resourceDevs {
topo := dev.GetTopology()
if topo == nil {
// Some device plugins do not report topology information.
// This is legal, so we report the devices anyway and
// let the client decide what to do.
respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
ResourceName: resourceName,
DeviceIds: []string{devID},
})
continue
}
for _, node := range topo.GetNodes() {
respDevs = append(respDevs, &podresourcesapi.ContainerDevices{
ResourceName: resourceName,
DeviceIds: []string{devID},
Topology: &podresourcesapi.TopologyInfo{
Nodes: []*podresourcesapi.NUMANode{
{
ID: node.GetID(),
},
},
},
})
}
}
}
return respDevs
}
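// Editor's note (not part of the upstream file): a device whose topology spans two NUMA
// nodes is therefore reported as two ContainerDevices entries with the same ResourceName
// and DeviceIds but different TopologyInfo, i.e. one entry per (device, NUMA node) pair,
// while a device without topology is reported once with no TopologyInfo.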

File diff suppressed because it is too large

View File

@ -0,0 +1,211 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type containerManagerStub struct {
shouldResetExtendedResourceCapacity bool
extendedPluginResources v1.ResourceList
}
var _ ContainerManager = &containerManagerStub{}
func (cm *containerManagerStub) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
klog.V(2).InfoS("Starting stub container manager")
return nil
}
func (cm *containerManagerStub) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (cm *containerManagerStub) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func (cm *containerManagerStub) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (cm *containerManagerStub) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (cm *containerManagerStub) UpdateQOSCgroups() error {
return nil
}
func (cm *containerManagerStub) Status() Status {
return Status{}
}
func (cm *containerManagerStub) GetNodeAllocatableReservation() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
if !localStorageCapacityIsolation {
return v1.ResourceList{}
}
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(0),
resource.BinarySI),
}
return c
}
func (cm *containerManagerStub) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
return nil
}
func (cm *containerManagerStub) GetHealthCheckers() []healthz.HealthChecker {
return []healthz.HealthChecker{}
}
func (cm *containerManagerStub) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
return cm.extendedPluginResources, cm.extendedPluginResources, []string{}
}
func (m *podContainerManagerStub) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
return nil, fmt.Errorf("not implemented")
}
func (m *podContainerManagerStub) SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error {
return fmt.Errorf("not implemented")
}
func (cm *containerManagerStub) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerStub) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *containerManagerStub) UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error {
return nil
}
func (cm *containerManagerStub) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager(), memorymanager.NewFakeManager(), topologymanager.NewFakeManager()}
}
func (cm *containerManagerStub) GetPodCgroupRoot() string {
return ""
}
func (cm *containerManagerStub) GetDevices(_, _ string) []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerStub) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerStub) ShouldResetExtendedResourceCapacity() bool {
return cm.shouldResetExtendedResourceCapacity
}
func (cm *containerManagerStub) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
return topologymanager.NewFakeManager()
}
func (cm *containerManagerStub) UpdateAllocatedDevices() {
return
}
func (cm *containerManagerStub) GetCPUs(_, _ string) []int64 {
return nil
}
func (cm *containerManagerStub) GetAllocatableCPUs() []int64 {
return nil
}
func (cm *containerManagerStub) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerStub) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerStub) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *containerManagerStub) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}
func (cm *containerManagerStub) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerStub) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerStub) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *containerManagerStub) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
}
func (cm *containerManagerStub) Updates() <-chan resourceupdates.Update {
return nil
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: false}
}
func NewStubContainerManagerWithExtendedResource(shouldResetExtendedResourceCapacity bool) ContainerManager {
return &containerManagerStub{shouldResetExtendedResourceCapacity: shouldResetExtendedResourceCapacity}
}
func NewStubContainerManagerWithDevicePluginResource(extendedPluginResources v1.ResourceList) ContainerManager {
return &containerManagerStub{
shouldResetExtendedResourceCapacity: false,
extendedPluginResources: extendedPluginResources,
}
}

View File

@ -0,0 +1,49 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"fmt"
"k8s.io/mount-utils"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/cri-api/pkg/apis"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
type unsupportedContainerManager struct {
containerManagerStub
}
var _ ContainerManager = &unsupportedContainerManager{}
func (unsupportedContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
return fmt.Errorf("Container Manager is unsupported in this build")
}
func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
return &unsupportedContainerManager{}, nil
}

View File

@ -0,0 +1,371 @@
//go:build windows
// +build windows
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// containerManagerImpl implements the container manager on Windows.
// Only GetNodeAllocatableReservation() and GetCapacity() are implemented now.
package cm
import (
"context"
"fmt"
"sync"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/klog/v2"
"k8s.io/mount-utils"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
internalapi "k8s.io/cri-api/pkg/apis"
pluginwatcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cadvisor"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type containerManagerImpl struct {
// Capacity of this node.
capacity v1.ResourceList
// Interface for cadvisor.
cadvisorInterface cadvisor.Interface
// Config of this node.
nodeConfig NodeConfig
// Interface for exporting and allocating devices reported by device plugins.
deviceManager devicemanager.Manager
// Interface for topology resource coordination
topologyManager topologymanager.Manager
cpuManager cpumanager.Manager
memoryManager memorymanager.Manager
nodeInfo *v1.Node
sync.RWMutex
}
func (cm *containerManagerImpl) Start(ctx context.Context, node *v1.Node,
activePods ActivePodsFunc,
getNode GetNodeFunc,
sourcesReady config.SourcesReady,
podStatusProvider status.PodStatusProvider,
runtimeService internalapi.RuntimeService,
localStorageCapacityIsolation bool) error {
klog.V(2).InfoS("Starting Windows container manager")
cm.nodeInfo = node
if localStorageCapacityIsolation {
rootfs, err := cm.cadvisorInterface.RootFsInfo()
if err != nil {
return fmt.Errorf("failed to get rootfs info: %v", err)
}
for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
cm.capacity[rName] = rCap
}
}
containerMap, containerRunningSet := buildContainerMapAndRunningSetFromRuntime(ctx, runtimeService)
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
err := cm.cpuManager.Start(cpumanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap.Clone())
if err != nil {
return fmt.Errorf("start cpu manager error: %v", err)
}
// Initialize memory manager
err = cm.memoryManager.Start(memorymanager.ActivePodsFunc(activePods), sourcesReady, podStatusProvider, runtimeService, containerMap.Clone())
if err != nil {
return fmt.Errorf("start memory manager error: %v", err)
}
}
// Starts device manager.
if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady, containerMap.Clone(), containerRunningSet); err != nil {
return err
}
return nil
}
// NewContainerManager creates windows container manager.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig, failSwapOn bool, recorder record.EventRecorder, kubeClient clientset.Interface) (ContainerManager, error) {
// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
// machine info is computed and cached once as part of cAdvisor object creation.
// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment, so they will be called later, when the manager starts.
machineInfo, err := cadvisorInterface.MachineInfo()
if err != nil {
return nil, err
}
capacity := cadvisor.CapacityFromMachineInfo(machineInfo)
cm := &containerManagerImpl{
capacity: capacity,
nodeConfig: nodeConfig,
cadvisorInterface: cadvisorInterface,
}
cm.topologyManager = topologymanager.NewFakeManager()
cm.cpuManager = cpumanager.NewFakeManager()
cm.memoryManager = memorymanager.NewFakeManager()
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
klog.InfoS("Creating topology manager")
cm.topologyManager, err = topologymanager.NewManager(machineInfo.Topology,
nodeConfig.TopologyManagerPolicy,
nodeConfig.TopologyManagerScope,
nodeConfig.TopologyManagerPolicyOptions)
if err != nil {
klog.ErrorS(err, "Failed to initialize topology manager")
return nil, err
}
klog.InfoS("Creating cpu manager")
cm.cpuManager, err = cpumanager.NewManager(
nodeConfig.CPUManagerPolicy,
nodeConfig.CPUManagerPolicyOptions,
nodeConfig.CPUManagerReconcilePeriod,
machineInfo,
nodeConfig.NodeAllocatableConfig.ReservedSystemCPUs,
cm.GetNodeAllocatableReservation(),
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize cpu manager")
return nil, err
}
cm.topologyManager.AddHintProvider(cm.cpuManager)
klog.InfoS("Creating memory manager")
cm.memoryManager, err = memorymanager.NewManager(
nodeConfig.ExperimentalMemoryManagerPolicy,
machineInfo,
cm.GetNodeAllocatableReservation(),
nodeConfig.ExperimentalMemoryManagerReservedMemory,
nodeConfig.KubeletRootDir,
cm.topologyManager,
)
if err != nil {
klog.ErrorS(err, "Failed to initialize memory manager")
return nil, err
}
cm.topologyManager.AddHintProvider(cm.memoryManager)
}
klog.InfoS("Creating device plugin manager")
cm.deviceManager, err = devicemanager.NewManagerImpl(nil, cm.topologyManager)
if err != nil {
return nil, err
}
cm.topologyManager.AddHintProvider(cm.deviceManager)
return cm, nil
}
func (cm *containerManagerImpl) SystemCgroupsLimit() v1.ResourceList {
return v1.ResourceList{}
}
func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
cm.RLock()
defer cm.RUnlock()
return cm.nodeConfig
}
func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems {
return &CgroupSubsystems{}
}
func (cm *containerManagerImpl) GetQOSContainersInfo() QOSContainersInfo {
return QOSContainersInfo{}
}
func (cm *containerManagerImpl) UpdateQOSCgroups() error {
return nil
}
func (cm *containerManagerImpl) Status() Status {
return Status{}
}
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.nodeConfig.HardEvictionThresholds, cm.capacity)
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
if cm.nodeConfig.SystemReserved != nil {
value.Add(cm.nodeConfig.SystemReserved[k])
}
if cm.nodeConfig.KubeReserved != nil {
value.Add(cm.nodeConfig.KubeReserved[k])
}
if evictionReservation != nil {
value.Add(evictionReservation[k])
}
if !value.IsZero() {
result[k] = *value
}
}
return result
}
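// Editor's example (not part of the upstream file), with hypothetical values: given
// system-reserved memory=1Gi, kube-reserved memory=1Gi and a hard eviction threshold of
// memory.available<100Mi, the memory entry in the returned reservation is
// 1Gi + 1Gi + 100Mi = 2148Mi.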
func (cm *containerManagerImpl) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
return cm.capacity
}
func (cm *containerManagerImpl) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
// DRA is not supported on Windows, only device plugin is supported
return map[string]cache.PluginHandler{pluginwatcherapi.DevicePlugin: cm.deviceManager.GetWatcherHandler()}
}
func (cm *containerManagerImpl) GetHealthCheckers() []healthz.HealthChecker {
return []healthz.HealthChecker{cm.deviceManager.GetHealthChecker()}
}
func (cm *containerManagerImpl) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
return cm.deviceManager.GetCapacity()
}
func (cm *containerManagerImpl) NewPodContainerManager() PodContainerManager {
return &podContainerManagerStub{}
}
func (cm *containerManagerImpl) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
opts := &kubecontainer.RunContainerOptions{}
// Allocate should already have been called during predicateAdmitHandler.Admit();
// here we just try to fetch device runtime information from the cached state.
devOpts, err := cm.deviceManager.GetDeviceRunContainerOptions(pod, container)
if err != nil {
return nil, err
} else if devOpts == nil {
return opts, nil
}
opts.Devices = append(opts.Devices, devOpts.Devices...)
opts.Mounts = append(opts.Mounts, devOpts.Mounts...)
opts.Envs = append(opts.Envs, devOpts.Envs...)
opts.Annotations = append(opts.Annotations, devOpts.Annotations...)
return opts, nil
}
func (cm *containerManagerImpl) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
// For now we only support Device Plugin
cm.deviceManager.UpdateAllocatedResourcesStatus(pod, status)
// TODO(SergeyKanzhelev, https://kep.k8s.io/4680): add support for DRA resources when DRA supports Windows
}
func (cm *containerManagerImpl) Updates() <-chan resourceupdates.Update {
// TODO(SergeyKanzhelev, https://kep.k8s.io/4680): add support for DRA resources, for now only use device plugin updates
return cm.deviceManager.Updates()
}
func (cm *containerManagerImpl) UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error {
return cm.deviceManager.UpdatePluginResources(node, attrs)
}
func (cm *containerManagerImpl) InternalContainerLifecycle() InternalContainerLifecycle {
return &internalContainerLifecycleImpl{cm.cpuManager, cm.memoryManager, cm.topologyManager}
}
func (cm *containerManagerImpl) GetPodCgroupRoot() string {
return ""
}
func (cm *containerManagerImpl) GetDevices(podUID, containerName string) []*podresourcesapi.ContainerDevices {
return containerDevicesFromResourceDeviceInstances(cm.deviceManager.GetDevices(podUID, containerName))
}
func (cm *containerManagerImpl) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
return nil
}
func (cm *containerManagerImpl) ShouldResetExtendedResourceCapacity() bool {
return cm.deviceManager.ShouldResetExtendedResourceCapacity()
}
func (cm *containerManagerImpl) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
return cm.topologyManager
}
func (cm *containerManagerImpl) UpdateAllocatedDevices() {
return
}
func (cm *containerManagerImpl) GetCPUs(podUID, containerName string) []int64 {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
if cm.cpuManager != nil {
return int64Slice(cm.cpuManager.GetExclusiveCPUs(podUID, containerName).UnsortedList())
}
return []int64{}
}
return nil
}
func (cm *containerManagerImpl) GetAllocatableCPUs() []int64 {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
if cm.cpuManager != nil {
return int64Slice(cm.cpuManager.GetAllocatableCPUs().UnsortedList())
}
return []int64{}
}
return nil
}
func (cm *containerManagerImpl) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerImpl) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
return nil
}
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return nil
}
func (cm *containerManagerImpl) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *containerManagerImpl) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerImpl) UnprepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *containerManagerImpl) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}

View File

@ -0,0 +1,90 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package containermap
import (
"fmt"
)
// cmItem (ContainerMap ITEM) is a (podUID, containerName) pair
type cmItem struct {
podUID string
containerName string
}
// ContainerMap maps (containerID)->(podUID, containerName)
type ContainerMap map[string]cmItem
// NewContainerMap creates a new ContainerMap struct
func NewContainerMap() ContainerMap {
return make(ContainerMap)
}
// Clone creates a deep copy of the ContainerMap
func (cm ContainerMap) Clone() ContainerMap {
ret := make(ContainerMap, len(cm))
for key, val := range cm {
ret[key] = val
}
return ret
}
// Add adds a mapping of (containerID)->(podUID, containerName) to the ContainerMap
func (cm ContainerMap) Add(podUID, containerName, containerID string) {
cm[containerID] = cmItem{
podUID: podUID,
containerName: containerName,
}
}
// RemoveByContainerID removes a mapping of (containerID)->(podUID, containerName) from the ContainerMap
func (cm ContainerMap) RemoveByContainerID(containerID string) {
delete(cm, containerID)
}
// RemoveByContainerRef removes a mapping of (containerID)->(podUID, containerName) from the ContainerMap
func (cm ContainerMap) RemoveByContainerRef(podUID, containerName string) {
containerID, err := cm.GetContainerID(podUID, containerName)
if err == nil {
cm.RemoveByContainerID(containerID)
}
}
// GetContainerID retrieves a ContainerID from the ContainerMap
func (cm ContainerMap) GetContainerID(podUID, containerName string) (string, error) {
for key, val := range cm {
if val.podUID == podUID && val.containerName == containerName {
return key, nil
}
}
return "", fmt.Errorf("container %s not in ContainerMap for pod %s", containerName, podUID)
}
// GetContainerRef retrieves a (podUID, containerName) pair from the ContainerMap
func (cm ContainerMap) GetContainerRef(containerID string) (string, string, error) {
if _, exists := cm[containerID]; !exists {
return "", "", fmt.Errorf("containerID %s not in ContainerMap", containerID)
}
return cm[containerID].podUID, cm[containerID].containerName, nil
}
// Visit invokes the visitor function on all of the entries in the container map
func (cm ContainerMap) Visit(visitor func(podUID, containerName, containerID string)) {
for k, v := range cm {
visitor(v.podUID, v.containerName, k)
}
}
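// exampleContainerMapUsage is an editor's sketch (not part of the upstream file) of the
// basic flow: register a container, resolve it in both directions, then drop it. The IDs
// below are hypothetical.
func exampleContainerMapUsage() {
cm := NewContainerMap()
cm.Add("pod-uid-1", "app", "containerd://abc123")
id, _ := cm.GetContainerID("pod-uid-1", "app") // "containerd://abc123"
podUID, name, _ := cm.GetContainerRef(id) // "pod-uid-1", "app"
_, _ = podUID, name
cm.RemoveByContainerID(id)
}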

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- derekwaynecarr
reviewers:
- klueska
emeritus_approvers:
- balajismaniam
- ConnorDoyle
- vishh

File diff suppressed because it is too large

View File

@ -0,0 +1,525 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
"fmt"
"math"
"sync"
"time"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/utils/cpuset"
)
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
type runtimeService interface {
UpdateContainerResources(ctx context.Context, id string, resources *runtimeapi.ContainerResources) error
}
type policyName string
// cpuManagerStateFileName is the file name where cpu manager stores its state
const cpuManagerStateFileName = "cpu_manager_state"
// Manager interface provides methods for Kubelet to manage pod cpus.
type Manager interface {
// Start is called during Kubelet initialization.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error
// Called to trigger the allocation of CPUs to a container. This must be
// called at some point prior to the AddContainer() call for a container,
// e.g. at pod admission time.
Allocate(pod *v1.Pod, container *v1.Container) error
// AddContainer adds the mapping from container ID to pod UID and container name.
// The mapping is used to remove the CPU allocation when the container is removed.
AddContainer(p *v1.Pod, c *v1.Container, containerID string)
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, the CPU manager stops trying to reconcile
// that container and any CPUs dedicated to the container are freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal CPU manager state.
State() state.Reader
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(*v1.Pod, *v1.Container) map[string][]topologymanager.TopologyHint
// GetExclusiveCPUs implements the podresources.CPUsProvider interface to provide
// exclusively allocated cpus for the container
GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment per Pod
// among this and other resource controllers.
GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
GetAllocatableCPUs() cpuset.CPUSet
// GetCPUAffinity returns cpuset which includes cpus from shared pools
// as well as exclusively allocated cpus
GetCPUAffinity(podUID, containerName string) cpuset.CPUSet
// GetAllCPUs returns all the CPUs known by cpumanager, as reported by the
// hardware discovery. Maps to the CPU capacity.
GetAllCPUs() cpuset.CPUSet
}
type manager struct {
sync.Mutex
policy Policy
// reconcilePeriod is the duration between calls to reconcileState.
reconcilePeriod time.Duration
// state allows pluggable CPU assignment policies while sharing a common
// representation of state for the system to inspect and reconcile.
state state.State
// lastUpdateState holds the state for each container from the last time it was updated.
lastUpdateState state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated in the reconciliation loop.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
// containerMap provides a mapping from (pod, container) -> containerID
// for all containers in a pod.
containerMap containermap.ContainerMap
topology *topology.CPUTopology
nodeAllocatableReservation v1.ResourceList
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// stateFileDirectory holds the directory where the state file for checkpoints is held.
stateFileDirectory string
// allCPUs is the set of online CPUs as reported by the system
allCPUs cpuset.CPUSet
// allocatableCPUs is the set of online CPUs as reported by the system,
// and available for allocation, minus the reserved set
allocatableCPUs cpuset.CPUSet
}
var _ Manager = &manager{}
type sourcesReadyStub struct{}
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// NewManager creates a new cpu manager based on the provided policy
func NewManager(cpuPolicyName string, cpuPolicyOptions map[string]string, reconcilePeriod time.Duration, machineInfo *cadvisorapi.MachineInfo, specificCPUs cpuset.CPUSet, nodeAllocatableReservation v1.ResourceList, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
var topo *topology.CPUTopology
var policy Policy
var err error
topo, err = topology.Discover(machineInfo)
if err != nil {
return nil, err
}
switch policyName(cpuPolicyName) {
case PolicyNone:
policy, err = NewNonePolicy(cpuPolicyOptions)
if err != nil {
return nil, fmt.Errorf("new none policy error: %w", err)
}
case PolicyStatic:
klog.InfoS("Detected CPU topology", "topology", topo)
reservedCPUs, ok := nodeAllocatableReservation[v1.ResourceCPU]
if !ok {
// The static policy cannot initialize without this information.
return nil, fmt.Errorf("[cpumanager] unable to determine reserved CPU resources for static policy")
}
if reservedCPUs.IsZero() {
// The static policy requires this to be nonzero. Zero CPU reservation
// would allow the shared pool to be completely exhausted. At that point
// either we would violate our guarantee of exclusivity or need to evict
// any pod that has at least one container that requires zero CPUs.
// See the comments in policy_static.go for more details.
return nil, fmt.Errorf("[cpumanager] the static policy requires systemreserved.cpu + kubereserved.cpu to be greater than zero")
}
// Take the ceiling of the reservation, since fractional CPUs cannot be
// exclusively allocated.
reservedCPUsFloat := float64(reservedCPUs.MilliValue()) / 1000
numReservedCPUs := int(math.Ceil(reservedCPUsFloat))
policy, err = NewStaticPolicy(topo, numReservedCPUs, specificCPUs, affinity, cpuPolicyOptions)
if err != nil {
return nil, fmt.Errorf("new static policy error: %w", err)
}
default:
return nil, fmt.Errorf("unknown policy: \"%s\"", cpuPolicyName)
}
manager := &manager{
policy: policy,
reconcilePeriod: reconcilePeriod,
lastUpdateState: state.NewMemoryState(),
topology: topo,
nodeAllocatableReservation: nodeAllocatableReservation,
stateFileDirectory: stateFileDirectory,
allCPUs: topo.CPUDetails.CPUs(),
}
manager.sourcesReady = &sourcesReadyStub{}
return manager, nil
}
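// exampleReservedCPUCount is an illustrative sketch (not part of upstream) of the ceiling
// computation performed above: an assumed fractional reservation such as
// systemreserved.cpu + kubereserved.cpu = 1500m is rounded up to 2 reserved CPUs,
// because fractional CPUs cannot be exclusively allocated.
func exampleReservedCPUCount() int {
	reservedMilliCPU := int64(1500) // hypothetical reservation, in millicores
	return int(math.Ceil(float64(reservedMilliCPU) / 1000)) // -> 2
}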
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Starting CPU manager", "policy", m.policy.Name())
klog.InfoS("Reconciling", "reconcilePeriod", m.reconcilePeriod)
m.sourcesReady = sourcesReady
m.activePods = activePods
m.podStatusProvider = podStatusProvider
m.containerRuntime = containerRuntime
m.containerMap = initialContainers
stateImpl, err := state.NewCheckpointState(m.stateFileDirectory, cpuManagerStateFileName, m.policy.Name(), m.containerMap)
if err != nil {
klog.ErrorS(err, "Could not initialize checkpoint manager, please drain node and remove policy state file")
return err
}
m.state = stateImpl
err = m.policy.Start(m.state)
if err != nil {
klog.ErrorS(err, "Policy start error")
return err
}
m.allocatableCPUs = m.policy.GetAllocatableCPUs(m.state)
if m.policy.Name() == string(PolicyNone) {
return nil
}
// Periodically call m.reconcileState() to keep the CPU sets of all pods in sync
// with the guaranteed CPUs handed out among them.
go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
return nil
}
func (m *manager) Allocate(p *v1.Pod, c *v1.Container) error {
// Garbage collect any stranded resources before allocating CPUs.
m.removeStaleState()
m.Lock()
defer m.Unlock()
// Call down into the policy to assign this container CPUs if required.
err := m.policy.Allocate(m.state, p, c)
if err != nil {
klog.ErrorS(err, "Allocate error")
return err
}
return nil
}
func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
m.Lock()
defer m.Unlock()
if cset, exists := m.state.GetCPUSet(string(pod.UID), container.Name); exists {
m.lastUpdateState.SetCPUSet(string(pod.UID), container.Name, cset)
}
m.containerMap.Add(string(pod.UID), container.Name, containerID)
}
func (m *manager) RemoveContainer(containerID string) error {
m.Lock()
defer m.Unlock()
err := m.policyRemoveContainerByID(containerID)
if err != nil {
klog.ErrorS(err, "RemoveContainer error")
return err
}
return nil
}
func (m *manager) policyRemoveContainerByID(containerID string) error {
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
return nil
}
err = m.policy.RemoveContainer(m.state, podUID, containerName)
if err == nil {
m.lastUpdateState.Delete(podUID, containerName)
m.containerMap.RemoveByContainerID(containerID)
}
return err
}
func (m *manager) policyRemoveContainerByRef(podUID string, containerName string) error {
err := m.policy.RemoveContainer(m.state, podUID, containerName)
if err == nil {
m.lastUpdateState.Delete(podUID, containerName)
m.containerMap.RemoveByContainerRef(podUID, containerName)
}
return err
}
func (m *manager) State() state.Reader {
return m.state
}
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetTopologyHints(m.state, pod, container)
}
func (m *manager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetPodTopologyHints(m.state, pod)
}
func (m *manager) GetAllocatableCPUs() cpuset.CPUSet {
return m.allocatableCPUs.Clone()
}
func (m *manager) GetAllCPUs() cpuset.CPUSet {
return m.allCPUs.Clone()
}
type reconciledContainer struct {
podName string
containerName string
containerID string
}
func (m *manager) removeStaleState() {
// Only once all sources are ready do we attempt to remove any stale state.
// This ensures that the call to `m.activePods()` below will succeed with
// the actual active pods list.
if !m.sourcesReady.AllReady() {
return
}
// We grab the lock to ensure that no new containers will grab CPUs while
// executing the code below. Without this lock, it's possible that we end up
// removing state that is newly added by an asynchronous call to
// AddContainer() during the execution of this code.
m.Lock()
defer m.Unlock()
// Get the list of active pods.
activePods := m.activePods()
// Build a list of (podUID, containerName) pairs for all containers in all active Pods.
activeContainers := make(map[string]map[string]struct{})
for _, pod := range activePods {
activeContainers[string(pod.UID)] = make(map[string]struct{})
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
activeContainers[string(pod.UID)][container.Name] = struct{}{}
}
}
// Loop through the CPUManager state. Remove any state for containers not
// in the `activeContainers` list built above.
assignments := m.state.GetCPUAssignments()
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; ok {
klog.V(5).InfoS("RemoveStaleState: container still active", "podUID", podUID, "containerName", containerName)
continue
}
klog.V(2).InfoS("RemoveStaleState: removing container", "podUID", podUID, "containerName", containerName)
err := m.policyRemoveContainerByRef(podUID, containerName)
if err != nil {
klog.ErrorS(err, "RemoveStaleState: failed to remove container", "podUID", podUID, "containerName", containerName)
}
}
}
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; ok {
klog.V(5).InfoS("RemoveStaleState: containerMap: container still active", "podUID", podUID, "containerName", containerName)
return
}
klog.V(2).InfoS("RemoveStaleState: containerMap: removing container", "podUID", podUID, "containerName", containerName)
err := m.policyRemoveContainerByRef(podUID, containerName)
if err != nil {
klog.ErrorS(err, "RemoveStaleState: containerMap: failed to remove container", "podUID", podUID, "containerName", containerName)
}
})
}
func (m *manager) reconcileState() (success []reconciledContainer, failure []reconciledContainer) {
ctx := context.Background()
success = []reconciledContainer{}
failure = []reconciledContainer{}
m.removeStaleState()
for _, pod := range m.activePods() {
pstatus, ok := m.podStatusProvider.GetPodStatus(pod.UID)
if !ok {
klog.V(5).InfoS("ReconcileState: skipping pod; status not found", "pod", klog.KObj(pod))
failure = append(failure, reconciledContainer{pod.Name, "", ""})
continue
}
allContainers := pod.Spec.InitContainers
allContainers = append(allContainers, pod.Spec.Containers...)
for _, container := range allContainers {
containerID, err := findContainerIDByName(&pstatus, container.Name)
if err != nil {
klog.V(5).InfoS("ReconcileState: skipping container; ID not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
cstatus, err := findContainerStatusByName(&pstatus, container.Name)
if err != nil {
klog.V(5).InfoS("ReconcileState: skipping container; container status not found in pod status", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
if cstatus.State.Waiting != nil ||
(cstatus.State.Waiting == nil && cstatus.State.Running == nil && cstatus.State.Terminated == nil) {
klog.V(4).InfoS("ReconcileState: skipping container; container still in the waiting state", "pod", klog.KObj(pod), "containerName", container.Name, "err", err)
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
continue
}
m.Lock()
if cstatus.State.Terminated != nil {
// The container is terminated but we can't call m.RemoveContainer()
// here because it could remove the allocated cpuset for the container
// which may be in the process of being restarted. That would result
// in the container losing any exclusively-allocated CPUs that it
// was allocated.
_, _, err := m.containerMap.GetContainerRef(containerID)
if err == nil {
klog.V(4).InfoS("ReconcileState: ignoring terminated container", "pod", klog.KObj(pod), "containerID", containerID)
}
m.Unlock()
continue
}
// Once we make it here we know we have a running container.
// Idempotently add it to the containerMap in case it is missing.
// This can happen after a kubelet restart, for example.
m.containerMap.Add(string(pod.UID), container.Name, containerID)
m.Unlock()
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
if cset.IsEmpty() {
// NOTE: This should not happen outside of tests.
klog.V(2).InfoS("ReconcileState: skipping container; assigned cpuset is empty", "pod", klog.KObj(pod), "containerName", container.Name)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
lcset := m.lastUpdateState.GetCPUSetOrDefault(string(pod.UID), container.Name)
if !cset.Equals(lcset) {
klog.V(5).InfoS("ReconcileState: updating container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset)
err = m.updateContainerCPUSet(ctx, containerID, cset)
if err != nil {
klog.ErrorS(err, "ReconcileState: failed to update container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID, "cpuSet", cset)
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
continue
}
m.lastUpdateState.SetCPUSet(string(pod.UID), container.Name, cset)
}
success = append(success, reconciledContainer{pod.Name, container.Name, containerID})
}
}
return success, failure
}
func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
allStatuses := status.InitContainerStatuses
allStatuses = append(allStatuses, status.ContainerStatuses...)
for _, container := range allStatuses {
if container.Name == name && container.ContainerID != "" {
cid := &kubecontainer.ContainerID{}
err := cid.ParseString(container.ContainerID)
if err != nil {
return "", err
}
return cid.ID, nil
}
}
return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
}
func findContainerStatusByName(status *v1.PodStatus, name string) (*v1.ContainerStatus, error) {
for _, containerStatus := range append(status.InitContainerStatuses, status.ContainerStatuses...) {
if containerStatus.Name == name {
return &containerStatus, nil
}
}
return nil, fmt.Errorf("unable to find status for container with name %v in pod status (it may not be running)", name)
}
func (m *manager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
if result, ok := m.state.GetCPUSet(podUID, containerName); ok {
return result
}
return cpuset.CPUSet{}
}
func (m *manager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet {
return m.state.GetCPUSetOrDefault(podUID, containerName)
}

View File

@ -0,0 +1,43 @@
//go:build !windows
// +build !windows
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/utils/cpuset"
)
func (m *manager) updateContainerCPUSet(ctx context.Context, containerID string, cpus cpuset.CPUSet) error {
// TODO: Consider adding a `ResourceConfigForContainer` helper in
// helpers_linux.go similar to what exists for pods.
// It would be better to pass the full container resources here instead of
// this patch-like partial resources.
return m.containerRuntime.UpdateContainerResources(
ctx,
containerID,
&runtimeapi.ContainerResources{
Linux: &runtimeapi.LinuxContainerResources{
CpusetCpus: cpus.String(),
},
})
}
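// Illustrative note (not part of upstream): for an assumed assignment of CPUs 1, 2 and 5,
// cpus.String() renders the canonical range list "1-2,5", which is the value written into
// the container's Linux cpuset via UpdateContainerResources above.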

View File

@ -0,0 +1,49 @@
//go:build windows
// +build windows
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"context"
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/winstats"
"k8s.io/utils/cpuset"
)
func (m *manager) updateContainerCPUSet(ctx context.Context, containerID string, cpus cpuset.CPUSet) error {
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
return nil
}
affinities := winstats.CpusToGroupAffinity(cpus.List())
var cpuGroupAffinities []*runtimeapi.WindowsCpuGroupAffinity
for _, affinity := range affinities {
cpuGroupAffinities = append(cpuGroupAffinities, &runtimeapi.WindowsCpuGroupAffinity{
CpuGroup: uint32(affinity.Group),
CpuMask: uint64(affinity.Mask),
})
}
return m.containerRuntime.UpdateContainerResources(ctx, containerID, &runtimeapi.ContainerResources{
Windows: &runtimeapi.WindowsContainerResources{
AffinityCpus: cpuGroupAffinities,
},
})
}

View File

@ -0,0 +1,98 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/utils/cpuset"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Start()")
return nil
}
func (m *fakeManager) Policy() Policy {
klog.InfoS("Policy()")
pol, _ := NewNonePolicy(nil)
return pol
}
func (m *fakeManager) Allocate(pod *v1.Pod, container *v1.Container) error {
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
klog.InfoS("AddContainer", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID)
}
func (m *fakeManager) RemoveContainer(containerID string) error {
klog.InfoS("RemoveContainer", "containerID", containerID)
return nil
}
func (m *fakeManager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get container topology hints")
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get pod topology hints")
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) State() state.Reader {
return m.state
}
func (m *fakeManager) GetExclusiveCPUs(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetExclusiveCPUs", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
}
func (m *fakeManager) GetAllocatableCPUs() cpuset.CPUSet {
klog.InfoS("Get Allocatable CPUs")
return cpuset.CPUSet{}
}
func (m *fakeManager) GetCPUAffinity(podUID, containerName string) cpuset.CPUSet {
klog.InfoS("GetCPUAffinity", "podUID", podUID, "containerName", containerName)
return cpuset.CPUSet{}
}
func (m *fakeManager) GetAllCPUs() cpuset.CPUSet {
klog.InfoS("GetAllCPUs")
return cpuset.CPUSet{}
}
// NewFakeManager creates an empty/fake cpu manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}

View File

@ -0,0 +1,45 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/utils/cpuset"
)
// Policy implements the logic for assigning CPUs to pod containers.
type Policy interface {
Name() string
Start(s state.State) error
// Allocate call is idempotent
Allocate(s state.State, pod *v1.Pod, container *v1.Container) error
// RemoveContainer call is idempotent
RemoveContainer(s state.State, podUID string, containerName string) error
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment per Pod
// among this and other resource controllers.
GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
GetAllocatableCPUs(m state.State) cpuset.CPUSet
}

View File

@ -0,0 +1,76 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"k8s.io/api/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/utils/cpuset"
)
type nonePolicy struct{}
var _ Policy = &nonePolicy{}
// PolicyNone name of none policy
const PolicyNone policyName = "none"
// NewNonePolicy returns a cpuset manager policy that does nothing
func NewNonePolicy(cpuPolicyOptions map[string]string) (Policy, error) {
if len(cpuPolicyOptions) > 0 {
return nil, fmt.Errorf("None policy: received unsupported options=%v", cpuPolicyOptions)
}
return &nonePolicy{}, nil
}
func (p *nonePolicy) Name() string {
return string(PolicyNone)
}
func (p *nonePolicy) Start(s state.State) error {
klog.InfoS("None policy: Start")
return nil
}
func (p *nonePolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
return nil
}
func (p *nonePolicy) RemoveContainer(s state.State, podUID string, containerName string) error {
return nil
}
func (p *nonePolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return nil
}
func (p *nonePolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return nil
}
// Assignable CPUs are the ones that can be exclusively allocated to pods that meet the exclusivity requirement
// (i.e. guaranteed QoS class and integral CPU request).
// Assignability of CPUs as a concept only applies to the static policy, i.e. scenarios where workloads
// CAN get exclusive access to core(s).
// Hence, we return an empty set here: no CPUs are assignable according to the above definition under this policy.
func (p *nonePolicy) GetAllocatableCPUs(m state.State) cpuset.CPUSet {
return cpuset.New()
}

View File

@ -0,0 +1,185 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
"strconv"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// Names of the options, as part of the user interface.
const (
FullPCPUsOnlyOption string = "full-pcpus-only"
DistributeCPUsAcrossNUMAOption string = "distribute-cpus-across-numa"
AlignBySocketOption string = "align-by-socket"
DistributeCPUsAcrossCoresOption string = "distribute-cpus-across-cores"
StrictCPUReservationOption string = "strict-cpu-reservation"
PreferAlignByUnCoreCacheOption string = "prefer-align-cpus-by-uncorecache"
)
var (
alphaOptions = sets.New[string](
DistributeCPUsAcrossNUMAOption,
AlignBySocketOption,
DistributeCPUsAcrossCoresOption,
StrictCPUReservationOption,
PreferAlignByUnCoreCacheOption,
)
betaOptions = sets.New[string](
FullPCPUsOnlyOption,
)
stableOptions = sets.New[string]()
)
// CheckPolicyOptionAvailable verifies if the given option can be used depending on the feature gate settings.
// It returns nil on success, or an error describing the failure.
func CheckPolicyOptionAvailable(option string) error {
if !alphaOptions.Has(option) && !betaOptions.Has(option) && !stableOptions.Has(option) {
return fmt.Errorf("unknown CPU Manager Policy option: %q", option)
}
if alphaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManagerPolicyAlphaOptions) {
return fmt.Errorf("CPU Manager Policy Alpha-level Options not enabled, but option %q provided", option)
}
if betaOptions.Has(option) && !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUManagerPolicyBetaOptions) {
return fmt.Errorf("CPU Manager Policy Beta-level Options not enabled, but option %q provided", option)
}
return nil
}
// StaticPolicyOptions holds the parsed value of the policy options, ready to be consumed internally.
type StaticPolicyOptions struct {
// flag to enable extra allocation restrictions to avoid
// different containers possibly ending up on the same physical core.
// We consider "core" and "physical CPU" synonyms here, leaning
// towards the terminology used in the k8s docs. We acknowledge this is confusing.
//
// looking at https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/,
// any possible naming scheme will lead to ambiguity to some extent.
// We picked "pcpu" because the established docs already hint at vCPU.
FullPhysicalCPUsOnly bool
// Flag to evenly distribute CPUs across NUMA nodes in cases where more
// than one NUMA node is required to satisfy the allocation.
DistributeCPUsAcrossNUMA bool
// Flag to ensure CPUs are considered aligned at socket boundary rather than
// NUMA boundary
AlignBySocket bool
// flag to enable extra allocation restrictions to spread
// cpus (hyper-threads) across different physical cores.
// This is a preference, so no error is thrown if the CPUs have to be packed onto one physical core.
DistributeCPUsAcrossCores bool
// Flag to remove reserved cores from the list of available cores
StrictCPUReservation bool
// Flag that makes a best effort to align CPUs to an uncore cache boundary.
// As long as there are CPUs available, pods will still be admitted if the condition is not met.
PreferAlignByUncoreCacheOption bool
}
// NewStaticPolicyOptions creates a StaticPolicyOptions struct from the user configuration.
func NewStaticPolicyOptions(policyOptions map[string]string) (StaticPolicyOptions, error) {
opts := StaticPolicyOptions{}
for name, value := range policyOptions {
if err := CheckPolicyOptionAvailable(name); err != nil {
return opts, err
}
switch name {
case FullPCPUsOnlyOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.FullPhysicalCPUsOnly = optValue
case DistributeCPUsAcrossNUMAOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.DistributeCPUsAcrossNUMA = optValue
case AlignBySocketOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.AlignBySocket = optValue
case DistributeCPUsAcrossCoresOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.DistributeCPUsAcrossCores = optValue
case StrictCPUReservationOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.StrictCPUReservation = optValue
case PreferAlignByUnCoreCacheOption:
optValue, err := strconv.ParseBool(value)
if err != nil {
return opts, fmt.Errorf("bad value for option %q: %w", name, err)
}
opts.PreferAlignByUncoreCacheOption = optValue
default:
// this should never be reached, we already detect unknown options,
// but we keep it as further safety.
return opts, fmt.Errorf("unsupported cpumanager option: %q (%s)", name, value)
}
}
if opts.FullPhysicalCPUsOnly && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", FullPCPUsOnlyOption, DistributeCPUsAcrossCoresOption)
}
// TODO(@Jeffwan): Remove this check after more compatibility tests are done.
if opts.DistributeCPUsAcrossNUMA && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", DistributeCPUsAcrossNUMAOption, DistributeCPUsAcrossCoresOption)
}
if opts.PreferAlignByUncoreCacheOption && opts.DistributeCPUsAcrossCores {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", PreferAlignByUnCoreCacheOption, DistributeCPUsAcrossCoresOption)
}
if opts.PreferAlignByUncoreCacheOption && opts.DistributeCPUsAcrossNUMA {
return opts, fmt.Errorf("static policy options %s and %s can not be used at the same time", PreferAlignByUnCoreCacheOption, DistributeCPUsAcrossNUMAOption)
}
return opts, nil
}
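// exampleParsePolicyOptions is an illustrative sketch (not part of upstream) showing how an
// option map coming from the kubelet configuration is parsed. The literal values below are
// assumptions for demonstration; parsing fails unless the relevant feature gates (checked by
// CheckPolicyOptionAvailable above) are enabled.
func exampleParsePolicyOptions() (StaticPolicyOptions, error) {
	return NewStaticPolicyOptions(map[string]string{
		FullPCPUsOnlyOption:            "true",  // beta-gated option
		DistributeCPUsAcrossNUMAOption: "false", // alpha-gated option
	})
}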
// ValidateStaticPolicyOptions ensures that the requested policy options are compatible with the machine on which the CPUManager is running.
func ValidateStaticPolicyOptions(opts StaticPolicyOptions, topology *topology.CPUTopology, topologyManager topologymanager.Store) error {
if opts.AlignBySocket {
// Not compatible with topology manager single-numa-node policy option.
if topologyManager.GetPolicy().Name() == topologymanager.PolicySingleNumaNode {
return fmt.Errorf("Topolgy manager %s policy is incompatible with CPUManager %s policy option", topologymanager.PolicySingleNumaNode, AlignBySocketOption)
}
// Not compatible with topologies where the number of sockets is greater than the number of NUMA nodes.
if topology.NumSockets > topology.NumNUMANodes {
return fmt.Errorf("Align by socket is not compatible with hardware where the number of sockets is greater than the number of NUMA nodes")
}
}
return nil
}

View File

@ -0,0 +1,766 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cpumanager
import (
"fmt"
v1 "k8s.io/api/core/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/utils/cpuset"
)
const (
// PolicyStatic is the name of the static policy.
// Should options be given, these will be ignored and backward-compatible
// behaviour (up to and including 1.21) will be enforced
PolicyStatic policyName = "static"
// ErrorSMTAlignment represents the type of an SMTAlignmentError
ErrorSMTAlignment = "SMTAlignmentError"
)
// SMTAlignmentError represents an error due to SMT alignment
type SMTAlignmentError struct {
RequestedCPUs int
CpusPerCore int
AvailablePhysicalCPUs int
CausedByPhysicalCPUs bool
}
func (e SMTAlignmentError) Error() string {
if e.CausedByPhysicalCPUs {
return fmt.Sprintf("SMT Alignment Error: not enough free physical CPUs: available physical CPUs = %d, requested CPUs = %d, CPUs per core = %d", e.AvailablePhysicalCPUs, e.RequestedCPUs, e.CpusPerCore)
}
return fmt.Sprintf("SMT Alignment Error: requested %d cpus not multiple cpus per core = %d", e.RequestedCPUs, e.CpusPerCore)
}
// Type returns human-readable type of this error. Used in the admission control to populate Admission Failure reason.
func (e SMTAlignmentError) Type() string {
return ErrorSMTAlignment
}
// staticPolicy is a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
//
// This policy allocates CPUs exclusively for a container if all the following
// conditions are met:
//
// - The pod QoS class is Guaranteed.
// - The CPU request is a positive integer.
//
// The static policy maintains the following sets of logical CPUs:
//
// - SHARED: Burstable, BestEffort, and non-integral Guaranteed containers
// run here. Initially this contains all CPU IDs on the system. As
// exclusive allocations are created and destroyed, this CPU set shrinks
// and grows, accordingly. This is stored in the state as the default
// CPU set.
//
// - RESERVED: A subset of the shared pool which is not exclusively
// allocatable. The membership of this pool is static for the lifetime of
// the Kubelet. The size of the reserved pool is
// ceil(systemreserved.cpu + kubereserved.cpu).
// Reserved CPUs are taken topologically starting with lowest-indexed
// physical core, as reported by cAdvisor.
//
// - ASSIGNABLE: Equal to SHARED - RESERVED. Exclusive CPUs are allocated
// from this pool.
//
// - EXCLUSIVE ALLOCATIONS: CPU sets assigned exclusively to one container.
// These are stored as explicit assignments in the state.
//
// When an exclusive allocation is made, the static policy also updates the
// default cpuset in the state abstraction. The CPU manager's periodic
// reconcile loop takes care of rewriting the cpuset in cgroupfs for any
// containers that may be running in the shared pool. For this reason,
// applications running within exclusively-allocated containers must tolerate
// potentially sharing their allocated CPUs for up to the CPU manager
// reconcile period.
type staticPolicy struct {
// cpu socket topology
topology *topology.CPUTopology
// set of CPUs that is not available for exclusive assignment
reservedCPUs cpuset.CPUSet
// Superset of reservedCPUs. It includes not just the reservedCPUs themselves,
// but also any siblings of those reservedCPUs on the same physical die.
// NOTE: If the reserved set includes full physical CPUs from the beginning
// (e.g. only reserved pairs of core siblings) this set is expected to be
// identical to the reserved set.
reservedPhysicalCPUs cpuset.CPUSet
// topology manager reference to get container Topology affinity
affinity topologymanager.Store
// set of CPUs to reuse across allocations in a pod
cpusToReuse map[string]cpuset.CPUSet
// options allow to fine-tune the behaviour of the policy
options StaticPolicyOptions
// we compute this value multiple times, and it's not supposed to change
// at runtime - the cpumanager can't deal with runtime topology changes anyway.
cpuGroupSize int
}
// Ensure staticPolicy implements Policy interface
var _ Policy = &staticPolicy{}
// NewStaticPolicy returns a CPU manager policy that does not change CPU
// assignments for exclusively pinned guaranteed containers after the main
// container process starts.
func NewStaticPolicy(topology *topology.CPUTopology, numReservedCPUs int, reservedCPUs cpuset.CPUSet, affinity topologymanager.Store, cpuPolicyOptions map[string]string) (Policy, error) {
opts, err := NewStaticPolicyOptions(cpuPolicyOptions)
if err != nil {
return nil, err
}
err = ValidateStaticPolicyOptions(opts, topology, affinity)
if err != nil {
return nil, err
}
cpuGroupSize := topology.CPUsPerCore()
klog.InfoS("Static policy created with configuration", "options", opts, "cpuGroupSize", cpuGroupSize)
policy := &staticPolicy{
topology: topology,
affinity: affinity,
cpusToReuse: make(map[string]cpuset.CPUSet),
options: opts,
cpuGroupSize: cpuGroupSize,
}
allCPUs := topology.CPUDetails.CPUs()
var reserved cpuset.CPUSet
if reservedCPUs.Size() > 0 {
reserved = reservedCPUs
} else {
// takeByTopology allocates CPUs associated with low-numbered cores from
// allCPUs.
//
// For example: Given a system with 8 CPUs available and HT enabled,
// if numReservedCPUs=2, then reserved={0,4}
reserved, _ = policy.takeByTopology(allCPUs, numReservedCPUs)
}
if reserved.Size() != numReservedCPUs {
err := fmt.Errorf("[cpumanager] unable to reserve the required amount of CPUs (size of %s did not equal %d)", reserved, numReservedCPUs)
return nil, err
}
var reservedPhysicalCPUs cpuset.CPUSet
for _, cpu := range reserved.UnsortedList() {
core, err := topology.CPUCoreID(cpu)
if err != nil {
return nil, fmt.Errorf("[cpumanager] unable to build the reserved physical CPUs from the reserved set: %w", err)
}
reservedPhysicalCPUs = reservedPhysicalCPUs.Union(topology.CPUDetails.CPUsInCores(core))
}
klog.InfoS("Reserved CPUs not available for exclusive assignment", "reservedSize", reserved.Size(), "reserved", reserved, "reservedPhysicalCPUs", reservedPhysicalCPUs)
policy.reservedCPUs = reserved
policy.reservedPhysicalCPUs = reservedPhysicalCPUs
return policy, nil
}
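// Illustrative note (not part of upstream): continuing the example above, on an 8-CPU system
// with hyper-threading (2 threads per core, siblings numbered 0/4, 1/5, 2/6, 3/7),
// reserved={0,4} covers a full physical core, so reservedPhysicalCPUs is also {0,4}.
// Had the reservation been {0,1} instead, reservedPhysicalCPUs would grow to {0,1,4,5},
// adding the core siblings of both reserved CPUs.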
func (p *staticPolicy) Name() string {
return string(PolicyStatic)
}
func (p *staticPolicy) Start(s state.State) error {
if err := p.validateState(s); err != nil {
klog.ErrorS(err, "Static policy invalid state, please drain node and remove policy state file")
return err
}
p.initializeMetrics(s)
return nil
}
func (p *staticPolicy) validateState(s state.State) error {
tmpAssignments := s.GetCPUAssignments()
tmpDefaultCPUset := s.GetDefaultCPUSet()
allCPUs := p.topology.CPUDetails.CPUs()
if p.options.StrictCPUReservation {
allCPUs = allCPUs.Difference(p.reservedCPUs)
}
// Default cpuset cannot be empty when assignments exist
if tmpDefaultCPUset.IsEmpty() {
if len(tmpAssignments) != 0 {
return fmt.Errorf("default cpuset cannot be empty")
}
// state is empty initialize
s.SetDefaultCPUSet(allCPUs)
klog.InfoS("Static policy initialized", "defaultCPUSet", allCPUs)
return nil
}
// State has already been initialized from file (is not empty)
// 1. Check if the reserved cpuset is not part of default cpuset because:
// - kube/system reserved have changed (increased) - may lead to some containers not being able to start
// - user tampered with file
if p.options.StrictCPUReservation {
if !p.reservedCPUs.Intersection(tmpDefaultCPUset).IsEmpty() {
return fmt.Errorf("some of strictly reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
p.reservedCPUs.Intersection(tmpDefaultCPUset).String(), tmpDefaultCPUset.String())
}
} else {
if !p.reservedCPUs.Intersection(tmpDefaultCPUset).Equals(p.reservedCPUs) {
return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"",
p.reservedCPUs.String(), tmpDefaultCPUset.String())
}
}
// 2. Check if state for static policy is consistent
for pod := range tmpAssignments {
for container, cset := range tmpAssignments[pod] {
// None of the cpu in DEFAULT cset should be in s.assignments
if !tmpDefaultCPUset.Intersection(cset).IsEmpty() {
return fmt.Errorf("pod: %s, container: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"",
pod, container, cset.String(), tmpDefaultCPUset.String())
}
}
}
// 3. It's possible that the set of available CPUs has changed since
// the state was written. This can be due to for example
// offlining a CPU when kubelet is not running. If this happens,
// CPU manager will run into trouble when later it tries to
// assign non-existent CPUs to containers. Validate that the
// topology that was received during CPU manager startup matches with
// the set of CPUs stored in the state.
totalKnownCPUs := tmpDefaultCPUset.Clone()
tmpCPUSets := []cpuset.CPUSet{}
for pod := range tmpAssignments {
for _, cset := range tmpAssignments[pod] {
tmpCPUSets = append(tmpCPUSets, cset)
}
}
totalKnownCPUs = totalKnownCPUs.Union(tmpCPUSets...)
if !totalKnownCPUs.Equals(allCPUs) {
return fmt.Errorf("current set of available CPUs \"%s\" doesn't match with CPUs in state \"%s\"",
allCPUs.String(), totalKnownCPUs.String())
}
return nil
}
// GetAllocatableCPUs returns the total set of CPUs available for allocation.
func (p *staticPolicy) GetAllocatableCPUs(s state.State) cpuset.CPUSet {
return p.topology.CPUDetails.CPUs().Difference(p.reservedCPUs)
}
// GetAvailableCPUs returns the set of unassigned CPUs minus the reserved set.
func (p *staticPolicy) GetAvailableCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reservedCPUs)
}
func (p *staticPolicy) GetAvailablePhysicalCPUs(s state.State) cpuset.CPUSet {
return s.GetDefaultCPUSet().Difference(p.reservedPhysicalCPUs)
}
func (p *staticPolicy) updateCPUsToReuse(pod *v1.Pod, container *v1.Container, cset cpuset.CPUSet) {
// If entries exist in p.cpusToReuse for pods other than the current pod, delete them.
for podUID := range p.cpusToReuse {
if podUID != string(pod.UID) {
delete(p.cpusToReuse, podUID)
}
}
// If no cpuset exists for cpusToReuse by this pod yet, create one.
if _, ok := p.cpusToReuse[string(pod.UID)]; !ok {
p.cpusToReuse[string(pod.UID)] = cpuset.New()
}
// Check if the container is an init container.
// If so, add its cpuset to the cpuset of reusable CPUs for any new allocations.
for _, initContainer := range pod.Spec.InitContainers {
if container.Name == initContainer.Name {
if podutil.IsRestartableInitContainer(&initContainer) {
// If the container is a restartable init container, we should not
// reuse its cpuset, as a restartable init container can run with
// regular containers.
break
}
p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Union(cset)
return
}
}
// Otherwise it is an app container.
// Remove its cpuset from the cpuset of reusable CPUs for any new allocations.
p.cpusToReuse[string(pod.UID)] = p.cpusToReuse[string(pod.UID)].Difference(cset)
}
func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
numCPUs := p.guaranteedCPUs(pod, container)
if numCPUs == 0 {
// container belongs in the shared pool (nothing to do; use default cpuset)
return nil
}
klog.InfoS("Static policy: Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
// container belongs in an exclusively allocated pool
metrics.CPUManagerPinningRequestsTotal.Inc()
defer func() {
if rerr != nil {
metrics.CPUManagerPinningErrorsTotal.Inc()
return
}
if !p.options.FullPhysicalCPUsOnly {
// increment only if we know we allocate aligned resources
return
}
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
}()
if p.options.FullPhysicalCPUsOnly {
if (numCPUs % p.cpuGroupSize) != 0 {
// Since the CPU Manager has been enabled requesting strict SMT alignment, a guaranteed pod can only be admitted
// if the CPU request is a multiple of the number of virtual cpus per physical core.
// If the CPU request is not such a multiple, the pod will be put in the Failed state, with SMTAlignmentError
// as the reason. Since the allocation happens in terms of physical cores and the scheduler is responsible
// for ensuring that the workload goes to a node that has enough CPUs, the pod would be placed on a node
// where there are enough physical cores available to be allocated.
// Just like the behaviour of the static policy, takeByTopology will first try to allocate CPUs from the same socket,
// and only when the request cannot be satisfied on a single socket will CPU allocation span sockets, so that a
// workload occupies whole physical cores. Allocation of individual threads would never have to occur.
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: p.cpuGroupSize,
CausedByPhysicalCPUs: false,
}
}
availablePhysicalCPUs := p.GetAvailablePhysicalCPUs(s).Size()
// It's legal to reserve CPUs which are not core siblings. In this case the CPU allocator can descend to single cores
// when picking CPUs. This would void the guarantee of FullPhysicalCPUsOnly. To prevent this, we additionally consider
// all the core siblings of the reserved CPUs as unavailable when computing the free CPUs, before starting the actual
// allocation. This way, by construction, all possible CPU allocations whose size is a multiple of the SMT level are correct again.
if numCPUs > availablePhysicalCPUs {
return SMTAlignmentError{
RequestedCPUs: numCPUs,
CpusPerCore: p.cpuGroupSize,
AvailablePhysicalCPUs: availablePhysicalCPUs,
CausedByPhysicalCPUs: true,
}
}
}
if cpuset, ok := s.GetCPUSet(string(pod.UID), container.Name); ok {
p.updateCPUsToReuse(pod, container, cpuset)
klog.InfoS("Static policy: container already present in state, skipping", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
// Call Topology Manager to get the aligned socket affinity across all hint providers.
hint := p.affinity.GetAffinity(string(pod.UID), container.Name)
klog.InfoS("Topology Affinity", "pod", klog.KObj(pod), "containerName", container.Name, "affinity", hint)
// Allocate CPUs according to the NUMA affinity contained in the hint.
cpuset, err := p.allocateCPUs(s, numCPUs, hint.NUMANodeAffinity, p.cpusToReuse[string(pod.UID)])
if err != nil {
klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
return err
}
s.SetCPUSet(string(pod.UID), container.Name, cpuset)
p.updateCPUsToReuse(pod, container, cpuset)
p.updateMetricsOnAllocate(cpuset)
return nil
}
// getAssignedCPUsOfSiblings returns the assigned CPUs of the given container's siblings (all containers other than the given container) in the given pod `podUID`.
func getAssignedCPUsOfSiblings(s state.State, podUID string, containerName string) cpuset.CPUSet {
assignments := s.GetCPUAssignments()
cset := cpuset.New()
for name, cpus := range assignments[podUID] {
if containerName == name {
continue
}
cset = cset.Union(cpus)
}
return cset
}
func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerName string) error {
klog.InfoS("Static policy: RemoveContainer", "podUID", podUID, "containerName", containerName)
cpusInUse := getAssignedCPUsOfSiblings(s, podUID, containerName)
if toRelease, ok := s.GetCPUSet(podUID, containerName); ok {
s.Delete(podUID, containerName)
// Mutate the shared pool, adding released cpus.
toRelease = toRelease.Difference(cpusInUse)
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
p.updateMetricsOnRelease(toRelease)
}
return nil
}
func (p *staticPolicy) allocateCPUs(s state.State, numCPUs int, numaAffinity bitmask.BitMask, reusableCPUs cpuset.CPUSet) (cpuset.CPUSet, error) {
klog.InfoS("AllocateCPUs", "numCPUs", numCPUs, "socket", numaAffinity)
allocatableCPUs := p.GetAvailableCPUs(s).Union(reusableCPUs)
// If there are aligned CPUs in numaAffinity, attempt to take those first.
result := cpuset.New()
if numaAffinity != nil {
alignedCPUs := p.getAlignedCPUs(numaAffinity, allocatableCPUs)
numAlignedToAlloc := alignedCPUs.Size()
if numCPUs < numAlignedToAlloc {
numAlignedToAlloc = numCPUs
}
alignedCPUs, err := p.takeByTopology(alignedCPUs, numAlignedToAlloc)
if err != nil {
return cpuset.New(), err
}
result = result.Union(alignedCPUs)
}
// Get any remaining CPUs from what's leftover after attempting to grab aligned ones.
remainingCPUs, err := p.takeByTopology(allocatableCPUs.Difference(result), numCPUs-result.Size())
if err != nil {
return cpuset.New(), err
}
result = result.Union(remainingCPUs)
// Remove allocated CPUs from the shared CPUSet.
s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result))
klog.InfoS("AllocateCPUs", "result", result)
return result, nil
}
func (p *staticPolicy) guaranteedCPUs(pod *v1.Pod, container *v1.Container) int {
if v1qos.GetPodQOS(pod) != v1.PodQOSGuaranteed {
return 0
}
cpuQuantity := container.Resources.Requests[v1.ResourceCPU]
// In-place pod resize feature makes Container.Resources field mutable for CPU & memory.
// AllocatedResources holds the value of Container.Resources.Requests when the pod was admitted.
// We should return this value because this is what kubelet agreed to allocate for the container
// and the value configured with runtime.
if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
if cs, ok := podutil.GetContainerStatus(pod.Status.ContainerStatuses, container.Name); ok {
cpuQuantity = cs.AllocatedResources[v1.ResourceCPU]
}
}
if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() {
return 0
}
// Safe downcast to do for all systems with < 2.1 billion CPUs.
// Per the language spec, `int` is guaranteed to be at least 32 bits wide.
// https://golang.org/ref/spec#Numeric_types
return int(cpuQuantity.Value())
}
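// Illustrative sketch (not part of upstream) of the integral-CPU check above, with assumed
// request values:
//
//	requests: cpu: "2"     -> Value()=2, MilliValue()=2000, 2*1000 == 2000 -> 2 exclusive CPUs
//	requests: cpu: "1500m" -> Value()=2, MilliValue()=1500, 2*1000 != 1500 -> 0 (shared pool)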
func (p *staticPolicy) podGuaranteedCPUs(pod *v1.Pod) int {
// The maximum of requested CPUs by init containers.
requestedByInitContainers := 0
requestedByRestartableInitContainers := 0
for _, container := range pod.Spec.InitContainers {
if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
continue
}
requestedCPU := p.guaranteedCPUs(pod, &container)
// See https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/753-sidecar-containers#resources-calculation-for-scheduling-and-pod-admission
// for the detail.
if podutil.IsRestartableInitContainer(&container) {
requestedByRestartableInitContainers += requestedCPU
} else if requestedByRestartableInitContainers+requestedCPU > requestedByInitContainers {
requestedByInitContainers = requestedByRestartableInitContainers + requestedCPU
}
}
// The sum of requested CPUs by app containers.
requestedByAppContainers := 0
for _, container := range pod.Spec.Containers {
if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
continue
}
requestedByAppContainers += p.guaranteedCPUs(pod, &container)
}
requestedByLongRunningContainers := requestedByAppContainers + requestedByRestartableInitContainers
if requestedByInitContainers > requestedByLongRunningContainers {
return requestedByInitContainers
}
return requestedByLongRunningContainers
}
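// Illustrative worked example (not part of upstream) for the computation above, with assumed
// requests: a regular init container (declared before the sidecar) asking for 6 CPUs, a
// restartable (sidecar) init container asking for 1 CPU, and two app containers asking for
// 2 CPUs each:
//
//	requestedByInitContainers            = 0 + 6 = 6
//	requestedByRestartableInitContainers = 1
//	requestedByAppContainers             = 2 + 2 = 4
//	requestedByLongRunningContainers     = 4 + 1 = 5
//	result                               = max(6, 5) = 6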
func (p *staticPolicy) takeByTopology(availableCPUs cpuset.CPUSet, numCPUs int) (cpuset.CPUSet, error) {
cpuSortingStrategy := CPUSortingStrategyPacked
if p.options.DistributeCPUsAcrossCores {
cpuSortingStrategy = CPUSortingStrategySpread
}
if p.options.DistributeCPUsAcrossNUMA {
cpuGroupSize := 1
if p.options.FullPhysicalCPUsOnly {
cpuGroupSize = p.cpuGroupSize
}
return takeByTopologyNUMADistributed(p.topology, availableCPUs, numCPUs, cpuGroupSize, cpuSortingStrategy)
}
return takeByTopologyNUMAPacked(p.topology, availableCPUs, numCPUs, cpuSortingStrategy, p.options.PreferAlignByUncoreCacheOption)
}
func (p *staticPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Get a count of how many guaranteed CPUs have been requested.
requested := p.guaranteedCPUs(pod, container)
// Number of required CPUs is not an integer or a container is not part of the Guaranteed QoS class.
// It will be treated by the TopologyManager as having no preference and cause it to ignore this
// resource when considering pod alignment.
// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
if requested == 0 {
return nil
}
// Short circuit to regenerate the same hints if there are already
// guaranteed CPUs allocated to the Container. This might happen after a
// kubelet restart, for example.
if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
if allocated.Size() != requested {
klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "requestedSize", requested, "allocatedSize", allocated.Size())
// An empty list of hints will be treated as a preference that cannot be satisfied.
// In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): {},
}
}
klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod), "containerName", container.Name)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): p.generateCPUTopologyHints(allocated, cpuset.CPUSet{}, requested),
}
}
// Get a list of available CPUs.
available := p.GetAvailableCPUs(s)
// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
// It should be an empty CPUSet for a newly created pod.
reusable := p.cpusToReuse[string(pod.UID)]
// Generate hints.
cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "containerName", container.Name, "cpuHints", cpuHints)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): cpuHints,
}
}
func (p *staticPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Get a count of how many guaranteed CPUs have been requested by Pod.
requested := p.podGuaranteedCPUs(pod)
// Number of required CPUs is not an integer or a pod is not part of the Guaranteed QoS class.
// It will be treated by the TopologyManager as having no preference and cause it to ignore this
// resource when considering pod alignment.
// In terms of hints, this is equal to: TopologyHints[NUMANodeAffinity: nil, Preferred: true].
if requested == 0 {
return nil
}
assignedCPUs := cpuset.New()
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
requestedByContainer := p.guaranteedCPUs(pod, &container)
// Short circuit to regenerate the same hints if there are already
// guaranteed CPUs allocated to the Container. This might happen after a
// kubelet restart, for example.
if allocated, exists := s.GetCPUSet(string(pod.UID), container.Name); exists {
if allocated.Size() != requestedByContainer {
klog.InfoS("CPUs already allocated to container with different number than request", "pod", klog.KObj(pod), "containerName", container.Name, "allocatedSize", requested, "requestedByContainer", requestedByContainer, "allocatedSize", allocated.Size())
// An empty list of hints will be treated as a preference that cannot be satisfied.
// In definition of hints this is equal to: TopologyHint[NUMANodeAffinity: nil, Preferred: false].
// For all but the best-effort policy, the Topology Manager will throw a pod-admission error.
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): {},
}
}
// A set of CPUs already assigned to containers in this pod
assignedCPUs = assignedCPUs.Union(allocated)
}
}
if assignedCPUs.Size() == requested {
klog.InfoS("Regenerating TopologyHints for CPUs already allocated", "pod", klog.KObj(pod))
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): p.generateCPUTopologyHints(assignedCPUs, cpuset.CPUSet{}, requested),
}
}
// Get a list of available CPUs.
available := p.GetAvailableCPUs(s)
// Get a list of reusable CPUs (e.g. CPUs reused from initContainers).
// It should be an empty CPUSet for a newly created pod.
reusable := p.cpusToReuse[string(pod.UID)]
// Ensure any CPUs already assigned to containers in this pod are included as part of the hint generation.
reusable = reusable.Union(assignedCPUs)
// Generate hints.
cpuHints := p.generateCPUTopologyHints(available, reusable, requested)
klog.InfoS("TopologyHints generated", "pod", klog.KObj(pod), "cpuHints", cpuHints)
return map[string][]topologymanager.TopologyHint{
string(v1.ResourceCPU): cpuHints,
}
}
// generateCPUTopologyHints generates a set of TopologyHints given the set of
// available CPUs and the number of CPUs being requested.
//
// It follows the convention of marking all hints that have the same number of
// bits set as the narrowest matching NUMANodeAffinity with 'Preferred: true', and
// marking all others with 'Preferred: false'.
func (p *staticPolicy) generateCPUTopologyHints(availableCPUs cpuset.CPUSet, reusableCPUs cpuset.CPUSet, request int) []topologymanager.TopologyHint {
// Initialize minAffinitySize to include all NUMA Nodes.
minAffinitySize := p.topology.CPUDetails.NUMANodes().Size()
// Iterate through all combinations of numa nodes bitmask and build hints from them.
hints := []topologymanager.TopologyHint{}
bitmask.IterateBitMasks(p.topology.CPUDetails.NUMANodes().List(), func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
cpusInMask := p.topology.CPUDetails.CPUsInNUMANodes(mask.GetBits()...).Size()
if cpusInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then check to see if all of the reusable CPUs are part of the current
// numa node bitmask.
numMatching := 0
for _, c := range reusableCPUs.List() {
// Disregard this mask if the reusable CPU's NUMA node isn't part of it.
if !mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
return
}
numMatching++
}
// Finally, check to see if enough available CPUs remain on the current
// NUMA node combination to satisfy the CPU request.
for _, c := range availableCPUs.List() {
if mask.IsSet(p.topology.CPUDetails[c].NUMANodeID) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the numa node bitmask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, topologymanager.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits set in the affinity mask and comparing it
// to the minAffinitySize. Only those with an equal number of bits set (and
// with a minimal set of numa nodes) will be considered preferred.
for i := range hints {
if p.options.AlignBySocket && p.isHintSocketAligned(hints[i], minAffinitySize) {
hints[i].Preferred = true
continue
}
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
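// Illustrative sketch, not from the vendored source: the preference rule above
// in isolation. Given the bit count of each candidate mask and the narrowest
// mask size that can satisfy the request, only the narrowest candidates are
// marked preferred (the hypothetical helper below stands in for the loop over
// hints at the end of generateCPUTopologyHints).
func markNarrowestPreferred(maskBitCounts []int, minAffinitySize int) []bool {
	preferred := make([]bool, len(maskBitCounts))
	for i, count := range maskBitCounts {
		// e.g. candidates spanning 1, 1 and 2 NUMA nodes with
		// minAffinitySize == 1 yield [true, true, false].
		preferred[i] = count == minAffinitySize
	}
	return preferred
}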
// isHintSocketAligned returns true if the NUMA nodes in the hint are socket aligned.
func (p *staticPolicy) isHintSocketAligned(hint topologymanager.TopologyHint, minAffinitySize int) bool {
numaNodesBitMask := hint.NUMANodeAffinity.GetBits()
numaNodesPerSocket := p.topology.NumNUMANodes / p.topology.NumSockets
if numaNodesPerSocket == 0 {
return false
}
// minSockets refers to the minimum number of sockets required to satisfy the allocation.
// A hint is considered socket aligned if the number of sockets its NUMA nodes span equals minSockets.
minSockets := (minAffinitySize + numaNodesPerSocket - 1) / numaNodesPerSocket
return p.topology.CPUDetails.SocketsInNUMANodes(numaNodesBitMask...).Size() == minSockets
}
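// Worked example of the ceiling division above, with hypothetical values: on a
// machine with 8 NUMA nodes spread across 2 sockets, numaNodesPerSocket == 4.
// A hint with minAffinitySize == 6 then needs
// minSockets = (6 + 4 - 1) / 4 = 2, so it is socket aligned only if its NUMA
// nodes span exactly 2 sockets.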
// getAlignedCPUs returns the set of aligned CPUs based on the NUMA affinity mask and configured policy options.
func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableCPUs cpuset.CPUSet) cpuset.CPUSet {
alignedCPUs := cpuset.New()
numaBits := numaAffinity.GetBits()
// If the align-by-socket policy option is enabled, the NUMA-based hint is expanded to
// a socket-aligned hint. This ensures that socket-aligned available CPUs are allocated
// first, before we try to find CPUs across sockets to satisfy the allocation request.
if p.options.AlignBySocket {
socketBits := p.topology.CPUDetails.SocketsInNUMANodes(numaBits...).UnsortedList()
for _, socketID := range socketBits {
alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInSockets(socketID)))
}
return alignedCPUs
}
for _, numaNodeID := range numaBits {
alignedCPUs = alignedCPUs.Union(allocatableCPUs.Intersection(p.topology.CPUDetails.CPUsInNUMANodes(numaNodeID)))
}
return alignedCPUs
}
func (p *staticPolicy) initializeMetrics(s state.State) {
metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
}
func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
}
func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
ncpus := cset.Size()
metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
}
func countExclusiveCPUs(s state.State) int {
exclusiveCPUs := 0
for _, cpuAssign := range s.GetCPUAssignments() {
for _, cset := range cpuAssign {
exclusiveCPUs += cset.Size()
}
}
return exclusiveCPUs
}
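// Illustrative sketch, not from the vendored source: the bookkeeping performed
// by updateMetricsOnAllocate and updateMetricsOnRelease above. Allocating n
// exclusive CPUs raises the exclusive-allocation gauge by n and shrinks the
// shared pool by n*1000 millicores; releasing reverses both, so the two gauges
// always account for the same CPUs. The type and names below are hypothetical.
type cpuGaugesSketch struct {
	exclusiveCPUs        float64
	sharedPoolMilliCores float64
}

func (g *cpuGaugesSketch) allocate(n int) {
	g.exclusiveCPUs += float64(n)
	g.sharedPoolMilliCores -= float64(n * 1000)
}

func (g *cpuGaugesSketch) release(n int) {
	g.exclusiveCPUs -= float64(n)
	g.sharedPoolMilliCores += float64(n * 1000)
}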

View File

@ -0,0 +1,135 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"fmt"
"hash/fnv"
"strings"
"k8s.io/apimachinery/pkg/util/dump"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV1{}
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpointV2{}
var _ checkpointmanager.Checkpoint = &CPUManagerCheckpoint{}
// CPUManagerCheckpoint struct is used to store cpu/pod assignments in a checkpoint in v2 format
type CPUManagerCheckpoint struct {
PolicyName string `json:"policyName"`
DefaultCPUSet string `json:"defaultCpuSet"`
Entries map[string]map[string]string `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// CPUManagerCheckpointV1 struct is used to store cpu/pod assignments in a checkpoint in v1 format
type CPUManagerCheckpointV1 struct {
PolicyName string `json:"policyName"`
DefaultCPUSet string `json:"defaultCpuSet"`
Entries map[string]string `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// CPUManagerCheckpointV2 struct is used to store cpu/pod assignments in a checkpoint in v2 format
type CPUManagerCheckpointV2 = CPUManagerCheckpoint
// NewCPUManagerCheckpoint returns an instance of Checkpoint
func NewCPUManagerCheckpoint() *CPUManagerCheckpoint {
//nolint:staticcheck // unexported-type-in-api user-facing error message
return newCPUManagerCheckpointV2()
}
func newCPUManagerCheckpointV1() *CPUManagerCheckpointV1 {
return &CPUManagerCheckpointV1{
Entries: make(map[string]string),
}
}
func newCPUManagerCheckpointV2() *CPUManagerCheckpointV2 {
return &CPUManagerCheckpointV2{
Entries: make(map[string]map[string]string),
}
}
// MarshalCheckpoint returns marshalled checkpoint in v1 format
func (cp *CPUManagerCheckpointV1) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
cp.Checksum = 0
cp.Checksum = checksum.New(cp)
return json.Marshal(*cp)
}
// MarshalCheckpoint returns marshalled checkpoint in v2 format
func (cp *CPUManagerCheckpointV2) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
cp.Checksum = 0
cp.Checksum = checksum.New(cp)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint in v1 format
func (cp *CPUManagerCheckpointV1) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint in v2 format
func (cp *CPUManagerCheckpointV2) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that current checksum of checkpoint is valid in v1 format
func (cp *CPUManagerCheckpointV1) VerifyChecksum() error {
if cp.Checksum == 0 {
// accept empty checksum for compatibility with old file backend
return nil
}
ck := cp.Checksum
cp.Checksum = 0
object := dump.ForHash(cp)
object = strings.Replace(object, "CPUManagerCheckpointV1", "CPUManagerCheckpoint", 1)
cp.Checksum = ck
hash := fnv.New32a()
fmt.Fprintf(hash, "%v", object)
actualCS := checksum.Checksum(hash.Sum32())
if cp.Checksum != actualCS {
return &errors.CorruptCheckpointError{
ActualCS: uint64(actualCS),
ExpectedCS: uint64(cp.Checksum),
}
}
return nil
}
// VerifyChecksum verifies that current checksum of checkpoint is valid in v2 format
func (cp *CPUManagerCheckpointV2) VerifyChecksum() error {
if cp.Checksum == 0 {
// accept empty checksum for compatibility with old file backend
return nil
}
ck := cp.Checksum
cp.Checksum = 0
err := ck.Verify(cp)
cp.Checksum = ck
return err
}
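// Illustrative round-trip sketch, not from the vendored source:
// MarshalCheckpoint embeds a checksum computed over the payload,
// UnmarshalCheckpoint restores it, and VerifyChecksum returns nil only when
// the blob was not corrupted. The pod UID, container name and CPU lists below
// are hypothetical.
func checkpointRoundTripSketch() error {
	cp := NewCPUManagerCheckpoint()
	cp.PolicyName = "static"
	cp.DefaultCPUSet = "0-3"
	cp.Entries = map[string]map[string]string{"pod-uid": {"ctr": "4-5"}}

	blob, err := cp.MarshalCheckpoint() // sets cp.Checksum as a side effect
	if err != nil {
		return err
	}

	restored := NewCPUManagerCheckpoint()
	if err := restored.UnmarshalCheckpoint(blob); err != nil {
		return err
	}
	return restored.VerifyChecksum() // nil when the blob is intact
}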

View File

@ -0,0 +1,58 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/utils/cpuset"
)
// ContainerCPUAssignments type used in cpu manager state
type ContainerCPUAssignments map[string]map[string]cpuset.CPUSet
// Clone returns a copy of ContainerCPUAssignments
func (as ContainerCPUAssignments) Clone() ContainerCPUAssignments {
ret := make(ContainerCPUAssignments, len(as))
for pod := range as {
ret[pod] = make(map[string]cpuset.CPUSet, len(as[pod]))
for container, cset := range as[pod] {
ret[pod][container] = cset
}
}
return ret
}
// Reader interface used to read current cpu/pod assignment state
type Reader interface {
GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool)
GetDefaultCPUSet() cpuset.CPUSet
GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet
GetCPUAssignments() ContainerCPUAssignments
}
type writer interface {
SetCPUSet(podUID string, containerName string, cpuset cpuset.CPUSet)
SetDefaultCPUSet(cpuset cpuset.CPUSet)
SetCPUAssignments(ContainerCPUAssignments)
Delete(podUID string, containerName string)
ClearState()
}
// State interface provides methods for tracking and setting cpu/pod assignment
type State interface {
Reader
writer
}

View File

@ -0,0 +1,250 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"fmt"
"path/filepath"
"sync"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/utils/cpuset"
)
var _ State = &stateCheckpoint{}
type stateCheckpoint struct {
mux sync.RWMutex
policyName string
cache State
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
initialContainers containermap.ContainerMap
}
// NewCheckpointState creates new State for keeping track of cpu/pod assignment with checkpoint backend
func NewCheckpointState(stateDir, checkpointName, policyName string, initialContainers containermap.ContainerMap) (State, error) {
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
}
stateCheckpoint := &stateCheckpoint{
cache: NewMemoryState(),
policyName: policyName,
checkpointManager: checkpointManager,
checkpointName: checkpointName,
initialContainers: initialContainers,
}
if err := stateCheckpoint.restoreState(); err != nil {
//nolint:staticcheck // ST1005 user-facing error message
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the CPU manager checkpoint file %q before restarting Kubelet",
err, filepath.Join(stateDir, checkpointName))
}
return stateCheckpoint, nil
}
// migrateV1CheckpointToV2Checkpoint() converts checkpoints from the v1 format to the v2 format
func (sc *stateCheckpoint) migrateV1CheckpointToV2Checkpoint(src *CPUManagerCheckpointV1, dst *CPUManagerCheckpointV2) error {
if src.PolicyName != "" {
dst.PolicyName = src.PolicyName
}
if src.DefaultCPUSet != "" {
dst.DefaultCPUSet = src.DefaultCPUSet
}
for containerID, cset := range src.Entries {
podUID, containerName, err := sc.initialContainers.GetContainerRef(containerID)
if err != nil {
return fmt.Errorf("containerID '%v' not found in initial containers list", containerID)
}
if dst.Entries == nil {
dst.Entries = make(map[string]map[string]string)
}
if _, exists := dst.Entries[podUID]; !exists {
dst.Entries[podUID] = make(map[string]string)
}
dst.Entries[podUID][containerName] = cset
}
return nil
}
// restores state from a checkpoint and creates it if it doesn't exist
func (sc *stateCheckpoint) restoreState() error {
sc.mux.Lock()
defer sc.mux.Unlock()
var err error
checkpointV1 := newCPUManagerCheckpointV1()
checkpointV2 := newCPUManagerCheckpointV2()
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpointV1); err != nil {
checkpointV1 = &CPUManagerCheckpointV1{} // reset it back to 0
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpointV2); err != nil {
if err == errors.ErrCheckpointNotFound {
return sc.storeState()
}
return err
}
}
if err = sc.migrateV1CheckpointToV2Checkpoint(checkpointV1, checkpointV2); err != nil {
return fmt.Errorf("error migrating v1 checkpoint state to v2 checkpoint state: %s", err)
}
if sc.policyName != checkpointV2.PolicyName {
return fmt.Errorf("configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpointV2.PolicyName)
}
var tmpDefaultCPUSet cpuset.CPUSet
if tmpDefaultCPUSet, err = cpuset.Parse(checkpointV2.DefaultCPUSet); err != nil {
return fmt.Errorf("could not parse default cpu set %q: %v", checkpointV2.DefaultCPUSet, err)
}
var tmpContainerCPUSet cpuset.CPUSet
tmpAssignments := ContainerCPUAssignments{}
for pod := range checkpointV2.Entries {
tmpAssignments[pod] = make(map[string]cpuset.CPUSet, len(checkpointV2.Entries[pod]))
for container, cpuString := range checkpointV2.Entries[pod] {
if tmpContainerCPUSet, err = cpuset.Parse(cpuString); err != nil {
return fmt.Errorf("could not parse cpuset %q for container %q in pod %q: %v", cpuString, container, pod, err)
}
tmpAssignments[pod][container] = tmpContainerCPUSet
}
}
sc.cache.SetDefaultCPUSet(tmpDefaultCPUSet)
sc.cache.SetCPUAssignments(tmpAssignments)
klog.V(2).InfoS("State checkpoint: restored state from checkpoint")
klog.V(2).InfoS("State checkpoint: defaultCPUSet", "defaultCpuSet", tmpDefaultCPUSet.String())
return nil
}
// saves state to a checkpoint, caller is responsible for locking
func (sc *stateCheckpoint) storeState() error {
checkpoint := NewCPUManagerCheckpoint()
checkpoint.PolicyName = sc.policyName
checkpoint.DefaultCPUSet = sc.cache.GetDefaultCPUSet().String()
assignments := sc.cache.GetCPUAssignments()
for pod := range assignments {
checkpoint.Entries[pod] = make(map[string]string, len(assignments[pod]))
for container, cset := range assignments[pod] {
checkpoint.Entries[pod][container] = cset.String()
}
}
err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
if err != nil {
klog.ErrorS(err, "Failed to save checkpoint")
return err
}
return nil
}
// GetCPUSet returns current CPU set
func (sc *stateCheckpoint) GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool) {
sc.mux.RLock()
defer sc.mux.RUnlock()
res, ok := sc.cache.GetCPUSet(podUID, containerName)
return res, ok
}
// GetDefaultCPUSet returns default CPU set
func (sc *stateCheckpoint) GetDefaultCPUSet() cpuset.CPUSet {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetDefaultCPUSet()
}
// GetCPUSetOrDefault returns current CPU set, or default one if it wasn't changed
func (sc *stateCheckpoint) GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetCPUSetOrDefault(podUID, containerName)
}
// GetCPUAssignments returns current CPU to pod assignments
func (sc *stateCheckpoint) GetCPUAssignments() ContainerCPUAssignments {
sc.mux.RLock()
defer sc.mux.RUnlock()
return sc.cache.GetCPUAssignments()
}
// SetCPUSet sets CPU set
func (sc *stateCheckpoint) SetCPUSet(podUID string, containerName string, cset cpuset.CPUSet) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetCPUSet(podUID, containerName, cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetDefaultCPUSet sets default CPU set
func (sc *stateCheckpoint) SetDefaultCPUSet(cset cpuset.CPUSet) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetDefaultCPUSet(cset)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetCPUAssignments sets CPU to pod assignments
func (sc *stateCheckpoint) SetCPUAssignments(a ContainerCPUAssignments) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.SetCPUAssignments(a)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// Delete deletes assignment for specified pod
func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// ClearState clears the state and saves it in a checkpoint
func (sc *stateCheckpoint) ClearState() {
sc.mux.Lock()
defer sc.mux.Unlock()
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
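// Illustrative usage sketch, not from the vendored source: every mutation made
// through the returned State is written to the named checkpoint file, and the
// next kubelet start restores it via restoreState. The directory, checkpoint
// file and policy names below are hypothetical; containermap.NewContainerMap
// is assumed to provide an empty initial container map.
func checkpointStateSketch() (State, error) {
	st, err := NewCheckpointState("/var/lib/kubelet", "cpu_manager_state", "static", containermap.NewContainerMap())
	if err != nil {
		return nil, err
	}
	st.SetDefaultCPUSet(cpuset.New(0, 1, 2, 3))
	st.SetCPUSet("pod-uid", "ctr", cpuset.New(2, 3))
	return st, nil // GetCPUSetOrDefault("pod-uid", "ctr") now yields 2-3
}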

View File

@ -0,0 +1,117 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"k8s.io/klog/v2"
"k8s.io/utils/cpuset"
)
type stateMemory struct {
sync.RWMutex
assignments ContainerCPUAssignments
defaultCPUSet cpuset.CPUSet
}
var _ State = &stateMemory{}
// NewMemoryState creates new State for keeping track of cpu/pod assignment
func NewMemoryState() State {
klog.InfoS("Initialized new in-memory state store")
return &stateMemory{
assignments: ContainerCPUAssignments{},
defaultCPUSet: cpuset.New(),
}
}
func (s *stateMemory) GetCPUSet(podUID string, containerName string) (cpuset.CPUSet, bool) {
s.RLock()
defer s.RUnlock()
res, ok := s.assignments[podUID][containerName]
return res.Clone(), ok
}
func (s *stateMemory) GetDefaultCPUSet() cpuset.CPUSet {
s.RLock()
defer s.RUnlock()
return s.defaultCPUSet.Clone()
}
func (s *stateMemory) GetCPUSetOrDefault(podUID string, containerName string) cpuset.CPUSet {
if res, ok := s.GetCPUSet(podUID, containerName); ok {
return res
}
return s.GetDefaultCPUSet()
}
func (s *stateMemory) GetCPUAssignments() ContainerCPUAssignments {
s.RLock()
defer s.RUnlock()
return s.assignments.Clone()
}
func (s *stateMemory) SetCPUSet(podUID string, containerName string, cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
s.assignments[podUID] = make(map[string]cpuset.CPUSet)
}
s.assignments[podUID][containerName] = cset
klog.InfoS("Updated desired CPUSet", "podUID", podUID, "containerName", containerName, "cpuSet", cset)
}
func (s *stateMemory) SetDefaultCPUSet(cset cpuset.CPUSet) {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cset
klog.InfoS("Updated default CPUSet", "cpuSet", cset)
}
func (s *stateMemory) SetCPUAssignments(a ContainerCPUAssignments) {
s.Lock()
defer s.Unlock()
s.assignments = a.Clone()
klog.InfoS("Updated CPUSet assignments", "assignments", a)
}
func (s *stateMemory) Delete(podUID string, containerName string) {
s.Lock()
defer s.Unlock()
delete(s.assignments[podUID], containerName)
if len(s.assignments[podUID]) == 0 {
delete(s.assignments, podUID)
}
klog.V(2).InfoS("Deleted CPUSet assignment", "podUID", podUID, "containerName", containerName)
}
func (s *stateMemory) ClearState() {
s.Lock()
defer s.Unlock()
s.defaultCPUSet = cpuset.CPUSet{}
s.assignments = make(ContainerCPUAssignments)
klog.V(2).InfoS("Cleared state")
}
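// Illustrative usage sketch, not from the vendored source: containers fall back
// to the shared default set until they are given an exclusive assignment. The
// pod UID, container name and CPU IDs below are hypothetical.
func memoryStateSketch() {
	s := NewMemoryState()
	s.SetDefaultCPUSet(cpuset.New(0, 1, 2, 3))
	_ = s.GetCPUSetOrDefault("pod-uid", "ctr") // 0-3 before any assignment
	s.SetCPUSet("pod-uid", "ctr", cpuset.New(2, 3))
	_ = s.GetCPUSetOrDefault("pod-uid", "ctr") // 2-3 once assigned
}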

View File

@ -0,0 +1,18 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package topology contains helpers for the CPU manager.
package topology // import "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"

View File

@ -0,0 +1,389 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package topology
import (
"fmt"
cadvisorapi "github.com/google/cadvisor/info/v1"
"k8s.io/klog/v2"
"k8s.io/utils/cpuset"
)
// NUMANodeInfo is a map from NUMANode ID to a list of CPU IDs associated with
// that NUMANode.
type NUMANodeInfo map[int]cpuset.CPUSet
// CPUDetails is a map from CPU ID to Core ID, Socket ID, NUMA ID, and UncoreCache ID.
type CPUDetails map[int]CPUInfo
// CPUTopology contains details of node cpu, where :
// CPU - logical CPU, cadvisor - thread
// Core - physical CPU, cadvisor - Core
// Socket - socket, cadvisor - Socket
// NUMA Node - NUMA cell, cadvisor - Node
// UncoreCache - Split L3 Cache Topology, cadvisor
type CPUTopology struct {
NumCPUs int
NumCores int
NumUncoreCache int
NumSockets int
NumNUMANodes int
CPUDetails CPUDetails
}
// CPUsPerCore returns the number of logical CPUs associated with
// each core.
func (topo *CPUTopology) CPUsPerCore() int {
if topo.NumCores == 0 {
return 0
}
return topo.NumCPUs / topo.NumCores
}
// CPUsPerSocket returns the number of logical CPUs associated with
// each socket.
func (topo *CPUTopology) CPUsPerSocket() int {
if topo.NumSockets == 0 {
return 0
}
return topo.NumCPUs / topo.NumSockets
}
// CPUsPerUncore returns the number of logical CPUs associated with
// each UncoreCache.
func (topo *CPUTopology) CPUsPerUncore() int {
if topo.NumUncoreCache == 0 {
return 0
}
return topo.NumCPUs / topo.NumUncoreCache
}
// CPUCoreID returns the physical core ID which the given logical CPU
// belongs to.
func (topo *CPUTopology) CPUCoreID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.CoreID, nil
}
// CPUSocketID returns the socket ID which the given logical CPU belongs to.
func (topo *CPUTopology) CPUSocketID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.SocketID, nil
}
// CPUNUMANodeID returns the NUMA node ID which the given logical CPU belongs to.
func (topo *CPUTopology) CPUNUMANodeID(cpu int) (int, error) {
info, ok := topo.CPUDetails[cpu]
if !ok {
return -1, fmt.Errorf("unknown CPU ID: %d", cpu)
}
return info.NUMANodeID, nil
}
// CPUInfo contains the NUMA, socket, UncoreCache and core IDs associated with a CPU.
type CPUInfo struct {
NUMANodeID int
SocketID int
CoreID int
UncoreCacheID int
}
// KeepOnly returns a new CPUDetails object with only the supplied cpus.
func (d CPUDetails) KeepOnly(cpus cpuset.CPUSet) CPUDetails {
result := CPUDetails{}
for cpu, info := range d {
if cpus.Contains(cpu) {
result[cpu] = info
}
}
return result
}
// UncoreCaches returns all of the uncore cache IDs (L3 indexes) associated with the CPUs in this CPUDetails.
func (d CPUDetails) UncoreCaches() cpuset.CPUSet {
var numUnCoreIDs []int
for _, info := range d {
numUnCoreIDs = append(numUnCoreIDs, info.UncoreCacheID)
}
return cpuset.New(numUnCoreIDs...)
}
// UncoreInNUMANodes returns all of the uncore cache IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) UncoreInNUMANodes(ids ...int) cpuset.CPUSet {
var unCoreIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
unCoreIDs = append(unCoreIDs, info.UncoreCacheID)
}
}
}
return cpuset.New(unCoreIDs...)
}
// CoresNeededInUncoreCache returns either the full list of all available unique core IDs associated with the given
// UnCoreCache IDs in this CPUDetails, or a subset that satisfies the requested number of cores.
func (d CPUDetails) CoresNeededInUncoreCache(numCoresNeeded int, ids ...int) cpuset.CPUSet {
coreIDs := d.coresInUncoreCache(ids...)
if coreIDs.Size() <= numCoresNeeded {
return coreIDs
}
tmpCoreIDs := coreIDs.List()
return cpuset.New(tmpCoreIDs[:numCoresNeeded]...)
}
// coresInUncoreCache is a helper that returns the core IDs associated with the given UncoreCache IDs.
func (d CPUDetails) coresInUncoreCache(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.UncoreCacheID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CPUsInUncoreCaches returns all the logical CPU IDs associated with the given
// UnCoreCache IDs in this CPUDetails
func (d CPUDetails) CPUsInUncoreCaches(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.UncoreCacheID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// NUMANodes returns all of the NUMANode IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) NUMANodes() cpuset.CPUSet {
var numaNodeIDs []int
for _, info := range d {
numaNodeIDs = append(numaNodeIDs, info.NUMANodeID)
}
return cpuset.New(numaNodeIDs...)
}
// NUMANodesInSockets returns all of the logical NUMANode IDs associated with
// the given socket IDs in this CPUDetails.
func (d CPUDetails) NUMANodesInSockets(ids ...int) cpuset.CPUSet {
var numaNodeIDs []int
for _, id := range ids {
for _, info := range d {
if info.SocketID == id {
numaNodeIDs = append(numaNodeIDs, info.NUMANodeID)
}
}
}
return cpuset.New(numaNodeIDs...)
}
// Sockets returns all of the socket IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Sockets() cpuset.CPUSet {
var socketIDs []int
for _, info := range d {
socketIDs = append(socketIDs, info.SocketID)
}
return cpuset.New(socketIDs...)
}
// CPUsInSockets returns all of the logical CPU IDs associated with the given
// socket IDs in this CPUDetails.
func (d CPUDetails) CPUsInSockets(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.SocketID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// SocketsInNUMANodes returns all of the logical Socket IDs associated with the
// given NUMANode IDs in this CPUDetails.
func (d CPUDetails) SocketsInNUMANodes(ids ...int) cpuset.CPUSet {
var socketIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
socketIDs = append(socketIDs, info.SocketID)
}
}
}
return cpuset.New(socketIDs...)
}
// Cores returns all of the core IDs associated with the CPUs in this
// CPUDetails.
func (d CPUDetails) Cores() cpuset.CPUSet {
var coreIDs []int
for _, info := range d {
coreIDs = append(coreIDs, info.CoreID)
}
return cpuset.New(coreIDs...)
}
// CoresInNUMANodes returns all of the core IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) CoresInNUMANodes(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.NUMANodeID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CoresInSockets returns all of the core IDs associated with the given socket
// IDs in this CPUDetails.
func (d CPUDetails) CoresInSockets(ids ...int) cpuset.CPUSet {
var coreIDs []int
for _, id := range ids {
for _, info := range d {
if info.SocketID == id {
coreIDs = append(coreIDs, info.CoreID)
}
}
}
return cpuset.New(coreIDs...)
}
// CPUs returns all of the logical CPU IDs in this CPUDetails.
func (d CPUDetails) CPUs() cpuset.CPUSet {
var cpuIDs []int
for cpuID := range d {
cpuIDs = append(cpuIDs, cpuID)
}
return cpuset.New(cpuIDs...)
}
// CPUsInNUMANodes returns all of the logical CPU IDs associated with the given
// NUMANode IDs in this CPUDetails.
func (d CPUDetails) CPUsInNUMANodes(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.NUMANodeID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
// CPUsInCores returns all of the logical CPU IDs associated with the given
// core IDs in this CPUDetails.
func (d CPUDetails) CPUsInCores(ids ...int) cpuset.CPUSet {
var cpuIDs []int
for _, id := range ids {
for cpu, info := range d {
if info.CoreID == id {
cpuIDs = append(cpuIDs, cpu)
}
}
}
return cpuset.New(cpuIDs...)
}
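// Illustrative sketch, not from the vendored source: the accessors above all
// filter CPUDetails by one field and collect another. The three-CPU detail map
// below is hypothetical.
func cpuDetailsSketch() {
	d := CPUDetails{
		0: {NUMANodeID: 0, SocketID: 0, CoreID: 0},
		1: {NUMANodeID: 0, SocketID: 0, CoreID: 0},
		2: {NUMANodeID: 1, SocketID: 1, CoreID: 1},
	}
	_ = d.CPUsInNUMANodes(0)                     // CPUs 0-1
	_ = d.CoresInSockets(1)                      // core 1
	_ = d.KeepOnly(cpuset.New(0, 2)).NUMANodes() // NUMA nodes 0,1
}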
func getUncoreCacheID(core cadvisorapi.Core) int {
if len(core.UncoreCaches) < 1 {
// If cAdvisor reports no uncore caches, fall back to socket alignment since the uncore cache is not shared.
return core.SocketID
}
// Even though the cadvisor API returns a slice, we only expect either 0 or 1 uncore caches,
// so everything past the first entry should be discarded or ignored.
return core.UncoreCaches[0].Id
}
// Discover returns CPUTopology based on cadvisor node info
func Discover(machineInfo *cadvisorapi.MachineInfo) (*CPUTopology, error) {
if machineInfo.NumCores == 0 {
return nil, fmt.Errorf("could not detect number of cpus")
}
CPUDetails := CPUDetails{}
numPhysicalCores := 0
for _, node := range machineInfo.Topology {
numPhysicalCores += len(node.Cores)
for _, core := range node.Cores {
if coreID, err := getUniqueCoreID(core.Threads); err == nil {
for _, cpu := range core.Threads {
CPUDetails[cpu] = CPUInfo{
CoreID: coreID,
SocketID: core.SocketID,
NUMANodeID: node.Id,
UncoreCacheID: getUncoreCacheID(core),
}
}
} else {
klog.ErrorS(nil, "Could not get unique coreID for socket", "socket", core.SocketID, "core", core.Id, "threads", core.Threads)
return nil, err
}
}
}
return &CPUTopology{
NumCPUs: machineInfo.NumCores,
NumSockets: machineInfo.NumSockets,
NumCores: numPhysicalCores,
NumNUMANodes: CPUDetails.NUMANodes().Size(),
NumUncoreCache: CPUDetails.UncoreCaches().Size(),
CPUDetails: CPUDetails,
}, nil
}
// getUniqueCoreID computes the coreID as the lowest cpuID
// in the given Threads []int slice. This ensures that coreIDs are
// platform-unique (unlike what cAdvisor reports).
func getUniqueCoreID(threads []int) (coreID int, err error) {
if len(threads) == 0 {
return 0, fmt.Errorf("no cpus provided")
}
if len(threads) != cpuset.New(threads...).Size() {
return 0, fmt.Errorf("cpus provided are not unique")
}
min := threads[0]
for _, thread := range threads[1:] {
if thread < min {
min = thread
}
}
return min, nil
}
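// Illustrative sketch, not from the vendored source: Discover builds a
// CPUTopology from cadvisor machine info. The hypothetical machine below has a
// single socket with two hyperthreaded cores, so it yields NumCPUs=4,
// NumCores=2, NumSockets=1, NumNUMANodes=1; with no uncore cache data,
// getUncoreCacheID falls back to the socket ID.
func discoverSketch() (*CPUTopology, error) {
	machineInfo := &cadvisorapi.MachineInfo{
		NumCores:   4,
		NumSockets: 1,
		Topology: []cadvisorapi.Node{
			{
				Id: 0,
				Cores: []cadvisorapi.Core{
					{Id: 0, SocketID: 0, Threads: []int{0, 2}},
					{Id: 1, SocketID: 0, Threads: []int{1, 3}},
				},
			},
		},
	}
	return Discover(machineInfo) // topo.CPUsPerCore() == 2
}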

View File

@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers: []
reviewers:
- klueska
emeritus_approvers:
- vishh
- jiayingz

View File

@ -0,0 +1,109 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package checkpoint
import (
"encoding/json"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
// DeviceManagerCheckpoint defines the operations to retrieve pod devices
type DeviceManagerCheckpoint interface {
checkpointmanager.Checkpoint
GetData() ([]PodDevicesEntry, map[string][]string)
}
// DevicesPerNUMA represents device ids obtained from device plugin per NUMA node id
type DevicesPerNUMA map[int64][]string
// PodDevicesEntry connects pod information to devices
type PodDevicesEntry struct {
PodUID string
ContainerName string
ResourceName string
DeviceIDs DevicesPerNUMA
AllocResp []byte
}
// checkpointData struct is used to store pod to device allocation information
// in a checkpoint file.
// TODO: add version control when we need to change checkpoint format.
type checkpointData struct {
PodDeviceEntries []PodDevicesEntry
RegisteredDevices map[string][]string
}
// Data holds checkpoint data and its checksum
type Data struct {
Data checkpointData
Checksum checksum.Checksum
}
// NewDevicesPerNUMA is a function that creates DevicesPerNUMA map
func NewDevicesPerNUMA() DevicesPerNUMA {
return make(DevicesPerNUMA)
}
// Devices returns all device IDs for all NUMA nodes,
// represented as a sets.Set[string].
func (dev DevicesPerNUMA) Devices() sets.Set[string] {
result := sets.New[string]()
for _, devs := range dev {
result.Insert(devs...)
}
return result
}
// New returns an instance of Checkpoint - must be an alias for the most recent version
func New(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return newV2(devEntries, devices)
}
func newV2(devEntries []PodDevicesEntry, devices map[string][]string) DeviceManagerCheckpoint {
return &Data{
Data: checkpointData{
PodDeviceEntries: devEntries,
RegisteredDevices: devices,
},
}
}
// MarshalCheckpoint returns marshalled data
func (cp *Data) MarshalCheckpoint() ([]byte, error) {
cp.Checksum = checksum.New(cp.Data)
return json.Marshal(*cp)
}
// UnmarshalCheckpoint returns unmarshalled data
func (cp *Data) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, cp)
}
// VerifyChecksum verifies that passed checksum is same as calculated checksum
func (cp *Data) VerifyChecksum() error {
return cp.Checksum.Verify(cp.Data)
}
// GetData returns device entries and registered devices in the *most recent*
// checkpoint format, *not* in the original format stored on disk.
func (cp *Data) GetData() ([]PodDevicesEntry, map[string][]string) {
return cp.Data.PodDeviceEntries, cp.Data.RegisteredDevices
}
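// Illustrative sketch, not from the vendored source: New wraps pod-to-device
// assignments plus the registered device list, and MarshalCheckpoint serializes
// them with a checksum over the data section. The resource and device names
// below are hypothetical.
func deviceCheckpointSketch() ([]byte, error) {
	perNUMA := NewDevicesPerNUMA()
	perNUMA[0] = []string{"dev-0", "dev-1"} // devices attached to NUMA node 0

	entries := []PodDevicesEntry{{
		PodUID:        "pod-uid",
		ContainerName: "ctr",
		ResourceName:  "example.com/gpu",
		DeviceIDs:     perNUMA,
	}}
	registered := map[string][]string{"example.com/gpu": {"dev-0", "dev-1"}}

	return New(entries, registered).MarshalCheckpoint()
}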

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"context"
"fmt"
"sync"
"time"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
)
// endpoint maps to a single registered device plugin. It is responsible
// for managing gRPC communications with the device plugin and caching
// device states reported by the device plugin.
type endpoint interface {
getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error)
allocate(devs []string) (*pluginapi.AllocateResponse, error)
preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error)
setStopTime(t time.Time)
isStopped() bool
stopGracePeriodExpired() bool
}
type endpointImpl struct {
mutex sync.Mutex
resourceName string
api pluginapi.DevicePluginClient
stopTime time.Time
client plugin.Client // for testing only
}
// newEndpointImpl creates a new endpoint for the given resourceName.
// This is to be used during normal device plugin registration.
func newEndpointImpl(p plugin.DevicePlugin) *endpointImpl {
return &endpointImpl{
api: p.API(),
resourceName: p.Resource(),
}
}
// newStoppedEndpointImpl creates a new endpoint for the given resourceName with stopTime set.
// This is to be used during Kubelet restart, before the actual device plugin re-registers.
func newStoppedEndpointImpl(resourceName string) *endpointImpl {
return &endpointImpl{
resourceName: resourceName,
stopTime: time.Now(),
}
}
func (e *endpointImpl) isStopped() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero()
}
func (e *endpointImpl) stopGracePeriodExpired() bool {
e.mutex.Lock()
defer e.mutex.Unlock()
return !e.stopTime.IsZero() && time.Since(e.stopTime) > endpointStopGracePeriod
}
func (e *endpointImpl) setStopTime(t time.Time) {
e.mutex.Lock()
defer e.mutex.Unlock()
e.stopTime = t
}
// getPreferredAllocation issues GetPreferredAllocation gRPC call to the device plugin.
func (e *endpointImpl) getPreferredAllocation(available, mustInclude []string, size int) (*pluginapi.PreferredAllocationResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.GetPreferredAllocation(context.Background(), &pluginapi.PreferredAllocationRequest{
ContainerRequests: []*pluginapi.ContainerPreferredAllocationRequest{
{
AvailableDeviceIDs: available,
MustIncludeDeviceIDs: mustInclude,
AllocationSize: int32(size),
},
},
})
}
// allocate issues Allocate gRPC call to the device plugin.
func (e *endpointImpl) allocate(devs []string) (*pluginapi.AllocateResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
return e.api.Allocate(context.Background(), &pluginapi.AllocateRequest{
ContainerRequests: []*pluginapi.ContainerAllocateRequest{
{DevicesIDs: devs},
},
})
}
// preStartContainer issues PreStartContainer gRPC call to the device plugin.
func (e *endpointImpl) preStartContainer(devs []string) (*pluginapi.PreStartContainerResponse, error) {
if e.isStopped() {
return nil, fmt.Errorf(errEndpointStopped, e)
}
ctx, cancel := context.WithTimeout(context.Background(), pluginapi.KubeletPreStartContainerRPCTimeoutInSecs*time.Second)
defer cancel()
return e.api.PreStartContainer(ctx, &pluginapi.PreStartContainerRequest{
DevicesIDs: devs,
})
}

File diff suppressed because it is too large

View File

@ -0,0 +1,49 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// RegistrationHandler is an interface for handling device plugin registration
// and plugin directory cleanup.
type RegistrationHandler interface {
CleanupPluginDirectory(string) error
}
// ClientHandler is an interface for handling device plugin connections.
type ClientHandler interface {
PluginConnected(string, DevicePlugin) error
PluginDisconnected(string)
PluginListAndWatchReceiver(string, *api.ListAndWatchResponse)
}
// TODO: evaluate whether we need these error definitions.
const (
// errFailedToDialDevicePlugin is the error raised when the device plugin could not be
// reached on the registered socket
errFailedToDialDevicePlugin = "failed to dial device plugin:"
// errUnsupportedVersion is the error raised when the device plugin uses an API version not
// supported by the Kubelet registry
errUnsupportedVersion = "requested API version %q is not supported by kubelet. Supported version is %q"
// errInvalidResourceName is the error raised when a device plugin is registering
// itself with an invalid ResourceName
errInvalidResourceName = "the ResourceName %q is invalid"
// errBadSocket is the error raised when the registry socket path is not absolute
errBadSocket = "bad socketPath, must be an absolute path:"
)

View File

@ -0,0 +1,143 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
)
// DevicePlugin interface provides methods for accessing Device Plugin resources, API and unix socket.
type DevicePlugin interface {
API() api.DevicePluginClient
Resource() string
SocketPath() string
}
// Client interface provides methods for establishing/closing gRPC connection and running the device plugin gRPC client.
type Client interface {
Connect() error
Run()
Disconnect() error
}
type client struct {
mutex sync.Mutex
resource string
socket string
grpc *grpc.ClientConn
handler ClientHandler
client api.DevicePluginClient
}
// NewPluginClient returns an initialized device plugin client.
func NewPluginClient(r string, socketPath string, h ClientHandler) Client {
return &client{
resource: r,
socket: socketPath,
handler: h,
}
}
// Connect is for establishing a gRPC connection between device manager and device plugin.
func (c *client) Connect() error {
client, conn, err := dial(c.socket)
if err != nil {
klog.ErrorS(err, "Unable to connect to device plugin client with socket path", "path", c.socket)
return err
}
c.mutex.Lock()
c.grpc = conn
c.client = client
c.mutex.Unlock()
return c.handler.PluginConnected(c.resource, c)
}
// Run is for running the device plugin gRPC client.
func (c *client) Run() {
stream, err := c.client.ListAndWatch(context.Background(), &api.Empty{})
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
for {
response, err := stream.Recv()
if err != nil {
klog.ErrorS(err, "ListAndWatch ended unexpectedly for device plugin", "resource", c.resource)
return
}
klog.V(2).InfoS("State pushed for device plugin", "resource", c.resource, "resourceCapacity", len(response.Devices))
c.handler.PluginListAndWatchReceiver(c.resource, response)
}
}
// Disconnect is for closing gRPC connection between device manager and device plugin.
func (c *client) Disconnect() error {
c.mutex.Lock()
if c.grpc != nil {
if err := c.grpc.Close(); err != nil {
klog.V(2).ErrorS(err, "Failed to close grcp connection", "resource", c.Resource())
}
c.grpc = nil
}
c.mutex.Unlock()
c.handler.PluginDisconnected(c.resource)
return nil
}
func (c *client) Resource() string {
return c.resource
}
func (c *client) API() api.DevicePluginClient {
return c.client
}
func (c *client) SocketPath() string {
return c.socket
}
// dial establishes the gRPC communication with the registered device plugin. https://godoc.org/google.golang.org/grpc#Dial
func dial(unixSocketPath string) (api.DevicePluginClient, *grpc.ClientConn, error) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
c, err := grpc.DialContext(ctx, unixSocketPath,
grpc.WithAuthority("localhost"),
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}),
)
if err != nil {
return nil, nil, fmt.Errorf(errFailedToDialDevicePlugin+" %v", err)
}
return api.NewDevicePluginClient(c), c, nil
}
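// Illustrative sketch, not from the vendored source: a minimal ClientHandler
// wired to NewPluginClient. Connect only succeeds if a device plugin is already
// serving on the (hypothetical) socket path; Run then streams ListAndWatch
// updates into the handler until the stream ends. The handler type and resource
// name below are hypothetical.
type loggingHandlerSketch struct{}

func (loggingHandlerSketch) PluginConnected(resource string, p DevicePlugin) error {
	klog.InfoS("plugin connected", "resource", resource, "socket", p.SocketPath())
	return nil
}

func (loggingHandlerSketch) PluginDisconnected(resource string) {
	klog.InfoS("plugin disconnected", "resource", resource)
}

func (loggingHandlerSketch) PluginListAndWatchReceiver(resource string, resp *api.ListAndWatchResponse) {
	klog.InfoS("devices reported", "resource", resource, "count", len(resp.Devices))
}

func runPluginClientSketch() error {
	c := NewPluginClient("example.com/gpu", "/var/lib/kubelet/device-plugins/example.sock", loggingHandlerSketch{})
	if err := c.Connect(); err != nil {
		return err
	}
	c.Run() // blocks until the ListAndWatch stream ends
	return c.Disconnect()
}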

View File

@ -0,0 +1,123 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"fmt"
"os"
"time"
core "k8s.io/api/core/v1"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
func (s *server) GetPluginHandler() cache.PluginHandler {
if f, err := os.Create(s.socketDir + "DEPRECATION"); err != nil {
klog.ErrorS(err, "Failed to create deprecation file at socket dir", "path", s.socketDir)
} else {
f.Close()
klog.V(4).InfoS("Created deprecation file", "path", f.Name())
}
return s
}
func (s *server) RegisterPlugin(pluginName string, endpoint string, versions []string, pluginClientTimeout *time.Duration) error {
klog.V(2).InfoS("Registering plugin at endpoint", "plugin", pluginName, "endpoint", endpoint)
return s.connectClient(pluginName, endpoint)
}
func (s *server) DeRegisterPlugin(pluginName string) {
klog.V(2).InfoS("Deregistering plugin", "plugin", pluginName)
client := s.getClient(pluginName)
if client != nil {
s.disconnectClient(pluginName, client)
}
}
func (s *server) ValidatePlugin(pluginName string, endpoint string, versions []string) error {
klog.V(2).InfoS("Got plugin at endpoint with versions", "plugin", pluginName, "endpoint", endpoint, "versions", versions)
if !s.isVersionCompatibleWithPlugin(versions...) {
return fmt.Errorf("manager version, %s, is not among plugin supported versions %v", api.Version, versions)
}
if !v1helper.IsExtendedResourceName(core.ResourceName(pluginName)) {
return fmt.Errorf("invalid name of device plugin socket: %s", fmt.Sprintf(errInvalidResourceName, pluginName))
}
return nil
}
func (s *server) connectClient(name string, socketPath string) error {
c := NewPluginClient(name, socketPath, s.chandler)
s.registerClient(name, c)
if err := c.Connect(); err != nil {
s.deregisterClient(name)
klog.ErrorS(err, "Failed to connect to new client", "resource", name)
return err
}
go func() {
s.runClient(name, c)
}()
return nil
}
func (s *server) disconnectClient(name string, c Client) error {
s.deregisterClient(name)
return c.Disconnect()
}
func (s *server) registerClient(name string, c Client) {
s.mutex.Lock()
defer s.mutex.Unlock()
s.clients[name] = c
klog.V(2).InfoS("Registered client", "name", name)
}
func (s *server) deregisterClient(name string) {
s.mutex.Lock()
defer s.mutex.Unlock()
delete(s.clients, name)
klog.V(2).InfoS("Deregistered client", "name", name)
}
func (s *server) runClient(name string, c Client) {
c.Run()
c = s.getClient(name)
if c == nil {
return
}
if err := s.disconnectClient(name, c); err != nil {
klog.V(2).InfoS("Unable to disconnect client", "resource", name, "client", c, "err", err)
}
}
func (s *server) getClient(name string) Client {
s.mutex.Lock()
defer s.mutex.Unlock()
return s.clients[name]
}

View File

@ -0,0 +1,224 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"sync"
"github.com/opencontainers/selinux/go-selinux"
"google.golang.org/grpc"
core "k8s.io/api/core/v1"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/klog/v2"
api "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// Server interface provides methods for Device plugin registration server.
type Server interface {
cache.PluginHandler
healthz.HealthChecker
Start() error
Stop() error
SocketPath() string
}
type server struct {
socketName string
socketDir string
mutex sync.Mutex
wg sync.WaitGroup
grpc *grpc.Server
rhandler RegistrationHandler
chandler ClientHandler
clients map[string]Client
// isStarted indicates whether the service has started successfully.
isStarted bool
}
// NewServer returns an initialized device plugin registration server.
func NewServer(socketPath string, rh RegistrationHandler, ch ClientHandler) (Server, error) {
if socketPath == "" || !filepath.IsAbs(socketPath) {
return nil, fmt.Errorf(errBadSocket+" %s", socketPath)
}
dir, name := filepath.Split(socketPath)
klog.V(2).InfoS("Creating device plugin registration server", "version", api.Version, "socket", socketPath)
s := &server{
socketName: name,
socketDir: dir,
rhandler: rh,
chandler: ch,
clients: make(map[string]Client),
}
return s, nil
}
func (s *server) Start() error {
klog.V(2).InfoS("Starting device plugin registration server")
if err := os.MkdirAll(s.socketDir, 0750); err != nil {
klog.ErrorS(err, "Failed to create the device plugin socket directory", "directory", s.socketDir)
return err
}
if selinux.GetEnabled() {
if err := selinux.SetFileLabel(s.socketDir, config.KubeletPluginsDirSELinuxLabel); err != nil {
klog.InfoS("Unprivileged containerized plugins might not work. Could not set selinux context on socket dir", "path", s.socketDir, "err", err)
}
}
// For now, we leave cleanup of the *entire* directory up to the Handler
// (even though we should in theory be able to just wipe the whole directory)
// because the Handler stores its checkpoint file (amongst others) in here.
if err := s.rhandler.CleanupPluginDirectory(s.socketDir); err != nil {
klog.ErrorS(err, "Failed to cleanup the device plugin directory", "directory", s.socketDir)
return err
}
ln, err := net.Listen("unix", s.SocketPath())
if err != nil {
klog.ErrorS(err, "Failed to listen to socket while starting device plugin registry")
return err
}
s.wg.Add(1)
s.grpc = grpc.NewServer([]grpc.ServerOption{}...)
api.RegisterRegistrationServer(s.grpc, s)
go func() {
defer s.wg.Done()
s.setHealthy()
if err = s.grpc.Serve(ln); err != nil {
s.setUnhealthy()
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
return nil
}
func (s *server) Stop() error {
s.visitClients(func(r string, c Client) {
if err := s.disconnectClient(r, c); err != nil {
klog.InfoS("Error disconnecting device plugin client", "resourceName", r, "err", err)
}
})
s.mutex.Lock()
defer s.mutex.Unlock()
if s.grpc == nil {
return nil
}
s.grpc.Stop()
s.wg.Wait()
s.grpc = nil
// During kubelet termination, we do not need the registration server,
// and we consider the kubelet to be healthy even when it is down.
s.setHealthy()
return nil
}
func (s *server) SocketPath() string {
return filepath.Join(s.socketDir, s.socketName)
}
func (s *server) Register(ctx context.Context, r *api.RegisterRequest) (*api.Empty, error) {
klog.InfoS("Got registration request from device plugin with resource", "resourceName", r.ResourceName)
metrics.DevicePluginRegistrationCount.WithLabelValues(r.ResourceName).Inc()
if !s.isVersionCompatibleWithPlugin(r.Version) {
err := fmt.Errorf(errUnsupportedVersion, r.Version, api.SupportedVersions)
klog.InfoS("Bad registration request from device plugin with resource", "resourceName", r.ResourceName, "err", err)
return &api.Empty{}, err
}
if !v1helper.IsExtendedResourceName(core.ResourceName(r.ResourceName)) {
err := fmt.Errorf(errInvalidResourceName, r.ResourceName)
klog.InfoS("Bad registration request from device plugin", "err", err)
return &api.Empty{}, err
}
if err := s.connectClient(r.ResourceName, filepath.Join(s.socketDir, r.Endpoint)); err != nil {
klog.InfoS("Error connecting to device plugin client", "err", err)
return &api.Empty{}, err
}
return &api.Empty{}, nil
}
func (s *server) isVersionCompatibleWithPlugin(versions ...string) bool {
// TODO(vikasc): Currently this is fine as we only have a single supported version. When we do need to support
// multiple versions in the future, we may need to extend this function to return a supported version.
// E.g., say kubelet supports v1beta1 and v1beta2, and we get v1alpha1 and v1beta1 from a device plugin,
// this function should return v1beta1
for _, version := range versions {
for _, supportedVersion := range api.SupportedVersions {
if version == supportedVersion {
return true
}
}
}
return false
}
func (s *server) visitClients(visit func(r string, c Client)) {
s.mutex.Lock()
for r, c := range s.clients {
s.mutex.Unlock()
visit(r, c)
s.mutex.Lock()
}
s.mutex.Unlock()
}
func (s *server) Name() string {
return "device-plugin"
}
func (s *server) Check(_ *http.Request) error {
if s.isStarted {
return nil
}
return fmt.Errorf("device plugin registration gRPC server failed and no device plugins can register")
}
// setHealthy sets the health status of the gRPC server.
func (s *server) setHealthy() {
s.isStarted = true
}
// setUnhealthy sets the health status of the gRPC server to unhealthy.
func (s *server) setUnhealthy() {
s.isStarted = false
}
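// Illustrative wiring sketch, not from the vendored source: the registration
// server listens on the kubelet device-plugin socket and hands registered
// plugins to the ClientHandler. The stub handler type and socket path below are
// hypothetical stand-ins for the device manager.
type stubHandlersSketch struct{}

func (stubHandlersSketch) CleanupPluginDirectory(dir string) error { return nil }

func (stubHandlersSketch) PluginConnected(resource string, p DevicePlugin) error { return nil }

func (stubHandlersSketch) PluginDisconnected(resource string) {}

func (stubHandlersSketch) PluginListAndWatchReceiver(resource string, resp *api.ListAndWatchResponse) {
}

func runRegistrationServerSketch() error {
	h := stubHandlersSketch{}
	srv, err := NewServer("/var/lib/kubelet/device-plugins/kubelet.sock", h, h)
	if err != nil {
		return err
	}
	if err := srv.Start(); err != nil {
		return err
	}
	// ... serve until shutdown, then:
	return srv.Stop()
}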

View File

@ -0,0 +1,388 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1beta1
import (
"context"
"net"
"os"
"path/filepath"
"sync"
"time"
"github.com/fsnotify/fsnotify"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
watcherapi "k8s.io/kubelet/pkg/apis/pluginregistration/v1"
)
// Stub implementation for DevicePlugin.
type Stub struct {
devs []*pluginapi.Device
socket string
resourceName string
preStartContainerFlag bool
getPreferredAllocationFlag bool
stop chan interface{}
wg sync.WaitGroup
update chan []*pluginapi.Device
server *grpc.Server
// allocFunc is used for handling allocation request
allocFunc stubAllocFunc
// getPreferredAllocFunc is used for handling getPreferredAllocation request
getPreferredAllocFunc stubGetPreferredAllocFunc
// registerControlFunc is used for controlling auto-registration of requests
registerControlFunc stubRegisterControlFunc
registrationStatus chan watcherapi.RegistrationStatus // for testing
endpoint string // for testing
kubeletRestartWatcher *fsnotify.Watcher
}
// stubGetPreferredAllocFunc is the function called when a getPreferredAllocation request is received from Kubelet
type stubGetPreferredAllocFunc func(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error)
func defaultGetPreferredAllocFunc(r *pluginapi.PreferredAllocationRequest, devs map[string]pluginapi.Device) (*pluginapi.PreferredAllocationResponse, error) {
var response pluginapi.PreferredAllocationResponse
return &response, nil
}
// stubAllocFunc is the function called when an allocation request is received from Kubelet
type stubAllocFunc func(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error)
func defaultAllocFunc(r *pluginapi.AllocateRequest, devs map[string]pluginapi.Device) (*pluginapi.AllocateResponse, error) {
var response pluginapi.AllocateResponse
return &response, nil
}
// stubRegisterControlFunc is the function used to control whether the stub (re-)registers with the Kubelet
type stubRegisterControlFunc func() bool
func defaultRegisterControlFunc() bool {
return true
}
// NewDevicePluginStub returns an initialized DevicePlugin Stub.
func NewDevicePluginStub(devs []*pluginapi.Device, socket string, name string, preStartContainerFlag bool, getPreferredAllocationFlag bool) *Stub {
watcher, err := fsnotify.NewWatcher()
if err != nil {
klog.ErrorS(err, "Watcher creation failed")
panic(err)
}
return &Stub{
devs: devs,
socket: socket,
resourceName: name,
preStartContainerFlag: preStartContainerFlag,
getPreferredAllocationFlag: getPreferredAllocationFlag,
registerControlFunc: defaultRegisterControlFunc,
stop: make(chan interface{}),
update: make(chan []*pluginapi.Device),
allocFunc: defaultAllocFunc,
getPreferredAllocFunc: defaultGetPreferredAllocFunc,
kubeletRestartWatcher: watcher,
}
}
// SetGetPreferredAllocFunc sets the getPreferredAllocFunc of the device plugin
func (m *Stub) SetGetPreferredAllocFunc(f stubGetPreferredAllocFunc) {
m.getPreferredAllocFunc = f
}
// SetAllocFunc sets allocFunc of the device plugin
func (m *Stub) SetAllocFunc(f stubAllocFunc) {
m.allocFunc = f
}
// SetRegisterControlFunc sets the registerControlFunc of the device plugin
func (m *Stub) SetRegisterControlFunc(f stubRegisterControlFunc) {
m.registerControlFunc = f
}
// Start starts the gRPC server of the device plugin. Can only
// be called once.
func (m *Stub) Start() error {
klog.InfoS("Starting device plugin server")
err := m.cleanup()
if err != nil {
return err
}
sock, err := net.Listen("unix", m.socket)
if err != nil {
return err
}
m.wg.Add(1)
m.server = grpc.NewServer([]grpc.ServerOption{}...)
pluginapi.RegisterDevicePluginServer(m.server, m)
watcherapi.RegisterRegistrationServer(m.server, m)
err = m.kubeletRestartWatcher.Add(filepath.Dir(m.socket))
if err != nil {
klog.ErrorS(err, "Failed to add watch", "devicePluginPath", pluginapi.DevicePluginPath)
return err
}
go func() {
defer m.wg.Done()
if err = m.server.Serve(sock); err != nil {
klog.ErrorS(err, "Error while serving device plugin registration grpc server")
}
}()
var lastDialErr error
wait.PollImmediate(1*time.Second, 10*time.Second, func() (bool, error) {
var conn *grpc.ClientConn
_, conn, lastDialErr = dial(m.socket)
if lastDialErr != nil {
return false, nil
}
conn.Close()
return true, nil
})
if lastDialErr != nil {
return lastDialErr
}
klog.InfoS("Starting to serve on socket", "socket", m.socket)
return nil
}
func (m *Stub) Restart() error {
klog.InfoS("Restarting Device Plugin server")
if m.server == nil {
return nil
}
m.server.Stop()
m.server = nil
return m.Start()
}
// Stop stops the gRPC server. Can be called without a prior Start
// and more than once. Not safe to be called concurrently by different
// goroutines!
func (m *Stub) Stop() error {
klog.InfoS("Stopping device plugin server")
if m.server == nil {
return nil
}
m.kubeletRestartWatcher.Close()
m.server.Stop()
m.wg.Wait()
m.server = nil
close(m.stop) // This prevents re-starting the server.
return m.cleanup()
}
func (m *Stub) Watch(kubeletEndpoint, resourceName, pluginSockDir string) {
for {
select {
// Detect a kubelet restart by watching for a newly created
// 'pluginapi.KubeletSocket' file. When this occurs, restart
// the device plugin server
case event := <-m.kubeletRestartWatcher.Events:
if event.Name == kubeletEndpoint && event.Op&fsnotify.Create == fsnotify.Create {
klog.InfoS("inotify: file created, restarting", "kubeletEndpoint", kubeletEndpoint)
var lastErr error
err := wait.PollUntilContextTimeout(context.Background(), 10*time.Second, 2*time.Minute, false, func(context.Context) (done bool, err error) {
restartErr := m.Restart()
if restartErr == nil {
return true, nil
}
klog.ErrorS(restartErr, "Retrying after error")
lastErr = restartErr
return false, nil
})
if err != nil {
klog.ErrorS(err, "Unable to restart server: wait timed out", "lastErr", lastErr.Error())
panic(err)
}
if ok := m.registerControlFunc(); ok {
if err := m.Register(kubeletEndpoint, resourceName, pluginSockDir); err != nil {
klog.ErrorS(err, "Unable to register to kubelet")
panic(err)
}
}
}
// Watch for any other fs errors and log them.
case err := <-m.kubeletRestartWatcher.Errors:
klog.ErrorS(err, "inotify error")
}
}
}
// GetInfo is the RPC which returns the pluginInfo
func (m *Stub) GetInfo(ctx context.Context, req *watcherapi.InfoRequest) (*watcherapi.PluginInfo, error) {
klog.InfoS("GetInfo")
return &watcherapi.PluginInfo{
Type: watcherapi.DevicePlugin,
Name: m.resourceName,
Endpoint: m.endpoint,
SupportedVersions: []string{pluginapi.Version}}, nil
}
// NotifyRegistrationStatus receives the registration notification from watcher
func (m *Stub) NotifyRegistrationStatus(ctx context.Context, status *watcherapi.RegistrationStatus) (*watcherapi.RegistrationStatusResponse, error) {
if m.registrationStatus != nil {
m.registrationStatus <- *status
}
if !status.PluginRegistered {
klog.InfoS("Registration failed", "err", status.Error)
}
return &watcherapi.RegistrationStatusResponse{}, nil
}
// Register registers the device plugin for the given resourceName with Kubelet.
func (m *Stub) Register(kubeletEndpoint, resourceName string, pluginSockDir string) error {
klog.InfoS("Register", "kubeletEndpoint", kubeletEndpoint, "resourceName", resourceName, "socket", pluginSockDir)
if pluginSockDir != "" {
if _, err := os.Stat(pluginSockDir + "DEPRECATION"); err == nil {
klog.InfoS("Deprecation file found. Skip registration")
return nil
}
}
klog.InfoS("Deprecation file not found. Invoke registration")
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
conn, err := grpc.DialContext(ctx, kubeletEndpoint,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, "unix", addr)
}))
if err != nil {
return err
}
defer conn.Close()
client := pluginapi.NewRegistrationClient(conn)
reqt := &pluginapi.RegisterRequest{
Version: pluginapi.Version,
Endpoint: filepath.Base(m.socket),
ResourceName: resourceName,
Options: &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
},
}
_, err = client.Register(context.Background(), reqt)
if err != nil {
// Stop server
m.server.Stop()
klog.ErrorS(err, "Client unable to register to kubelet")
return err
}
klog.InfoS("Device Plugin registered with the Kubelet")
return err
}
// GetDevicePluginOptions returns DevicePluginOptions settings for the device plugin.
func (m *Stub) GetDevicePluginOptions(ctx context.Context, e *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
options := &pluginapi.DevicePluginOptions{
PreStartRequired: m.preStartContainerFlag,
GetPreferredAllocationAvailable: m.getPreferredAllocationFlag,
}
return options, nil
}
// PreStartContainer resets the devices received
func (m *Stub) PreStartContainer(ctx context.Context, r *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
klog.InfoS("PreStartContainer", "request", r)
return &pluginapi.PreStartContainerResponse{}, nil
}
// ListAndWatch lists devices and updates that list according to the Update call
func (m *Stub) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
klog.InfoS("ListAndWatch")
s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
for {
select {
case <-m.stop:
return nil
case updated := <-m.update:
s.Send(&pluginapi.ListAndWatchResponse{Devices: updated})
}
}
}
// Update allows the device plugin to send new devices through ListAndWatch
func (m *Stub) Update(devs []*pluginapi.Device) {
m.update <- devs
}
// GetPreferredAllocation gets the preferred allocation from a set of available devices
func (m *Stub) GetPreferredAllocation(ctx context.Context, r *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
klog.InfoS("GetPreferredAllocation", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.getPreferredAllocFunc(r, devs)
}
// Allocate does a mock allocation
func (m *Stub) Allocate(ctx context.Context, r *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
klog.InfoS("Allocate", "request", r)
devs := make(map[string]pluginapi.Device)
for _, dev := range m.devs {
devs[dev.ID] = *dev
}
return m.allocFunc(r, devs)
}
func (m *Stub) cleanup() error {
if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
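
Editorial note: the Stub above is the reference test double for a device plugin. A rough sketch of how a test might drive it end to end follows; it assumes the stub package is importable from pkg/kubelet/cm/devicemanager/plugin/v1beta1, and the resource name and socket name are illustrative. The pluginapi constants come from the v1beta1 device plugin API.

package main

import (
	"path/filepath"

	"k8s.io/klog/v2"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	plugin "k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/plugin/v1beta1"
)

func main() {
	// Two fake devices advertised by the stub (IDs are illustrative).
	devs := []*pluginapi.Device{
		{ID: "dev-0", Health: pluginapi.Healthy},
		{ID: "dev-1", Health: pluginapi.Healthy},
	}

	// The stub listens on its own socket inside the kubelet device-plugin directory.
	socket := filepath.Join(pluginapi.DevicePluginPath, "example.sock")
	stub := plugin.NewDevicePluginStub(devs, socket, "example.com/device", false, false)

	// Start serves the stub's own gRPC socket.
	if err := stub.Start(); err != nil {
		klog.Fatalf("failed to start stub: %v", err)
	}
	defer stub.Stop()

	// Register dials the kubelet registration socket and announces the resource.
	if err := stub.Register(pluginapi.KubeletSocket, "example.com/device", pluginapi.DevicePluginPath); err != nil {
		klog.Fatalf("failed to register with kubelet: %v", err)
	}

	// Later, push an updated device list to the kubelet through ListAndWatch.
	stub.Update([]*pluginapi.Device{{ID: "dev-0", Health: pluginapi.Unhealthy}})
}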

View File

@ -0,0 +1,456 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"sync"
"k8s.io/klog/v2"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type deviceAllocateInfo struct {
// deviceIds contains device Ids allocated to this container for the given resourceName.
deviceIds checkpoint.DevicesPerNUMA
// allocResp contains cached rpc AllocateResponse.
allocResp *pluginapi.ContainerAllocateResponse
}
type resourceAllocateInfo map[string]deviceAllocateInfo // Keyed by resourceName.
type containerDevices map[string]resourceAllocateInfo // Keyed by containerName.
type podDevices struct {
sync.RWMutex
devs map[string]containerDevices // Keyed by podUID.
}
// newPodDevices returns a podDevices object with its own guard
// RWMutex and a map where the key is a pod UID and the value contains
// container device information of type containerDevices.
func newPodDevices() *podDevices {
return &podDevices{devs: make(map[string]containerDevices)}
}
func (pdev *podDevices) pods() sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for k := range pdev.devs {
ret.Insert(k)
}
return ret
}
func (pdev *podDevices) size() int {
pdev.RLock()
defer pdev.RUnlock()
return len(pdev.devs)
}
func (pdev *podDevices) hasPod(podUID string) bool {
pdev.RLock()
defer pdev.RUnlock()
_, podExists := pdev.devs[podUID]
return podExists
}
func (pdev *podDevices) insert(podUID, contName, resource string, devices checkpoint.DevicesPerNUMA, resp *pluginapi.ContainerAllocateResponse) {
pdev.Lock()
defer pdev.Unlock()
if _, podExists := pdev.devs[podUID]; !podExists {
pdev.devs[podUID] = make(containerDevices)
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
pdev.devs[podUID][contName] = make(resourceAllocateInfo)
}
pdev.devs[podUID][contName][resource] = deviceAllocateInfo{
deviceIds: devices,
allocResp: resp,
}
}
func (pdev *podDevices) delete(pods []string) {
pdev.Lock()
defer pdev.Unlock()
for _, uid := range pods {
delete(pdev.devs, uid)
}
}
// Returns list of device Ids allocated to the given pod for the given resource.
// Returns nil if we don't have cached state for the given <podUID, resource>.
func (pdev *podDevices) podDevices(podUID, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
ret := sets.New[string]()
for contName := range pdev.devs[podUID] {
ret = ret.Union(pdev.containerDevices(podUID, contName, resource))
}
return ret
}
// Returns list of device Ids allocated to the given container for the given resource.
// Returns nil if we don't have cached state for the given <podUID, contName, resource>.
func (pdev *podDevices) containerDevices(podUID, contName, resource string) sets.Set[string] {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
devs, resourceExists := pdev.devs[podUID][contName][resource]
if !resourceExists {
return nil
}
return devs.deviceIds.Devices()
}
// Populates allocatedResources with the device resources allocated to the specified <podUID, contName>.
func (pdev *podDevices) addContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Union(devices.deviceIds.Devices())
}
}
// Removes the device resources allocated to the specified <podUID, contName> from allocatedResources.
func (pdev *podDevices) removeContainerAllocatedResources(podUID, contName string, allocatedResources map[string]sets.Set[string]) {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return
}
resources, exists := containers[contName]
if !exists {
return
}
for resource, devices := range resources {
allocatedResources[resource] = allocatedResources[resource].Difference(devices.deviceIds.Devices())
}
}
// Returns all devices allocated to the pods being tracked, keyed by resourceName.
func (pdev *podDevices) devices() map[string]sets.Set[string] {
ret := make(map[string]sets.Set[string])
pdev.RLock()
defer pdev.RUnlock()
for _, containerDevices := range pdev.devs {
for _, resources := range containerDevices {
for resource, devices := range resources {
if _, exists := ret[resource]; !exists {
ret[resource] = sets.New[string]()
}
if devices.allocResp != nil {
ret[resource] = ret[resource].Union(devices.deviceIds.Devices())
}
}
}
}
return ret
}
// Returns podUID and containerName for a device
func (pdev *podDevices) getPodAndContainerForDevice(deviceID string) (string, string) {
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for containerName, resources := range containerDevices {
for _, devices := range resources {
if devices.deviceIds.Devices().Has(deviceID) {
return podUID, containerName
}
}
}
}
return "", ""
}
// Turns podDevices to checkpointData.
func (pdev *podDevices) toCheckpointData() []checkpoint.PodDevicesEntry {
var data []checkpoint.PodDevicesEntry
pdev.RLock()
defer pdev.RUnlock()
for podUID, containerDevices := range pdev.devs {
for conName, resources := range containerDevices {
for resource, devices := range resources {
if devices.allocResp == nil {
klog.ErrorS(nil, "Can't marshal allocResp, allocation response is missing", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
allocResp, err := devices.allocResp.Marshal()
if err != nil {
klog.ErrorS(err, "Can't marshal allocResp", "podUID", podUID, "containerName", conName, "resourceName", resource)
continue
}
data = append(data, checkpoint.PodDevicesEntry{
PodUID: podUID,
ContainerName: conName,
ResourceName: resource,
DeviceIDs: devices.deviceIds,
AllocResp: allocResp})
}
}
}
return data
}
// Populates podDevices from the passed in checkpointData.
func (pdev *podDevices) fromCheckpointData(data []checkpoint.PodDevicesEntry) {
for _, entry := range data {
klog.V(2).InfoS("Get checkpoint entry",
"podUID", entry.PodUID, "containerName", entry.ContainerName,
"resourceName", entry.ResourceName, "deviceIDs", entry.DeviceIDs, "allocated", entry.AllocResp)
allocResp := &pluginapi.ContainerAllocateResponse{}
err := allocResp.Unmarshal(entry.AllocResp)
if err != nil {
klog.ErrorS(err, "Can't unmarshal allocResp", "podUID", entry.PodUID, "containerName", entry.ContainerName, "resourceName", entry.ResourceName)
continue
}
pdev.insert(entry.PodUID, entry.ContainerName, entry.ResourceName, entry.DeviceIDs, allocResp)
}
}
// Returns combined container runtime settings to consume the container's allocated devices.
func (pdev *podDevices) deviceRunContainerOptions(podUID, contName string) *DeviceRunContainerOptions {
pdev.RLock()
defer pdev.RUnlock()
containers, exists := pdev.devs[podUID]
if !exists {
return nil
}
resources, exists := containers[contName]
if !exists {
return nil
}
opts := &DeviceRunContainerOptions{}
// Maps to detect duplicate settings.
devsMap := make(map[string]string)
mountsMap := make(map[string]string)
envsMap := make(map[string]string)
annotationsMap := make(map[string]string)
// Keep track of all CDI devices requested for the container.
allCDIDevices := sets.New[string]()
// Loops through AllocationResponses of all cached device resources.
for _, devices := range resources {
resp := devices.allocResp
// Each Allocate response has the following artifacts.
// Environment variables
// Mount points
// Device files
// Container annotations
// CDI device IDs
// These artifacts are per resource per container.
// Updates RunContainerOptions.Envs.
for k, v := range resp.Envs {
if e, ok := envsMap[k]; ok {
klog.V(4).InfoS("Skip existing env", "envKey", k, "envValue", v)
if e != v {
klog.ErrorS(nil, "Environment variable has conflicting setting", "envKey", k, "expected", v, "got", e)
}
continue
}
klog.V(4).InfoS("Add env", "envKey", k, "envValue", v)
envsMap[k] = v
opts.Envs = append(opts.Envs, kubecontainer.EnvVar{Name: k, Value: v})
}
// Updates RunContainerOptions.Devices.
for _, dev := range resp.Devices {
if d, ok := devsMap[dev.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
if d != dev.HostPath {
klog.ErrorS(nil, "Container device has conflicting mapping host devices",
"containerPath", dev.ContainerPath, "got", d, "expected", dev.HostPath)
}
continue
}
klog.V(4).InfoS("Add device", "containerPath", dev.ContainerPath, "hostPath", dev.HostPath)
devsMap[dev.ContainerPath] = dev.HostPath
opts.Devices = append(opts.Devices, kubecontainer.DeviceInfo{
PathOnHost: dev.HostPath,
PathInContainer: dev.ContainerPath,
Permissions: dev.Permissions,
})
}
// Updates RunContainerOptions.Mounts.
for _, mount := range resp.Mounts {
if m, ok := mountsMap[mount.ContainerPath]; ok {
klog.V(4).InfoS("Skip existing mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
if m != mount.HostPath {
klog.ErrorS(nil, "Container mount has conflicting mapping host mounts",
"containerPath", mount.ContainerPath, "conflictingPath", m, "hostPath", mount.HostPath)
}
continue
}
klog.V(4).InfoS("Add mount", "containerPath", mount.ContainerPath, "hostPath", mount.HostPath)
mountsMap[mount.ContainerPath] = mount.HostPath
opts.Mounts = append(opts.Mounts, kubecontainer.Mount{
Name: mount.ContainerPath,
ContainerPath: mount.ContainerPath,
HostPath: mount.HostPath,
ReadOnly: mount.ReadOnly,
// TODO: This may need to be part of Device plugin API.
SELinuxRelabel: false,
})
}
// Updates for Annotations
for k, v := range resp.Annotations {
if e, ok := annotationsMap[k]; ok {
klog.V(4).InfoS("Skip existing annotation", "annotationKey", k, "annotationValue", v)
if e != v {
klog.ErrorS(nil, "Annotation has conflicting setting", "annotationKey", k, "expected", e, "got", v)
}
continue
}
klog.V(4).InfoS("Add annotation", "annotationKey", k, "annotationValue", v)
annotationsMap[k] = v
opts.Annotations = append(opts.Annotations, kubecontainer.Annotation{Name: k, Value: v})
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.DevicePluginCDIDevices) {
// Updates for CDI devices.
cdiDevices := getCDIDeviceInfo(resp, allCDIDevices)
opts.CDIDevices = append(opts.CDIDevices, cdiDevices...)
}
}
return opts
}
// getCDIDeviceInfo returns CDI devices from an allocate response
func getCDIDeviceInfo(resp *pluginapi.ContainerAllocateResponse, knownCDIDevices sets.Set[string]) []kubecontainer.CDIDevice {
var cdiDevices []kubecontainer.CDIDevice
for _, cdiDevice := range resp.CDIDevices {
if knownCDIDevices.Has(cdiDevice.Name) {
klog.V(4).InfoS("Skip existing CDI Device", "name", cdiDevice.Name)
continue
}
klog.V(4).InfoS("Add CDI device", "name", cdiDevice.Name)
knownCDIDevices.Insert(cdiDevice.Name)
device := kubecontainer.CDIDevice{
Name: cdiDevice.Name,
}
cdiDevices = append(cdiDevices, device)
}
return cdiDevices
}
// getContainerDevices returns the devices assigned to the provided container for all ResourceNames
func (pdev *podDevices) getContainerDevices(podUID, contName string) ResourceDeviceInstances {
pdev.RLock()
defer pdev.RUnlock()
if _, podExists := pdev.devs[podUID]; !podExists {
return nil
}
if _, contExists := pdev.devs[podUID][contName]; !contExists {
return nil
}
resDev := NewResourceDeviceInstances()
for resource, allocateInfo := range pdev.devs[podUID][contName] {
if len(allocateInfo.deviceIds) == 0 {
continue
}
devicePluginMap := make(map[string]pluginapi.Device)
for numaid, devlist := range allocateInfo.deviceIds {
for _, devID := range devlist {
var topology *pluginapi.TopologyInfo
if numaid != nodeWithoutTopology {
NUMANodes := []*pluginapi.NUMANode{{ID: numaid}}
if pDev, ok := devicePluginMap[devID]; ok && pDev.Topology != nil {
if nodes := pDev.Topology.GetNodes(); nodes != nil {
NUMANodes = append(NUMANodes, nodes...)
}
}
// ID and Healthy are not relevant here.
topology = &pluginapi.TopologyInfo{Nodes: NUMANodes}
}
devicePluginMap[devID] = pluginapi.Device{
Topology: topology,
}
}
}
resDev[resource] = devicePluginMap
}
return resDev
}
// DeviceInstances is a mapping device name -> plugin device data
type DeviceInstances map[string]pluginapi.Device
// ResourceDeviceInstances is a mapping resource name -> DeviceInstances
type ResourceDeviceInstances map[string]DeviceInstances
// NewResourceDeviceInstances returns a new ResourceDeviceInstances
func NewResourceDeviceInstances() ResourceDeviceInstances {
return make(ResourceDeviceInstances)
}
// Clone returns a clone of ResourceDeviceInstances
func (rdev ResourceDeviceInstances) Clone() ResourceDeviceInstances {
clone := NewResourceDeviceInstances()
for resourceName, resourceDevs := range rdev {
clone[resourceName] = make(map[string]pluginapi.Device)
for devID, dev := range resourceDevs {
clone[resourceName][devID] = dev
}
}
return clone
}
// Filter takes a condition set expressed as map[string]sets.Set[string] and returns a new
// ResourceDeviceInstances with only the devices matching the condition set.
func (rdev ResourceDeviceInstances) Filter(cond map[string]sets.Set[string]) ResourceDeviceInstances {
filtered := NewResourceDeviceInstances()
for resourceName, filterIDs := range cond {
if _, exists := rdev[resourceName]; !exists {
continue
}
filtered[resourceName] = DeviceInstances{}
for instanceID, instance := range rdev[resourceName] {
if filterIDs.Has(instanceID) {
filtered[resourceName][instanceID] = instance
}
}
}
return filtered
}
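
Editorial note: podDevices is unexported, so it can only be exercised from inside the devicemanager package (for example from a test). Below is a minimal in-package sketch of the cache and of the ResourceDeviceInstances helpers defined above; pod, container and device names are illustrative.

package devicemanager

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
)

func examplePodDevices() {
	pdev := newPodDevices()

	// Record that container "ctr" of pod "pod-uid" was allocated two devices
	// of resource "example.com/device", both on NUMA node 0.
	pdev.insert("pod-uid", "ctr", "example.com/device",
		checkpoint.DevicesPerNUMA{0: []string{"dev-0", "dev-1"}},
		&pluginapi.ContainerAllocateResponse{Envs: map[string]string{"EXAMPLE_VISIBLE_DEVICES": "dev-0,dev-1"}},
	)

	// Look the allocation back up per pod and per container.
	fmt.Println(pdev.podDevices("pod-uid", "example.com/device").UnsortedList())
	fmt.Println(pdev.containerDevices("pod-uid", "ctr", "example.com/device").UnsortedList())

	// Restrict a ResourceDeviceInstances view to the allocated IDs only.
	all := ResourceDeviceInstances{
		"example.com/device": DeviceInstances{
			"dev-0": pluginapi.Device{ID: "dev-0"},
			"dev-1": pluginapi.Device{ID: "dev-1"},
			"dev-2": pluginapi.Device{ID: "dev-2"},
		},
	}
	allocated := map[string]sets.Set[string]{
		"example.com/device": sets.New[string]("dev-0", "dev-1"),
	}
	fmt.Println(len(all.Filter(allocated)["example.com/device"])) // 2
}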

View File

@ -0,0 +1,252 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/resource"
"k8s.io/klog/v2"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)
// GetTopologyHints implements the TopologyManager HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for each
// container are created.
func (m *ManagerImpl) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
// Loop through all device resources and generate TopologyHints for them.
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getContainerDeviceRequest(container)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Container. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.containerDevices(string(pod.UID), container.Name, resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name, "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod), "containerName", container.Name)
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
reusable := m.devicesToReuse[string(pod.UID)][resource]
if available.Union(reusable).Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Union(reusable).Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, reusable, requested)
}
return deviceHints
}
// GetPodTopologyHints implements the topologymanager.HintProvider Interface which
// ensures the Device Manager is consulted when Topology Aware Hints for a Pod are created.
func (m *ManagerImpl) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded device resources before providing TopologyHints
m.UpdateAllocatedDevices()
deviceHints := make(map[string][]topologymanager.TopologyHint)
accumulatedResourceRequests := m.getPodDeviceRequest(pod)
m.mutex.Lock()
defer m.mutex.Unlock()
for resource, requested := range accumulatedResourceRequests {
// Only consider devices that actually contain topology information.
if aligned := m.deviceHasTopologyAlignment(resource); !aligned {
klog.InfoS("Resource does not have a topology preference", "resource", resource)
deviceHints[resource] = nil
continue
}
// Short circuit to regenerate the same hints if there are already
// devices allocated to the Pod. This might happen after a
// kubelet restart, for example.
allocated := m.podDevices.podDevices(string(pod.UID), resource)
if allocated.Len() > 0 {
if allocated.Len() != requested {
klog.ErrorS(nil, "Resource already allocated to pod with different number than request", "resource", resource, "pod", klog.KObj(pod), "request", requested, "allocated", allocated.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
klog.InfoS("Regenerating TopologyHints for resource already allocated to pod", "resource", resource, "pod", klog.KObj(pod))
deviceHints[resource] = m.generateDeviceTopologyHints(resource, allocated, sets.Set[string]{}, requested)
continue
}
// Get the list of available devices, for which TopologyHints should be generated.
available := m.getAvailableDevices(resource)
if available.Len() < requested {
klog.ErrorS(nil, "Unable to generate topology hints: requested number of devices unavailable", "resource", resource, "request", requested, "available", available.Len())
deviceHints[resource] = []topologymanager.TopologyHint{}
continue
}
// Generate TopologyHints for this resource given the current
// request size and the list of available devices.
deviceHints[resource] = m.generateDeviceTopologyHints(resource, available, sets.Set[string]{}, requested)
}
return deviceHints
}
func (m *ManagerImpl) deviceHasTopologyAlignment(resource string) bool {
// If any device has Topology NUMANodes available, we assume they care about alignment.
for _, device := range m.allDevices[resource] {
if device.Topology != nil && len(device.Topology.Nodes) > 0 {
return true
}
}
return false
}
func (m *ManagerImpl) getAvailableDevices(resource string) sets.Set[string] {
// Strip all devices in use from the list of healthy ones.
return m.healthyDevices[resource].Difference(m.allocatedDevices[resource])
}
func (m *ManagerImpl) generateDeviceTopologyHints(resource string, available sets.Set[string], reusable sets.Set[string], request int) []topologymanager.TopologyHint {
// Initialize minAffinitySize to include all NUMA Nodes
minAffinitySize := len(m.numaNodes)
// Iterate through all combinations of NUMA Nodes and build hints from them.
hints := []topologymanager.TopologyHint{}
bitmask.IterateBitMasks(m.numaNodes, func(mask bitmask.BitMask) {
// First, update minAffinitySize for the current request size.
devicesInMask := 0
for _, device := range m.allDevices[resource] {
if mask.AnySet(m.getNUMANodeIds(device.Topology)) {
devicesInMask++
}
}
if devicesInMask >= request && mask.Count() < minAffinitySize {
minAffinitySize = mask.Count()
}
// Then check to see if all the reusable devices are part of the bitmask.
numMatching := 0
for d := range reusable {
// Skip the device if it doesn't specify any topology info.
if m.allDevices[resource][d].Topology == nil {
continue
}
// Otherwise disregard this mask if its NUMANode isn't part of it.
if !mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
return
}
numMatching++
}
// Finally, check to see if enough available devices remain on the
// current NUMA node combination to satisfy the device request.
for d := range available {
if mask.AnySet(m.getNUMANodeIds(m.allDevices[resource][d].Topology)) {
numMatching++
}
}
// If they don't, then move onto the next combination.
if numMatching < request {
return
}
// Otherwise, create a new hint from the NUMA mask and add it to the
// list of hints. We set all hint preferences to 'false' on the first
// pass through.
hints = append(hints, topologymanager.TopologyHint{
NUMANodeAffinity: mask,
Preferred: false,
})
})
// Loop back through all hints and update the 'Preferred' field based on
// counting the number of bits sets in the affinity mask and comparing it
// to the minAffinity. Only those with an equal number of bits set will be
// considered preferred.
for i := range hints {
if hints[i].NUMANodeAffinity.Count() == minAffinitySize {
hints[i].Preferred = true
}
}
return hints
}
func (m *ManagerImpl) getNUMANodeIds(topology *pluginapi.TopologyInfo) []int {
if topology == nil {
return nil
}
var ids []int
for _, n := range topology.Nodes {
ids = append(ids, int(n.ID))
}
return ids
}
func (m *ManagerImpl) getPodDeviceRequest(pod *v1.Pod) map[string]int {
// for these device plugin resources, requests == limits
limits := resource.PodLimits(pod, resource.PodResourcesOptions{
ExcludeOverhead: true,
})
podRequests := make(map[string]int)
for resourceName, quantity := range limits {
if !m.isDevicePluginResource(string(resourceName)) {
continue
}
podRequests[string(resourceName)] = int(quantity.Value())
}
return podRequests
}
func (m *ManagerImpl) getContainerDeviceRequest(container *v1.Container) map[string]int {
containerRequests := make(map[string]int)
for resourceObj, requestedObj := range container.Resources.Limits {
resource := string(resourceObj)
requested := int(requestedObj.Value())
if !m.isDevicePluginResource(resource) {
continue
}
containerRequests[resource] = requested
}
return containerRequests
}
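
Editorial note: generateDeviceTopologyHints relies on bitmask.IterateBitMasks to enumerate every combination of NUMA nodes and marks the narrowest satisfying masks as preferred. A toy standalone sketch of that enumeration idea follows; the device-to-NUMA placement is an assumption and the reusable-device handling is left out for brevity.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
)

func main() {
	numaNodes := []int{0, 1}
	// Each device is pinned to a single NUMA node in this toy example.
	deviceNUMA := map[string]int{"dev-0": 0, "dev-1": 0, "dev-2": 1}
	request := 2

	minAffinitySize := len(numaNodes)
	var candidates []bitmask.BitMask

	bitmask.IterateBitMasks(numaNodes, func(mask bitmask.BitMask) {
		// Count how many devices fall inside this NUMA node combination.
		inMask := 0
		for _, node := range deviceNUMA {
			if mask.AnySet([]int{node}) {
				inMask++
			}
		}
		if inMask < request {
			return // this combination cannot satisfy the request
		}
		if mask.Count() < minAffinitySize {
			minAffinitySize = mask.Count()
		}
		candidates = append(candidates, mask)
	})

	// Only masks as narrow as the narrowest satisfying mask are preferred.
	for _, mask := range candidates {
		preferred := mask.Count() == minAffinitySize
		fmt.Printf("mask=%v preferred=%v\n", mask, preferred)
	}
}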

View File

@ -0,0 +1,123 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package devicemanager
import (
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/server/healthz"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
// Manager manages all the Device Plugins running on a node.
type Manager interface {
// Start starts device plugin registration service.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, initialContainers containermap.ContainerMap, initialContainerRunningSet sets.Set[string]) error
// Allocate configures and assigns devices to a container in a pod. From
// the requested device resources, Allocate will communicate with the
// owning device plugin to allow setup procedures to take place, and for
// the device plugin to provide runtime settings to use the device
// (environment variables, mount points and device files).
Allocate(pod *v1.Pod, container *v1.Container) error
// UpdatePluginResources updates node resources based on devices already
// allocated to pods. The node object is provided for the device manager to
// update the node capacity to reflect the currently available devices.
UpdatePluginResources(node *schedulerframework.NodeInfo, attrs *lifecycle.PodAdmitAttributes) error
// Stop stops the manager.
Stop() error
// GetDeviceRunContainerOptions checks whether we have cached containerDevices
// for the passed-in <pod, container> and returns its DeviceRunContainerOptions
// for the found one. An empty struct is returned in case no cached state is found.
GetDeviceRunContainerOptions(pod *v1.Pod, container *v1.Container) (*DeviceRunContainerOptions, error)
// GetCapacity returns the amount of available device plugin resource capacity, resource allocatable
// and inactive device plugin resources previously registered on the node.
GetCapacity() (v1.ResourceList, v1.ResourceList, []string)
// GetWatcherHandler returns the plugin handler for the device manager.
GetWatcherHandler() cache.PluginHandler
GetHealthChecker() healthz.HealthChecker
// GetDevices returns information about the devices assigned to pods and containers
GetDevices(podUID, containerName string) ResourceDeviceInstances
// UpdateAllocatedResourcesStatus updates the status of allocated resources for the pod.
UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus)
// GetAllocatableDevices returns information about all the devices known to the manager
GetAllocatableDevices() ResourceDeviceInstances
// ShouldResetExtendedResourceCapacity returns whether the extended resources should be reset or not,
// depending on the checkpoint file availability. Absence of the checkpoint file strongly indicates
// the node has been recreated.
ShouldResetExtendedResourceCapacity() bool
// GetTopologyHints implements the TopologyManager HintProvider interface, indicating that the Device Manager
// is consulted to make topology-aware resource alignments
GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the TopologyManager HintProvider interface, indicating that the Device Manager
// is consulted to make topology-aware resource alignments per Pod
GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint
// UpdateAllocatedDevices frees any Devices that are bound to terminated pods.
UpdateAllocatedDevices()
// Updates returns a channel that receives an Update when the device changed its status.
Updates() <-chan resourceupdates.Update
}
// DeviceRunContainerOptions contains the combined container runtime settings to consume its allocated devices.
type DeviceRunContainerOptions struct {
// The environment variables list.
Envs []kubecontainer.EnvVar
// The mounts for the container.
Mounts []kubecontainer.Mount
// The host devices mapped into the container.
Devices []kubecontainer.DeviceInfo
// The Annotations for the container
Annotations []kubecontainer.Annotation
// CDI Devices for the container
CDIDevices []kubecontainer.CDIDevice
}
// TODO: evaluate whether we need this error definition.
const (
errEndpointStopped = "endpoint %v has been stopped"
)
// endpointStopGracePeriod indicates the grace period after an endpoint is stopped
// because its device plugin fails. DeviceManager keeps the stopped endpoint in its
// cache during this grace period to cover the time gap for the capacity change to
// take effect.
const endpointStopGracePeriod = time.Duration(5) * time.Minute
// kubeletDeviceManagerCheckpoint is the file name of device plugin checkpoint
const kubeletDeviceManagerCheckpoint = "kubelet_internal_checkpoint"
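
Editorial note: DeviceRunContainerOptions is the exported bridge between the device manager and container runtime setup; each allocation contributes environment variables, device nodes, mounts, annotations and CDI devices. The sketch below shows one hypothetical way a consumer could flatten those options. The runtimeSpec type and string formats are illustrative assumptions, not how the kubelet runtime integration actually consumes them.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager"
	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)

// runtimeSpec is a purely illustrative stand-in for a runtime container config.
type runtimeSpec struct {
	Env     []string // "KEY=VALUE"
	Devices []string // "host:container:permissions"
	Binds   []string // "host:container[:ro]"
}

// applyDeviceOptions flattens device manager options into the spec.
func applyDeviceOptions(spec *runtimeSpec, opts *devicemanager.DeviceRunContainerOptions) {
	for _, e := range opts.Envs {
		spec.Env = append(spec.Env, fmt.Sprintf("%s=%s", e.Name, e.Value))
	}
	for _, d := range opts.Devices {
		spec.Devices = append(spec.Devices, fmt.Sprintf("%s:%s:%s", d.PathOnHost, d.PathInContainer, d.Permissions))
	}
	for _, m := range opts.Mounts {
		bind := fmt.Sprintf("%s:%s", m.HostPath, m.ContainerPath)
		if m.ReadOnly {
			bind += ":ro"
		}
		spec.Binds = append(spec.Binds, bind)
	}
}

func main() {
	opts := &devicemanager.DeviceRunContainerOptions{
		Envs:    []kubecontainer.EnvVar{{Name: "EXAMPLE_VISIBLE_DEVICES", Value: "dev-0"}},
		Devices: []kubecontainer.DeviceInfo{{PathOnHost: "/dev/example0", PathInContainer: "/dev/example0", Permissions: "mrw"}},
		Mounts:  []kubecontainer.Mount{{ContainerPath: "/usr/local/example", HostPath: "/opt/example", ReadOnly: true}},
	}
	spec := &runtimeSpec{}
	applyDeviceOptions(spec, opts)
	fmt.Printf("%+v\n", spec)
}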

21
vendor/k8s.io/kubernetes/pkg/kubelet/cm/doc.go generated vendored Normal file
View File

@ -0,0 +1,21 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Package cm (abbreviation of "container manager") and its subpackages contain all the kubelet code
// to manage containers. For example, they contain functions to configure containers' cgroups,
// ensure containers run with the desired QoS, and allocate compute resources like cpus, memory,
// devices...
package cm // import "k8s.io/kubernetes/pkg/kubelet/cm"

2
vendor/k8s.io/kubernetes/pkg/kubelet/cm/dra/OWNERS generated vendored Normal file
View File

@ -0,0 +1,2 @@
labels:
- wg/device-management

View File

@ -0,0 +1,222 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"errors"
"fmt"
"slices"
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
// ClaimInfo holds information required
// to prepare and unprepare a resource claim.
// +k8s:deepcopy-gen=true
type ClaimInfo struct {
state.ClaimInfoState
prepared bool
}
// claimInfoCache is a cache of processed resource claims keyed by namespace/claimname.
type claimInfoCache struct {
sync.RWMutex
checkpointer state.Checkpointer
claimInfo map[string]*ClaimInfo
}
// newClaimInfoFromClaim creates a new claim info from a resource claim.
// It verifies that the kubelet can handle the claim.
func newClaimInfoFromClaim(claim *resourceapi.ResourceClaim) (*ClaimInfo, error) {
claimInfoState := state.ClaimInfoState{
ClaimUID: claim.UID,
ClaimName: claim.Name,
Namespace: claim.Namespace,
PodUIDs: sets.New[string](),
DriverState: make(map[string]state.DriverState),
}
if claim.Status.Allocation == nil {
return nil, errors.New("not allocated")
}
for _, result := range claim.Status.Allocation.Devices.Results {
claimInfoState.DriverState[result.Driver] = state.DriverState{}
}
info := &ClaimInfo{
ClaimInfoState: claimInfoState,
prepared: false,
}
return info, nil
}
// newClaimInfoFromState creates a new claim info from a checkpointed claim info state object.
func newClaimInfoFromState(state *state.ClaimInfoState) *ClaimInfo {
info := &ClaimInfo{
ClaimInfoState: *state.DeepCopy(),
prepared: false,
}
return info
}
// addDevice adds the state of a prepared device to the claim info under the given driver name.
func (info *ClaimInfo) addDevice(driverName string, deviceState state.Device) {
if info.DriverState == nil {
info.DriverState = make(map[string]state.DriverState)
}
driverState := info.DriverState[driverName]
driverState.Devices = append(driverState.Devices, deviceState)
info.DriverState[driverName] = driverState
}
// addPodReference adds a pod reference to the claim info.
func (info *ClaimInfo) addPodReference(podUID types.UID) {
info.PodUIDs.Insert(string(podUID))
}
// hasPodReference checks if a pod reference exists in the claim info.
func (info *ClaimInfo) hasPodReference(podUID types.UID) bool {
return info.PodUIDs.Has(string(podUID))
}
// deletePodReference deletes a pod reference from the claim info.
func (info *ClaimInfo) deletePodReference(podUID types.UID) {
info.PodUIDs.Delete(string(podUID))
}
// setPrepared marks the claim info as prepared.
func (info *ClaimInfo) setPrepared() {
info.prepared = true
}
// isPrepared checks if claim info is prepared or not.
func (info *ClaimInfo) isPrepared() bool {
return info.prepared
}
// newClaimInfoCache creates a new claim info cache object, pre-populated from a checkpoint (if present).
func newClaimInfoCache(stateDir, checkpointName string) (*claimInfoCache, error) {
checkpointer, err := state.NewCheckpointer(stateDir, checkpointName)
if err != nil {
return nil, fmt.Errorf("could not initialize checkpoint manager, please drain node and remove dra state file, err: %w", err)
}
checkpoint, err := checkpointer.GetOrCreate()
if err != nil {
return nil, fmt.Errorf("error calling GetOrCreate() on checkpoint state: %w", err)
}
cache := &claimInfoCache{
checkpointer: checkpointer,
claimInfo: make(map[string]*ClaimInfo),
}
entries, err := checkpoint.GetClaimInfoStateList()
if err != nil {
return nil, fmt.Errorf("error calling GetEntries() on checkpoint: %w", err)
}
for _, entry := range entries {
info := newClaimInfoFromState(&entry)
cache.claimInfo[info.Namespace+"/"+info.ClaimName] = info
}
return cache, nil
}
// withLock runs a function while holding the claimInfoCache lock.
func (cache *claimInfoCache) withLock(f func() error) error {
cache.Lock()
defer cache.Unlock()
return f()
}
// withRLock runs a function while holding the claimInfoCache rlock.
func (cache *claimInfoCache) withRLock(f func() error) error {
cache.RLock()
defer cache.RUnlock()
return f()
}
// add adds a new claim info object into the claim info cache.
func (cache *claimInfoCache) add(info *ClaimInfo) *ClaimInfo {
cache.claimInfo[info.Namespace+"/"+info.ClaimName] = info
return info
}
// contains checks to see if a specific claim info object is already in the cache.
func (cache *claimInfoCache) contains(claimName, namespace string) bool {
_, exists := cache.claimInfo[namespace+"/"+claimName]
return exists
}
// get gets a specific claim info object from the cache.
func (cache *claimInfoCache) get(claimName, namespace string) (*ClaimInfo, bool) {
info, exists := cache.claimInfo[namespace+"/"+claimName]
return info, exists
}
// delete deletes a specific claim info object from the cache.
func (cache *claimInfoCache) delete(claimName, namespace string) {
delete(cache.claimInfo, namespace+"/"+claimName)
}
// hasPodReference checks if there is at least one claim
// that is referenced by the pod with the given UID
// This function is used indirectly by the status manager
// to check if pod can enter termination status
func (cache *claimInfoCache) hasPodReference(uid types.UID) bool {
for _, claimInfo := range cache.claimInfo {
if claimInfo.hasPodReference(uid) {
return true
}
}
return false
}
// syncToCheckpoint syncs the full claim info cache state to a checkpoint.
func (cache *claimInfoCache) syncToCheckpoint() error {
claimInfoStateList := make(state.ClaimInfoStateList, 0, len(cache.claimInfo))
for _, infoClaim := range cache.claimInfo {
claimInfoStateList = append(claimInfoStateList, infoClaim.ClaimInfoState)
}
checkpoint, err := state.NewCheckpoint(claimInfoStateList)
if err != nil {
return err
}
return cache.checkpointer.Store(checkpoint)
}
// cdiDevicesAsList returns a list of CDIDevices from the provided claim info.
// When the request name is non-empty, only devices relevant for that request
// are returned.
func (info *ClaimInfo) cdiDevicesAsList(requestName string) []kubecontainer.CDIDevice {
var cdiDevices []kubecontainer.CDIDevice
for _, driverData := range info.DriverState {
for _, device := range driverData.Devices {
if requestName == "" || len(device.RequestNames) == 0 || slices.Contains(device.RequestNames, requestName) {
for _, cdiDeviceID := range device.CDIDeviceIDs {
cdiDevices = append(cdiDevices, kubecontainer.CDIDevice{Name: cdiDeviceID})
}
}
}
}
return cdiDevices
}
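
Editorial note: the claim info types above are unexported, so the following is an in-package sketch with illustrative values. It builds a ClaimInfo the way newClaimInfoFromState would, records a prepared device as the DRA manager does after NodePrepareResources, and reads the CDI device IDs back per request.

package dra

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
)

func exampleClaimInfo() {
	info := newClaimInfoFromState(&state.ClaimInfoState{
		ClaimUID:    "claim-uid",
		ClaimName:   "gpu-claim",
		Namespace:   "default",
		PodUIDs:     sets.New[string](),
		DriverState: map[string]state.DriverState{"gpu.example.com": {}},
	})

	// Record what the driver reported from NodePrepareResources.
	info.addDevice("gpu.example.com", state.Device{
		PoolName:     "pool-0",
		DeviceName:   "gpu-0",
		RequestNames: []string{"req-0"},
		CDIDeviceIDs: []string{"example.com/gpu=gpu-0"},
	})
	info.addPodReference("pod-uid")
	info.setPrepared()

	// Devices for a specific request (an empty request name returns everything).
	fmt.Println(info.cdiDevicesAsList("req-0"))
	fmt.Println(info.hasPodReference("pod-uid"), info.isPrepared())
}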

553
vendor/k8s.io/kubernetes/pkg/kubelet/cm/dra/manager.go generated vendored Normal file
View File

@ -0,0 +1,553 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"context"
"fmt"
"strconv"
"time"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/klog/v2"
drapb "k8s.io/kubelet/pkg/apis/dra/v1beta1"
dra "k8s.io/kubernetes/pkg/kubelet/cm/dra/plugin"
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/metrics"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// draManagerStateFileName is the file name where dra manager stores its state
const draManagerStateFileName = "dra_manager_state"
// defaultReconcilePeriod is the default reconciliation period to keep all claim info state in sync.
const defaultReconcilePeriod = 60 * time.Second
// ActivePodsFunc is a function that returns a list of pods to reconcile.
type ActivePodsFunc func() []*v1.Pod
// GetNodeFunc is a function that returns the node object using the kubelet's node lister.
type GetNodeFunc func() (*v1.Node, error)
// ManagerImpl is the structure in charge of managing DRA drivers.
type ManagerImpl struct {
// cache contains cached claim info
cache *claimInfoCache
// reconcilePeriod is the duration between calls to reconcileLoop.
reconcilePeriod time.Duration
// activePods is a method for listing active pods on the node
// so all claim info state can be updated in the reconciliation loop.
activePods ActivePodsFunc
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can treat pods as inactive and react appropriately.
sourcesReady config.SourcesReady
// KubeClient reference
kubeClient clientset.Interface
// getNode is a function that returns the node object using the kubelet's node lister.
getNode GetNodeFunc
}
// NewManagerImpl creates a new manager.
func NewManagerImpl(kubeClient clientset.Interface, stateFileDirectory string, nodeName types.NodeName) (*ManagerImpl, error) {
claimInfoCache, err := newClaimInfoCache(stateFileDirectory, draManagerStateFileName)
if err != nil {
return nil, fmt.Errorf("failed to create claimInfo cache: %w", err)
}
// TODO: for now the reconcile period is not configurable.
// We should consider making it configurable in the future.
reconcilePeriod := defaultReconcilePeriod
manager := &ManagerImpl{
cache: claimInfoCache,
kubeClient: kubeClient,
reconcilePeriod: reconcilePeriod,
activePods: nil,
sourcesReady: nil,
}
return manager, nil
}
func (m *ManagerImpl) GetWatcherHandler() cache.PluginHandler {
return cache.PluginHandler(dra.NewRegistrationHandler(m.kubeClient, m.getNode))
}
// Start starts the reconcile loop of the manager.
func (m *ManagerImpl) Start(ctx context.Context, activePods ActivePodsFunc, getNode GetNodeFunc, sourcesReady config.SourcesReady) error {
m.activePods = activePods
m.getNode = getNode
m.sourcesReady = sourcesReady
go wait.UntilWithContext(ctx, func(ctx context.Context) { m.reconcileLoop(ctx) }, m.reconcilePeriod)
return nil
}
// reconcileLoop ensures that any stale state in the manager's claimInfoCache gets periodically reconciled.
func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
logger := klog.FromContext(ctx)
// Only once all sources are ready do we attempt to reconcile.
// This ensures that the call to m.activePods() below will succeed with
// the actual active pods list.
if m.sourcesReady == nil || !m.sourcesReady.AllReady() {
return
}
// Get the full list of active pods.
activePods := sets.New[string]()
for _, p := range m.activePods() {
activePods.Insert(string(p.UID))
}
// Get the list of inactive pods still referenced by any claimInfos.
type podClaims struct {
uid types.UID
namespace string
claimNames []string
}
inactivePodClaims := make(map[string]*podClaims)
m.cache.RLock()
for _, claimInfo := range m.cache.claimInfo {
for podUID := range claimInfo.PodUIDs {
if activePods.Has(podUID) {
continue
}
if inactivePodClaims[podUID] == nil {
inactivePodClaims[podUID] = &podClaims{
uid: types.UID(podUID),
namespace: claimInfo.Namespace,
claimNames: []string{},
}
}
inactivePodClaims[podUID].claimNames = append(inactivePodClaims[podUID].claimNames, claimInfo.ClaimName)
}
}
m.cache.RUnlock()
// Loop through all inactive pods and call UnprepareResources on them.
for _, podClaims := range inactivePodClaims {
if err := m.unprepareResources(ctx, podClaims.uid, podClaims.namespace, podClaims.claimNames); err != nil {
logger.Info("Unpreparing pod resources in reconcile loop failed, will retry", "podUID", podClaims.uid, "err", err)
}
}
}
// PrepareResources attempts to prepare all of the required resources
// for the input pod, issues NodePrepareResources rpc requests
// for each new resource requirement, processes their responses and updates the cached
// containerResources on success.
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
startTime := time.Now()
err := m.prepareResources(ctx, pod)
metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
return err
}
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
logger := klog.FromContext(ctx)
batches := make(map[string][]*drapb.Claim)
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
for i := range pod.Spec.ResourceClaims {
podClaim := &pod.Spec.ResourceClaims[i]
logger.V(3).Info("Processing resource", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
claimName, mustCheckOwner, err := resourceclaim.Name(pod, podClaim)
if err != nil {
return fmt.Errorf("prepare resource claim: %w", err)
}
if claimName == nil {
// Nothing to do.
logger.V(5).Info("No need to prepare resources, no claim generated", "pod", klog.KObj(pod), "podClaim", podClaim.Name)
continue
}
// Query claim object from the API server
resourceClaim, err := m.kubeClient.ResourceV1beta1().ResourceClaims(pod.Namespace).Get(
ctx,
*claimName,
metav1.GetOptions{})
if err != nil {
return fmt.Errorf("failed to fetch ResourceClaim %s referenced by pod %s: %w", *claimName, pod.Name, err)
}
if mustCheckOwner {
if err = resourceclaim.IsForPod(pod, resourceClaim); err != nil {
return err
}
}
// Check if pod is in the ReservedFor for the claim
if !resourceclaim.IsReservedForPod(pod, resourceClaim) {
return fmt.Errorf("pod %s(%s) is not allowed to use resource claim %s(%s)",
pod.Name, pod.UID, *claimName, resourceClaim.UID)
}
// Atomically perform some operations on the claimInfo cache.
err = m.cache.withLock(func() error {
// Get a reference to the claim info for this claim from the cache.
// If there isn't one yet, then add it to the cache.
claimInfo, exists := m.cache.get(resourceClaim.Name, resourceClaim.Namespace)
if !exists {
ci, err := newClaimInfoFromClaim(resourceClaim)
if err != nil {
return fmt.Errorf("claim %s: %w", klog.KObj(resourceClaim), err)
}
claimInfo = m.cache.add(ci)
logger.V(6).Info("Created new claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
} else {
logger.V(6).Info("Found existing claim info cache entry", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim), "claimInfoEntry", claimInfo)
}
// Add a reference to the current pod in the claim info.
claimInfo.addPodReference(pod.UID)
// Checkpoint to ensure all claims we plan to prepare are tracked.
// If something goes wrong and the newly referenced pod gets
// deleted without a successful prepare call, we will catch
// that in the reconcile loop and take the appropriate action.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
// If this claim is already prepared, there is no need to prepare it again.
if claimInfo.isPrepared() {
logger.V(5).Info("Resources already prepared", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "claim", klog.KObj(resourceClaim))
return nil
}
// This saved claim will be used to update ClaimInfo cache
// after NodePrepareResources GRPC succeeds
resourceClaims[claimInfo.ClaimUID] = resourceClaim
// Loop through all drivers and prepare for calling NodePrepareResources.
claim := &drapb.Claim{
Namespace: claimInfo.Namespace,
UID: string(claimInfo.ClaimUID),
Name: claimInfo.ClaimName,
}
for driverName := range claimInfo.DriverState {
batches[driverName] = append(batches[driverName], claim)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
// Call NodePrepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for driverName, claims := range batches {
// Call NodePrepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(driverName)
if err != nil {
return fmt.Errorf("failed to get gRPC client for driver %s: %w", driverName, err)
}
response, err := client.NodePrepareResources(ctx, &drapb.NodePrepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodePrepareResources failed: %w", err)
}
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodePrepareResources returned result for unknown claim UID %s", claimUID)
}
if result.GetError() != "" {
return fmt.Errorf("NodePrepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
}
claim := resourceClaims[types.UID(claimUID)]
// Add the prepared CDI devices to the claim info
err := m.cache.withLock(func() error {
info, exists := m.cache.get(claim.Name, claim.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
}
for _, device := range result.GetDevices() {
info.addDevice(driverName, state.Device{PoolName: device.PoolName, DeviceName: device.DeviceName, RequestNames: device.RequestNames, CDIDeviceIDs: device.CDIDeviceIDs})
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodePrepareResources left out %d claims", unfinished)
}
}
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Mark all pod claims as prepared.
for _, claim := range resourceClaims {
info, exists := m.cache.get(claim.Name, claim.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", claim.Name, claim.Namespace)
}
info.setPrepared()
}
// Checkpoint to ensure all prepared claims are tracked with their list
// of CDI devices attached.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
return nil
}
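For orientation, a minimal sketch of the pod shape this code walks (illustrative only; claim and container names are invented, and k8s.io/utils/ptr is assumed for the pointer helper). pod.Spec.ResourceClaims lists the claims, and each container opts in through Resources.Claims.
package main
import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/utils/ptr"
)
// examplePod references an existing ResourceClaim "gpu-claim" under the pod-level
// name "gpu", which the container then requests via Resources.Claims.
func examplePod() *v1.Pod {
	return &v1.Pod{
		Spec: v1.PodSpec{
			ResourceClaims: []v1.PodResourceClaim{{
				Name:              "gpu",
				ResourceClaimName: ptr.To("gpu-claim"),
			}},
			Containers: []v1.Container{{
				Name: "app",
				Resources: v1.ResourceRequirements{
					Claims: []v1.ResourceClaim{{Name: "gpu"}},
				},
			}},
		},
	}
}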
func lookupClaimRequest(claims []*drapb.Claim, claimUID string) *drapb.Claim {
for _, claim := range claims {
if claim.UID == claimUID {
return claim
}
}
return nil
}
// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error) {
cdiDevices := []kubecontainer.CDIDevice{}
for i := range pod.Spec.ResourceClaims {
podClaim := &pod.Spec.ResourceClaims[i]
claimName, _, err := resourceclaim.Name(pod, podClaim)
if err != nil {
return nil, fmt.Errorf("list resource claims: %w", err)
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
for _, claim := range container.Resources.Claims {
if podClaim.Name != claim.Name {
continue
}
err := m.cache.withRLock(func() error {
claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
}
// As of Kubernetes 1.31, CDI device IDs are not passed via annotations anymore.
cdiDevices = append(cdiDevices, claimInfo.cdiDevicesAsList(claim.Request)...)
return nil
})
if err != nil {
return nil, fmt.Errorf("locked cache operation: %w", err)
}
}
}
return &ContainerInfo{CDIDevices: cdiDevices}, nil
}
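A rough sketch of how a caller might fold the returned CDI device IDs into a CRI container config; runtimeapi is assumed to be k8s.io/cri-api/pkg/apis/runtime/v1, the field names follow the CRI v1 API as best understood here, and the helper name is invented.
// attachCDIDevices copies CDI device names from the ContainerInfo returned by
// GetResources into a CRI ContainerConfig (sketch, not the kubelet's actual wiring).
func attachCDIDevices(config *runtimeapi.ContainerConfig, info *ContainerInfo) {
	for _, d := range info.CDIDevices {
		config.CDIDevices = append(config.CDIDevices, &runtimeapi.CDIDevice{Name: d.Name})
	}
}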
// UnprepareResources calls a driver's NodeUnprepareResources API for each resource claim owned by a pod.
// This function is idempotent and may be called multiple times against the same pod.
// As such, calls to the underlying NodeUnprepareResources API are skipped for claims that have
// already been successfully unprepared.
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
var err error = nil
defer func(startTime time.Time) {
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
}(time.Now())
var claimNames []string
for i := range pod.Spec.ResourceClaims {
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
if err != nil {
return fmt.Errorf("unprepare resource claim: %w", err)
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
claimNames = append(claimNames, *claimName)
}
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
return err
}
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {
logger := klog.FromContext(ctx)
batches := make(map[string][]*drapb.Claim)
claimNamesMap := make(map[types.UID]string)
for _, claimName := range claimNames {
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Get the claim info from the cache
claimInfo, exists := m.cache.get(claimName, namespace)
// Skip calling NodeUnprepareResource if claim info is not cached
if !exists {
return nil
}
// Skip calling NodeUnprepareResource if other pods are still referencing it
if len(claimInfo.PodUIDs) > 1 {
// We delay checkpointing of this change until
// UnprepareResources returns successfully. It is OK to do
// this because we will only return successfully from this call
// if the checkpoint has succeeded. That means if the kubelet
// is ever restarted before this checkpoint succeeds, we will
// simply call into this (idempotent) function again.
claimInfo.deletePodReference(podUID)
return nil
}
// This claimInfo name will be used to update ClaimInfo cache
// after NodeUnprepareResources GRPC succeeds
claimNamesMap[claimInfo.ClaimUID] = claimInfo.ClaimName
// Loop through all drivers and prepare for calling NodeUnprepareResources.
claim := &drapb.Claim{
Namespace: claimInfo.Namespace,
UID: string(claimInfo.ClaimUID),
Name: claimInfo.ClaimName,
}
for driverName := range claimInfo.DriverState {
batches[driverName] = append(batches[driverName], claim)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
}
// Call NodeUnprepareResources for all claims in each batch.
// If there is any error, processing gets aborted.
// We could try to continue, but that would make the code more complex.
for driverName, claims := range batches {
// Call NodeUnprepareResources RPC for all resource handles.
client, err := dra.NewDRAPluginClient(driverName)
if err != nil {
return fmt.Errorf("get gRPC client for DRA driver %s: %w", driverName, err)
}
response, err := client.NodeUnprepareResources(ctx, &drapb.NodeUnprepareResourcesRequest{Claims: claims})
if err != nil {
// General error unrelated to any particular claim.
return fmt.Errorf("NodeUnprepareResources failed: %w", err)
}
for claimUID, result := range response.Claims {
reqClaim := lookupClaimRequest(claims, claimUID)
if reqClaim == nil {
return fmt.Errorf("NodeUnprepareResources returned result for unknown claim UID %s", claimUID)
}
if result.GetError() != "" {
return fmt.Errorf("NodeUnprepareResources failed for claim %s/%s: %s", reqClaim.Namespace, reqClaim.Name, result.Error)
}
}
unfinished := len(claims) - len(response.Claims)
if unfinished != 0 {
return fmt.Errorf("NodeUnprepareResources left out %d claims", unfinished)
}
}
// Atomically perform some operations on the claimInfo cache.
err := m.cache.withLock(func() error {
// Delete all claimInfos from the cache that have just been unprepared.
for _, claimName := range claimNamesMap {
claimInfo, _ := m.cache.get(claimName, namespace)
m.cache.delete(claimName, namespace)
logger.V(6).Info("Deleted claim info cache entry", "claim", klog.KRef(namespace, claimName), "claimInfoEntry", claimInfo)
}
// Atomically sync the cache back to the checkpoint.
if err := m.cache.syncToCheckpoint(); err != nil {
return fmt.Errorf("failed to checkpoint claimInfo state: %w", err)
}
return nil
})
if err != nil {
return fmt.Errorf("locked cache operation: %w", err)
}
return nil
}
// PodMightNeedToUnprepareResources returns true if the pod might need to
// unprepare resources
func (m *ManagerImpl) PodMightNeedToUnprepareResources(uid types.UID) bool {
m.cache.Lock()
defer m.cache.Unlock()
return m.cache.hasPodReference(uid)
}
// GetContainerClaimInfos gets Container's ClaimInfo
func (m *ManagerImpl) GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error) {
claimInfos := make([]*ClaimInfo, 0, len(pod.Spec.ResourceClaims))
for i, podResourceClaim := range pod.Spec.ResourceClaims {
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
if err != nil {
return nil, fmt.Errorf("determine resource claim information: %w", err)
}
for _, claim := range container.Resources.Claims {
if podResourceClaim.Name != claim.Name {
continue
}
err := m.cache.withRLock(func() error {
claimInfo, exists := m.cache.get(*claimName, pod.Namespace)
if !exists {
return fmt.Errorf("unable to get claim info for claim %s in namespace %s", *claimName, pod.Namespace)
}
claimInfos = append(claimInfos, claimInfo.DeepCopy())
return nil
})
if err != nil {
return nil, fmt.Errorf("locked cache operation: %w", err)
}
}
}
return claimInfos, nil
}

View File

@ -0,0 +1,181 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"context"
"errors"
"fmt"
"net"
"sync"
"time"
"google.golang.org/grpc"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/credentials/insecure"
"google.golang.org/grpc/status"
"k8s.io/klog/v2"
drapbv1alpha4 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
drapbv1beta1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/metrics"
)
// NewDRAPluginClient returns a wrapper around those gRPC methods of a DRA
// driver kubelet plugin which need to be called by kubelet. The wrapper
// handles gRPC connection management and logging. Connections are reused
// across different NewDRAPluginClient calls.
func NewDRAPluginClient(pluginName string) (*Plugin, error) {
if pluginName == "" {
return nil, fmt.Errorf("plugin name is empty")
}
existingPlugin := draPlugins.get(pluginName)
if existingPlugin == nil {
return nil, fmt.Errorf("plugin name %s not found in the list of registered DRA plugins", pluginName)
}
return existingPlugin, nil
}
type Plugin struct {
name string
backgroundCtx context.Context
cancel func(cause error)
mutex sync.Mutex
conn *grpc.ClientConn
endpoint string
chosenService string // e.g. drapbv1beta1.DRAPluginService
clientCallTimeout time.Duration
}
func (p *Plugin) getOrCreateGRPCConn() (*grpc.ClientConn, error) {
p.mutex.Lock()
defer p.mutex.Unlock()
if p.conn != nil {
return p.conn, nil
}
ctx := p.backgroundCtx
logger := klog.FromContext(ctx)
network := "unix"
logger.V(4).Info("Creating new gRPC connection", "protocol", network, "endpoint", p.endpoint)
// grpc.Dial is deprecated. grpc.NewClient should be used instead.
// For now this gets ignored because this function is meant to establish
// the connection, with the one second timeout below. Perhaps that
// approach should be reconsidered?
//nolint:staticcheck
conn, err := grpc.Dial(
p.endpoint,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithContextDialer(func(ctx context.Context, target string) (net.Conn, error) {
return (&net.Dialer{}).DialContext(ctx, network, target)
}),
grpc.WithChainUnaryInterceptor(newMetricsInterceptor(p.name)),
)
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
if ok := conn.WaitForStateChange(ctx, connectivity.Connecting); !ok {
return nil, errors.New("timed out waiting for gRPC connection to be ready")
}
p.conn = conn
return p.conn, nil
}
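As a hedged illustration of the alternative hinted at in the comment above (not what the vendored code does), a grpc-go version that provides grpc.NewClient could defer connection establishment to the first RPC over the same unix socket:
// Sketch only: assumes grpc.NewClient is available in the vendored grpc-go and
// that the built-in "unix" target scheme is used with an absolute socket path.
func newPluginConn(endpoint, pluginName string) (*grpc.ClientConn, error) {
	return grpc.NewClient(
		"unix://"+endpoint,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithChainUnaryInterceptor(newMetricsInterceptor(pluginName)),
	)
}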
func (p *Plugin) NodePrepareResources(
ctx context.Context,
req *drapbv1beta1.NodePrepareResourcesRequest,
opts ...grpc.CallOption,
) (*drapbv1beta1.NodePrepareResourcesResponse, error) {
logger := klog.FromContext(ctx)
logger.V(4).Info("Calling NodePrepareResources rpc", "request", req)
conn, err := p.getOrCreateGRPCConn()
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()
var response *drapbv1beta1.NodePrepareResourcesResponse
switch p.chosenService {
case drapbv1beta1.DRAPluginService:
nodeClient := drapbv1beta1.NewDRAPluginClient(conn)
response, err = nodeClient.NodePrepareResources(ctx, req)
case drapbv1alpha4.NodeService:
nodeClient := drapbv1alpha4.NewNodeClient(conn)
response, err = drapbv1alpha4.V1Alpha4ClientWrapper{NodeClient: nodeClient}.NodePrepareResources(ctx, req)
default:
// Shouldn't happen, validateSupportedServices should only
// return services we support here.
return nil, fmt.Errorf("internal error: unsupported chosen service: %q", p.chosenService)
}
logger.V(4).Info("Done calling NodePrepareResources rpc", "response", response, "err", err)
return response, err
}
func (p *Plugin) NodeUnprepareResources(
ctx context.Context,
req *drapbv1beta1.NodeUnprepareResourcesRequest,
opts ...grpc.CallOption,
) (*drapbv1beta1.NodeUnprepareResourcesResponse, error) {
logger := klog.FromContext(ctx)
logger.V(4).Info("Calling NodeUnprepareResource rpc", "request", req)
conn, err := p.getOrCreateGRPCConn()
if err != nil {
return nil, err
}
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()
var response *drapbv1beta1.NodeUnprepareResourcesResponse
switch p.chosenService {
case drapbv1beta1.DRAPluginService:
nodeClient := drapbv1beta1.NewDRAPluginClient(conn)
response, err = nodeClient.NodeUnprepareResources(ctx, req)
case drapbv1alpha4.NodeService:
nodeClient := drapbv1alpha4.NewNodeClient(conn)
response, err = drapbv1alpha4.V1Alpha4ClientWrapper{NodeClient: nodeClient}.NodeUnprepareResources(ctx, req)
default:
// Shouldn't happen, validateSupportedServices should only
// return services we support here.
return nil, fmt.Errorf("internal error: unsupported chosen service: %q", p.chosenService)
}
logger.V(4).Info("Done calling NodeUnprepareResources rpc", "response", response, "err", err)
return response, err
}
func newMetricsInterceptor(pluginName string) grpc.UnaryClientInterceptor {
return func(ctx context.Context, method string, req, reply any, conn *grpc.ClientConn, invoker grpc.UnaryInvoker, opts ...grpc.CallOption) error {
start := time.Now()
err := invoker(ctx, method, req, reply, conn, opts...)
metrics.DRAGRPCOperationsDuration.WithLabelValues(pluginName, method, status.Code(err).String()).Observe(time.Since(start).Seconds())
return err
}
}

View File

@ -0,0 +1,79 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"errors"
"sync"
)
// pluginsStore holds the registered DRA Plugins, keyed by plugin name.
type pluginsStore struct {
sync.RWMutex
store map[string]*Plugin
}
// draPlugins map keeps track of all registered DRA plugins on the node
// and their corresponding sockets.
var draPlugins = &pluginsStore{}
// Get lets you retrieve a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) get(pluginName string) *Plugin {
s.RLock()
defer s.RUnlock()
return s.store[pluginName]
}
// add lets you save a DRA Plugin to the list under its name, replacing any existing entry.
// This method is protected by a mutex.
func (s *pluginsStore) add(p *Plugin) (replacedPlugin *Plugin, replaced bool) {
s.Lock()
defer s.Unlock()
if s.store == nil {
s.store = make(map[string]*Plugin)
}
replacedPlugin, exists := s.store[p.name]
s.store[p.name] = p
if replacedPlugin != nil && replacedPlugin.cancel != nil {
replacedPlugin.cancel(errors.New("plugin got replaced"))
}
return replacedPlugin, exists
}
// Delete lets you delete a DRA Plugin by name.
// This method is protected by a mutex.
func (s *pluginsStore) delete(pluginName string) *Plugin {
s.Lock()
defer s.Unlock()
p, exists := s.store[pluginName]
if !exists {
return nil
}
if p.cancel != nil {
p.cancel(errors.New("plugin got removed"))
}
delete(s.store, pluginName)
return p
}
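A small illustrative sequence (all names invented) of the add/get/delete lifecycle that the registration handler in the next file drives:
func examplePluginStoreLifecycle() {
	p := &Plugin{name: "gpu.example.com", endpoint: "/var/lib/kubelet/plugins/gpu.example.com/dra.sock"}
	if old, replaced := draPlugins.add(p); replaced {
		_ = old // a plugin with the same name was replaced and its context canceled
	}
	if got := draPlugins.get("gpu.example.com"); got != nil {
		// got is the currently registered plugin
	}
	_ = draPlugins.delete("gpu.example.com") // cancels the plugin's background context
}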

View File

@ -0,0 +1,249 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugin
import (
"context"
"errors"
"fmt"
"slices"
"time"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
"k8s.io/klog/v2"
drapbv1alpha4 "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
drapbv1beta1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// defaultClientCallTimeout is the default amount of time that a DRA driver has
// to respond to any of the gRPC calls. kubelet uses this value by passing nil
// to RegisterPlugin. Some tests use a different, usually shorter timeout to
// speed up testing.
//
// This is half of the kubelet retry period (according to
// https://github.com/kubernetes/kubernetes/commit/0449cef8fd5217d394c5cd331d852bd50983e6b3).
const defaultClientCallTimeout = 45 * time.Second
// RegistrationHandler is the handler which is fed to the pluginwatcher API.
type RegistrationHandler struct {
// backgroundCtx is used for all future activities of the handler.
// This is necessary because it implements APIs which don't
// provide a context.
backgroundCtx context.Context
kubeClient kubernetes.Interface
getNode func() (*v1.Node, error)
}
var _ cache.PluginHandler = &RegistrationHandler{}
// NewRegistrationHandler returns a new registration handler.
//
// Must only be called once per process because it manages global state.
// If a kubeClient is provided, then it synchronizes ResourceSlices
// with the resource information provided by plugins.
func NewRegistrationHandler(kubeClient kubernetes.Interface, getNode func() (*v1.Node, error)) *RegistrationHandler {
handler := &RegistrationHandler{
// The context and thus logger should come from the caller.
backgroundCtx: klog.NewContext(context.TODO(), klog.LoggerWithName(klog.TODO(), "DRA registration handler")),
kubeClient: kubeClient,
getNode: getNode,
}
// When kubelet starts up, no DRA driver has registered yet. None of
// the drivers are usable until they come back, which might not happen
// at all. Therefore it is better to not advertise any local resources
// because pods could get stuck on the node waiting for the driver
// to start up.
//
// This has to run in the background.
go handler.wipeResourceSlices("")
return handler
}
// wipeResourceSlices deletes ResourceSlices of the node, optionally just for a specific driver.
func (h *RegistrationHandler) wipeResourceSlices(driver string) {
if h.kubeClient == nil {
return
}
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
backoff := wait.Backoff{
Duration: time.Second,
Factor: 2,
Jitter: 0.2,
Cap: 5 * time.Minute,
Steps: 100,
}
// Error logging is done inside the loop. Context cancellation doesn't get logged.
_ = wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
node, err := h.getNode()
if apierrors.IsNotFound(err) {
return false, nil
}
if err != nil {
logger.Error(err, "Unexpected error checking for node")
return false, nil
}
fieldSelector := fields.Set{resourceapi.ResourceSliceSelectorNodeName: node.Name}
if driver != "" {
fieldSelector[resourceapi.ResourceSliceSelectorDriver] = driver
}
err = h.kubeClient.ResourceV1beta1().ResourceSlices().DeleteCollection(ctx, metav1.DeleteOptions{}, metav1.ListOptions{FieldSelector: fieldSelector.String()})
switch {
case err == nil:
logger.V(3).Info("Deleted ResourceSlices", "fieldSelector", fieldSelector)
return true, nil
case apierrors.IsUnauthorized(err):
// This can happen while kubelet is still figuring out
// its credentials.
logger.V(5).Info("Deleting ResourceSlice failed, retrying", "fieldSelector", fieldSelector, "err", err)
return false, nil
default:
// Log and retry for other errors.
logger.V(3).Info("Deleting ResourceSlice failed, retrying", "fieldSelector", fieldSelector, "err", err)
return false, nil
}
})
}
// RegisterPlugin is called when a plugin can be registered.
//
// DRA uses the version array in the registration API to enumerate all gRPC
// services that the plugin provides, using the "<gRPC package name>.<service
// name>" format (e.g. "v1beta1.DRAPlugin"). This allows kubelet to determine
// in advance which version to use and which optional services the plugin
// supports.
func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string, supportedServices []string, pluginClientTimeout *time.Duration) error {
// Prepare a context with its own logger for the plugin.
//
// The lifecycle of the plugin's background activities is tied to our
// root context, so canceling that will also cancel the plugin.
//
// The logger injects the plugin name as additional value
// into all log output related to the plugin.
ctx := h.backgroundCtx
logger := klog.FromContext(ctx)
logger = klog.LoggerWithValues(logger, "pluginName", pluginName)
ctx = klog.NewContext(ctx, logger)
logger.V(3).Info("Register new DRA plugin", "endpoint", endpoint)
chosenService, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
return fmt.Errorf("version check of plugin %s failed: %w", pluginName, err)
}
var timeout time.Duration
if pluginClientTimeout == nil {
timeout = defaultClientCallTimeout
} else {
timeout = *pluginClientTimeout
}
ctx, cancel := context.WithCancelCause(ctx)
pluginInstance := &Plugin{
name: pluginName,
backgroundCtx: ctx,
cancel: cancel,
conn: nil,
endpoint: endpoint,
chosenService: chosenService,
clientCallTimeout: timeout,
}
// Store the endpoint of the newly registered DRA plugin in the map, keyed by plugin name,
// so that all other DRA components can look up the plugin's socket by name.
if oldPlugin, replaced := draPlugins.add(pluginInstance); replaced {
logger.V(1).Info("DRA plugin already registered, the old plugin was replaced and will be forgotten by the kubelet till the next kubelet restart", "oldEndpoint", oldPlugin.endpoint)
}
return nil
}
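A minimal, hypothetical registration call; the driver name and socket path are invented, and the service names reuse the constants referenced below rather than hard-coded strings:
func exampleRegister(handler *RegistrationHandler) error {
	// A driver advertising both the v1beta1 and v1alpha4 services.
	return handler.RegisterPlugin(
		"gpu.example.com",
		"/var/lib/kubelet/plugins_registry/gpu.example.com-dra.sock",
		[]string{drapbv1beta1.DRAPluginService, drapbv1alpha4.NodeService},
		nil, // nil selects defaultClientCallTimeout
	)
}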
// validateSupportedServices identifies the highest supported gRPC service for
// NodePrepareResources and NodeUnprepareResources and returns its name
// (e.g. [drapbv1beta1.DRAPluginService]). An error is returned if the plugin
// is unusable.
func (h *RegistrationHandler) validateSupportedServices(pluginName string, supportedServices []string) (string, error) {
if len(supportedServices) == 0 {
return "", errors.New("empty list of supported gRPC services (aka supported versions)")
}
// Pick most recent version if available.
chosenService := ""
for _, service := range []string{
// Sorted by most recent first, oldest last.
drapbv1beta1.DRAPluginService,
drapbv1alpha4.NodeService,
} {
if slices.Contains(supportedServices, service) {
chosenService = service
break
}
}
// Fall back to alpha if necessary because
// plugins at that time didn't advertise gRPC services.
if chosenService == "" {
chosenService = drapbv1alpha4.NodeService
}
return chosenService, nil
}
// DeRegisterPlugin is called when a plugin has removed its socket,
// signaling it is no longer available.
func (h *RegistrationHandler) DeRegisterPlugin(pluginName string) {
if p := draPlugins.delete(pluginName); p != nil {
logger := klog.FromContext(p.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin", "endpoint", p.endpoint)
// Clean up the ResourceSlices for the deleted Plugin since it
// may have died without doing so itself and might never come
// back.
go h.wipeResourceSlices(pluginName)
return
}
logger := klog.FromContext(h.backgroundCtx)
logger.V(3).Info("Deregister DRA plugin not necessary, was already removed")
}
// ValidatePlugin is called by kubelet's plugin watcher upon detection
// of a new registration socket opened by DRA plugin.
func (h *RegistrationHandler) ValidatePlugin(pluginName string, endpoint string, supportedServices []string) error {
_, err := h.validateSupportedServices(pluginName, supportedServices)
if err != nil {
return fmt.Errorf("invalid versions of plugin %s: %w", pluginName, err)
}
return err
}

View File

@ -0,0 +1,107 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"hash/crc32"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
const (
CheckpointAPIGroup = "checkpoint.dra.kubelet.k8s.io"
CheckpointKind = "DRACheckpoint"
CheckpointAPIVersion = CheckpointAPIGroup + "/v1"
)
// Checkpoint represents a structure to store DRA checkpoint data
type Checkpoint struct {
// Data is the JSON-serialized checkpoint data
Data string
// Checksum is a checksum of Data
Checksum uint32
}
type CheckpointData struct {
metav1.TypeMeta
ClaimInfoStateList ClaimInfoStateList
}
// NewCheckpoint creates a new checkpoint from a list of claim info states
func NewCheckpoint(data ClaimInfoStateList) (*Checkpoint, error) {
cpData := &CheckpointData{
TypeMeta: metav1.TypeMeta{
Kind: CheckpointKind,
APIVersion: CheckpointAPIVersion,
},
ClaimInfoStateList: data,
}
cpDataBytes, err := json.Marshal(cpData)
if err != nil {
return nil, err
}
cp := &Checkpoint{
Data: string(cpDataBytes),
Checksum: crc32.ChecksumIEEE(cpDataBytes),
}
return cp, nil
}
// MarshalCheckpoint marshals checkpoint to JSON
func (cp *Checkpoint) MarshalCheckpoint() ([]byte, error) {
return json.Marshal(cp)
}
// UnmarshalCheckpoint unmarshals checkpoint from JSON
// and verifies its data checksum
func (cp *Checkpoint) UnmarshalCheckpoint(blob []byte) error {
if err := json.Unmarshal(blob, cp); err != nil {
return err
}
// verify checksum
if err := cp.VerifyChecksum(); err != nil {
return err
}
return nil
}
// VerifyChecksum verifies that current checksum
// of checkpointed Data is valid
func (cp *Checkpoint) VerifyChecksum() error {
expectedCS := crc32.ChecksumIEEE([]byte(cp.Data))
if expectedCS != cp.Checksum {
return &errors.CorruptCheckpointError{ActualCS: uint64(cp.Checksum), ExpectedCS: uint64(expectedCS)}
}
return nil
}
// GetClaimInfoStateList returns list of claim info states from checkpoint
func (cp *Checkpoint) GetClaimInfoStateList() (ClaimInfoStateList, error) {
var data CheckpointData
if err := json.Unmarshal([]byte(cp.Data), &data); err != nil {
return nil, err
}
return data.ClaimInfoStateList, nil
}
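A short round-trip sketch of the checkpoint format defined above; the claim data is invented.
func checkpointRoundTrip() error {
	cp, err := NewCheckpoint(ClaimInfoStateList{{ClaimName: "gpu-claim", Namespace: "default"}})
	if err != nil {
		return err
	}
	blob, err := cp.MarshalCheckpoint()
	if err != nil {
		return err
	}
	restored := &Checkpoint{}
	// UnmarshalCheckpoint also verifies the CRC32 checksum of Data.
	if err := restored.UnmarshalCheckpoint(blob); err != nil {
		return err
	}
	_, err = restored.GetClaimInfoStateList()
	return err
}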

View File

@ -0,0 +1,98 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"errors"
"fmt"
"sync"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
checkpointerrors "k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
type Checkpointer interface {
GetOrCreate() (*Checkpoint, error)
Store(*Checkpoint) error
}
type checkpointer struct {
sync.RWMutex
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
}
// NewCheckpointer creates a new checkpointer that keeps track of claim info using the checkpoint backend
func NewCheckpointer(stateDir, checkpointName string) (Checkpointer, error) {
if len(checkpointName) == 0 {
return nil, fmt.Errorf("received empty string instead of checkpointName")
}
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %w", err)
}
checkpointer := &checkpointer{
checkpointManager: checkpointManager,
checkpointName: checkpointName,
}
return checkpointer, nil
}
// GetOrCreate gets the list of claim info states from a checkpoint,
// or creates an empty list if the checkpoint doesn't exist
func (sc *checkpointer) GetOrCreate() (*Checkpoint, error) {
sc.Lock()
defer sc.Unlock()
checkpoint, err := NewCheckpoint(nil)
if err != nil {
return nil, fmt.Errorf("failed to create new checkpoint: %w", err)
}
err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint)
if errors.Is(err, checkpointerrors.ErrCheckpointNotFound) {
err = sc.store(checkpoint)
if err != nil {
return nil, fmt.Errorf("failed to store checkpoint %v: %w", sc.checkpointName, err)
}
return checkpoint, nil
}
if err != nil {
return nil, fmt.Errorf("failed to get checkpoint %v: %w", sc.checkpointName, err)
}
return checkpoint, nil
}
// Store stores checkpoint to the file
func (sc *checkpointer) Store(checkpoint *Checkpoint) error {
sc.Lock()
defer sc.Unlock()
return sc.store(checkpoint)
}
// store saves state to a checkpoint; the caller is responsible for locking
func (sc *checkpointer) store(checkpoint *Checkpoint) error {
if err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint); err != nil {
return fmt.Errorf("could not save checkpoint %s: %w", sc.checkpointName, err)
}
return nil
}
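An illustrative use of the Checkpointer; the state directory and checkpoint file name below are assumptions for the sketch, not values taken from the kubelet.
func exampleCheckpointer() error {
	cp, err := NewCheckpointer("/var/lib/kubelet/dra-state", "dra_manager_state")
	if err != nil {
		return err
	}
	checkpoint, err := cp.GetOrCreate() // returns an empty checkpoint on first use
	if err != nil {
		return err
	}
	return cp.Store(checkpoint)
}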

View File

@ -0,0 +1,59 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
)
type ClaimInfoStateList []ClaimInfoState
// +k8s:deepcopy-gen=true
type ClaimInfoState struct {
// ClaimUID is the UID of a resource claim
ClaimUID types.UID
// ClaimName is the name of a resource claim
ClaimName string
// Namespace is a claim namespace
Namespace string
// PodUIDs is a set of pod UIDs that reference a resource
PodUIDs sets.Set[string]
// DriverState contains information about all drivers which have allocation
// results in the claim, even if they don't provide devices for their results.
DriverState map[string]DriverState
}
// DriverState is used to store per-device claim info state in a checkpoint
// +k8s:deepcopy-gen=true
type DriverState struct {
Devices []Device
}
// Device is how a DRA driver described an allocated device in a claim
// to kubelet. RequestNames and CDI device IDs are optional.
// +k8s:deepcopy-gen=true
type Device struct {
PoolName string
DeviceName string
RequestNames []string
CDIDeviceIDs []string
}
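A populated example of the state shape above, purely illustrative (all identifiers invented):
func exampleClaimInfoState() ClaimInfoState {
	return ClaimInfoState{
		ClaimUID:  types.UID("11111111-2222-3333-4444-555555555555"),
		ClaimName: "gpu-claim",
		Namespace: "default",
		PodUIDs:   sets.New("pod-uid-1"),
		DriverState: map[string]DriverState{
			"gpu.example.com": {
				Devices: []Device{{
					PoolName:     "pool-0",
					DeviceName:   "gpu-0",
					RequestNames: []string{"gpu"},
					CDIDeviceIDs: []string{"example.com/gpu=gpu-0"},
				}},
			},
		},
	}
}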

View File

@ -0,0 +1,105 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package state
import (
sets "k8s.io/apimachinery/pkg/util/sets"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ClaimInfoState) DeepCopyInto(out *ClaimInfoState) {
*out = *in
if in.PodUIDs != nil {
in, out := &in.PodUIDs, &out.PodUIDs
*out = make(sets.Set[string], len(*in))
for key, val := range *in {
(*out)[key] = val
}
}
if in.DriverState != nil {
in, out := &in.DriverState, &out.DriverState
*out = make(map[string]DriverState, len(*in))
for key, val := range *in {
(*out)[key] = *val.DeepCopy()
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClaimInfoState.
func (in *ClaimInfoState) DeepCopy() *ClaimInfoState {
if in == nil {
return nil
}
out := new(ClaimInfoState)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Device) DeepCopyInto(out *Device) {
*out = *in
if in.RequestNames != nil {
in, out := &in.RequestNames, &out.RequestNames
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.CDIDeviceIDs != nil {
in, out := &in.CDIDeviceIDs, &out.CDIDeviceIDs
*out = make([]string, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Device.
func (in *Device) DeepCopy() *Device {
if in == nil {
return nil
}
out := new(Device)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DriverState) DeepCopyInto(out *DriverState) {
*out = *in
if in.Devices != nil {
in, out := &in.Devices, &out.Devices
*out = make([]Device, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DriverState.
func (in *DriverState) DeepCopy() *DriverState {
if in == nil {
return nil
}
out := new(DriverState)
in.DeepCopyInto(out)
return out
}

61
vendor/k8s.io/kubernetes/pkg/kubelet/cm/dra/types.go generated vendored Normal file
View File

@ -0,0 +1,61 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dra
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)
// Manager manages all the DRA resource plugins running on a node.
type Manager interface {
// GetWatcherHandler returns the plugin handler for the DRA.
GetWatcherHandler() cache.PluginHandler
// Start starts the reconcile loop of the manager.
// This will ensure that all claims are unprepared even if pods get deleted unexpectedly.
Start(ctx context.Context, activePods ActivePodsFunc, getNode GetNodeFunc, sourcesReady config.SourcesReady) error
// PrepareResources prepares resources for a pod.
// It communicates with the DRA resource plugin to prepare resources.
PrepareResources(ctx context.Context, pod *v1.Pod) error
// UnprepareResources calls NodeUnprepareResource GRPC from DRA plugin to unprepare pod resources
UnprepareResources(ctx context.Context, pod *v1.Pod) error
// GetResources gets a ContainerInfo object from the claimInfo cache.
// This information is used by the caller to update a container config.
GetResources(pod *v1.Pod, container *v1.Container) (*ContainerInfo, error)
// PodMightNeedToUnprepareResources returns true if the pod with the given UID
// might need to unprepare resources.
PodMightNeedToUnprepareResources(UID types.UID) bool
// GetContainerClaimInfos gets Container ClaimInfo objects
GetContainerClaimInfos(pod *v1.Pod, container *v1.Container) ([]*ClaimInfo, error)
}
// ContainerInfo contains information required by the runtime to consume prepared resources.
type ContainerInfo struct {
// CDI Devices for the container
CDIDevices []kubecontainer.CDIDevice
}
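A rough sketch of the call order a kubelet-like caller follows against this interface (error handling abbreviated; the function name is invented):
// Prepare before container creation, query CDI devices while building the
// container config, and unprepare once the pod is gone.
func exampleLifecycle(ctx context.Context, manager Manager, pod *v1.Pod) {
	if err := manager.PrepareResources(ctx, pod); err != nil {
		return
	}
	info, err := manager.GetResources(pod, &pod.Spec.Containers[0])
	if err == nil {
		_ = info.CDIDevices // handed to the runtime when creating the container
	}
	_ = manager.UnprepareResources(ctx, pod)
}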

View File

@ -0,0 +1,39 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package dra
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ClaimInfo) DeepCopyInto(out *ClaimInfo) {
*out = *in
in.ClaimInfoState.DeepCopyInto(&out.ClaimInfoState)
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClaimInfo.
func (in *ClaimInfo) DeepCopy() *ClaimInfo {
if in == nil {
return nil
}
out := new(ClaimInfo)
in.DeepCopyInto(out)
return out
}

View File

@ -0,0 +1,270 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apiserver/pkg/server/healthz"
internalapi "k8s.io/cri-api/pkg/apis"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/resourceupdates"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
"k8s.io/kubernetes/pkg/kubelet/status"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)
type FakeContainerManager struct {
sync.Mutex
CalledFunctions []string
PodContainerManager *FakePodContainerManager
shouldResetExtendedResourceCapacity bool
}
var _ ContainerManager = &FakeContainerManager{}
func NewFakeContainerManager() *FakeContainerManager {
return &FakeContainerManager{
PodContainerManager: NewFakePodContainerManager(),
}
}
func (cm *FakeContainerManager) Start(_ context.Context, _ *v1.Node, _ ActivePodsFunc, _ GetNodeFunc, _ config.SourcesReady, _ status.PodStatusProvider, _ internalapi.RuntimeService, _ bool) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "Start")
return nil
}
func (cm *FakeContainerManager) SystemCgroupsLimit() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "SystemCgroupsLimit")
return v1.ResourceList{}
}
func (cm *FakeContainerManager) GetNodeConfig() NodeConfig {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetNodeConfig")
return NodeConfig{}
}
func (cm *FakeContainerManager) GetMountedSubsystems() *CgroupSubsystems {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetMountedSubsystems")
return &CgroupSubsystems{}
}
func (cm *FakeContainerManager) GetQOSContainersInfo() QOSContainersInfo {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "QOSContainersInfo")
return QOSContainersInfo{}
}
func (cm *FakeContainerManager) UpdateQOSCgroups() error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdateQOSCgroups")
return nil
}
func (cm *FakeContainerManager) Status() Status {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "Status")
return Status{}
}
func (cm *FakeContainerManager) GetNodeAllocatableReservation() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetNodeAllocatableReservation")
return nil
}
func (cm *FakeContainerManager) GetCapacity(localStorageCapacityIsolation bool) v1.ResourceList {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetCapacity")
if !localStorageCapacityIsolation {
return v1.ResourceList{}
}
c := v1.ResourceList{
v1.ResourceEphemeralStorage: *resource.NewQuantity(
int64(0),
resource.BinarySI),
}
return c
}
func (cm *FakeContainerManager) GetPluginRegistrationHandlers() map[string]cache.PluginHandler {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPluginRegistrationHandlers")
return nil
}
func (cm *FakeContainerManager) GetHealthCheckers() []healthz.HealthChecker {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPluginRegistrationServerChecker")
return []healthz.HealthChecker{}
}
func (cm *FakeContainerManager) GetDevicePluginResourceCapacity() (v1.ResourceList, v1.ResourceList, []string) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetDevicePluginResourceCapacity")
return nil, nil, []string{}
}
func (cm *FakeContainerManager) NewPodContainerManager() PodContainerManager {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "PodContainerManager")
return cm.PodContainerManager
}
func (cm *FakeContainerManager) GetResources(ctx context.Context, pod *v1.Pod, container *v1.Container) (*kubecontainer.RunContainerOptions, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetResources")
return &kubecontainer.RunContainerOptions{}, nil
}
func (cm *FakeContainerManager) UpdatePluginResources(*schedulerframework.NodeInfo, *lifecycle.PodAdmitAttributes) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdatePluginResources")
return nil
}
func (cm *FakeContainerManager) InternalContainerLifecycle() InternalContainerLifecycle {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "InternalContainerLifecycle")
return &internalContainerLifecycleImpl{cpumanager.NewFakeManager(), memorymanager.NewFakeManager(), topologymanager.NewFakeManager()}
}
func (cm *FakeContainerManager) GetPodCgroupRoot() string {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupRoot")
return ""
}
func (cm *FakeContainerManager) GetDevices(_, _ string) []*podresourcesapi.ContainerDevices {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetDevices")
return nil
}
func (cm *FakeContainerManager) GetAllocatableDevices() []*podresourcesapi.ContainerDevices {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocatableDevices")
return nil
}
func (cm *FakeContainerManager) ShouldResetExtendedResourceCapacity() bool {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "ShouldResetExtendedResourceCapacity")
return cm.shouldResetExtendedResourceCapacity
}
func (cm *FakeContainerManager) GetAllocateResourcesPodAdmitHandler() lifecycle.PodAdmitHandler {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetAllocateResourcesPodAdmitHandler")
return topologymanager.NewFakeManager()
}
func (cm *FakeContainerManager) UpdateAllocatedDevices() {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "UpdateAllocatedDevices")
return
}
func (cm *FakeContainerManager) GetCPUs(_, _ string) []int64 {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetCPUs")
return nil
}
func (cm *FakeContainerManager) GetAllocatableCPUs() []int64 {
cm.Lock()
defer cm.Unlock()
return nil
}
func (cm *FakeContainerManager) GetMemory(_, _ string) []*podresourcesapi.ContainerMemory {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetMemory")
return nil
}
func (cm *FakeContainerManager) GetAllocatableMemory() []*podresourcesapi.ContainerMemory {
cm.Lock()
defer cm.Unlock()
return nil
}
func (cm *FakeContainerManager) GetDynamicResources(pod *v1.Pod, container *v1.Container) []*podresourcesapi.DynamicResource {
return nil
}
func (cm *FakeContainerManager) GetNodeAllocatableAbsolute() v1.ResourceList {
cm.Lock()
defer cm.Unlock()
return nil
}
func (cm *FakeContainerManager) PrepareDynamicResources(ctx context.Context, pod *v1.Pod) error {
return nil
}
func (cm *FakeContainerManager) UnprepareDynamicResources(context.Context, *v1.Pod) error {
return nil
}
func (cm *FakeContainerManager) PodMightNeedToUnprepareResources(UID types.UID) bool {
return false
}
func (cm *FakeContainerManager) UpdateAllocatedResourcesStatus(pod *v1.Pod, status *v1.PodStatus) {
}
func (cm *FakeContainerManager) Updates() <-chan resourceupdates.Update {
return nil
}
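A tiny illustration of how this fake is typically exercised in tests (assertion style left to the reader):
func exampleFakeContainerManager() {
	cm := NewFakeContainerManager()
	_ = cm.UpdateQOSCgroups()
	// CalledFunctions now records "UpdateQOSCgroups"; tests assert on this slice.
	_ = cm.CalledFunctions
}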

View File

@ -0,0 +1,40 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func NewFakeInternalContainerLifecycle() *fakeInternalContainerLifecycle {
return &fakeInternalContainerLifecycle{}
}
type fakeInternalContainerLifecycle struct{}
func (f *fakeInternalContainerLifecycle) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
return nil
}
func (f *fakeInternalContainerLifecycle) PostStopContainer(containerID string) error {
return nil
}

View File

@ -0,0 +1,127 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"reflect"
"sync"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
)
type FakePodContainerManager struct {
sync.Mutex
CalledFunctions []string
Cgroups map[types.UID]CgroupName
}
var _ PodContainerManager = &FakePodContainerManager{}
func NewFakePodContainerManager() *FakePodContainerManager {
return &FakePodContainerManager{
Cgroups: make(map[types.UID]CgroupName),
}
}
func (m *FakePodContainerManager) AddPodFromCgroups(pod *kubecontainer.Pod) {
m.Lock()
defer m.Unlock()
m.Cgroups[pod.ID] = []string{pod.Name}
}
func (m *FakePodContainerManager) Exists(_ *v1.Pod) bool {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "Exists")
return true
}
func (m *FakePodContainerManager) EnsureExists(_ *v1.Pod) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "EnsureExists")
return nil
}
func (m *FakePodContainerManager) GetPodContainerName(_ *v1.Pod) (CgroupName, string) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "GetPodContainerName")
return nil, ""
}
func (m *FakePodContainerManager) Destroy(name CgroupName) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "Destroy")
for key, cgname := range m.Cgroups {
if reflect.DeepEqual(cgname, name) {
delete(m.Cgroups, key)
return nil
}
}
return nil
}
func (m *FakePodContainerManager) ReduceCPULimits(_ CgroupName) error {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "ReduceCPULimits")
return nil
}
func (m *FakePodContainerManager) GetAllPodsFromCgroups() (map[types.UID]CgroupName, error) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "GetAllPodsFromCgroups")
// return a copy for the race detector
grp := make(map[types.UID]CgroupName)
for key, value := range m.Cgroups {
grp[key] = value
}
return grp, nil
}
func (m *FakePodContainerManager) IsPodCgroup(cgroupfs string) (bool, types.UID) {
m.Lock()
defer m.Unlock()
m.CalledFunctions = append(m.CalledFunctions, "IsPodCgroup")
return false, types.UID("")
}
func (cm *FakePodContainerManager) GetPodCgroupMemoryUsage(_ *v1.Pod) (uint64, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupMemoryUsage")
return 0, nil
}
func (cm *FakePodContainerManager) GetPodCgroupConfig(_ *v1.Pod, _ v1.ResourceName) (*ResourceConfig, error) {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "GetPodCgroupConfig")
return nil, nil
}
func (cm *FakePodContainerManager) SetPodCgroupConfig(pod *v1.Pod, resourceConfig *ResourceConfig) error {
cm.Lock()
defer cm.Unlock()
cm.CalledFunctions = append(cm.CalledFunctions, "SetPodCgroupConfig")
return nil
}

89
vendor/k8s.io/kubernetes/pkg/kubelet/cm/helpers.go generated vendored Normal file
View File

@ -0,0 +1,89 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"context"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
internalapi "k8s.io/cri-api/pkg/apis"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
)
// for typecheck across platforms
var _ func(int64, int64) int64 = MilliCPUToQuota
var _ func(int64) uint64 = MilliCPUToShares
var _ func(*v1.Pod, bool, uint64, bool) *ResourceConfig = ResourceConfigForPod
var _ func() (*CgroupSubsystems, error) = GetCgroupSubsystems
var _ func(string) ([]int, error) = getCgroupProcs
var _ func(types.UID) string = GetPodCgroupNameSuffix
var _ func(string, bool, string) string = NodeAllocatableRoot
var _ func(string) (string, error) = GetKubeletContainer
// hardEvictionReservation returns a resourcelist that includes reservation of resources based on hard eviction thresholds.
func hardEvictionReservation(thresholds []evictionapi.Threshold, capacity v1.ResourceList) v1.ResourceList {
if len(thresholds) == 0 {
return nil
}
ret := v1.ResourceList{}
for _, threshold := range thresholds {
if threshold.Operator != evictionapi.OpLessThan {
continue
}
switch threshold.Signal {
case evictionapi.SignalMemoryAvailable:
memoryCapacity := capacity[v1.ResourceMemory]
value := evictionapi.GetThresholdQuantity(threshold.Value, &memoryCapacity)
ret[v1.ResourceMemory] = *value
case evictionapi.SignalNodeFsAvailable:
storageCapacity := capacity[v1.ResourceEphemeralStorage]
value := evictionapi.GetThresholdQuantity(threshold.Value, &storageCapacity)
ret[v1.ResourceEphemeralStorage] = *value
}
}
return ret
}
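A hedged example of the reservation computed above: a hard eviction threshold of memory.available<100Mi yields a 100Mi memory reservation. The snippet assumes k8s.io/apimachinery/pkg/api/resource imported as apiresource; all values are invented.
func exampleHardEvictionReservation() v1.ResourceList {
	memThreshold := apiresource.MustParse("100Mi")
	thresholds := []evictionapi.Threshold{{
		Signal:   evictionapi.SignalMemoryAvailable,
		Operator: evictionapi.OpLessThan,
		Value:    evictionapi.ThresholdValue{Quantity: &memThreshold},
	}}
	capacity := v1.ResourceList{v1.ResourceMemory: apiresource.MustParse("8Gi")}
	// Returns a list containing v1.ResourceMemory: 100Mi.
	return hardEvictionReservation(thresholds, capacity)
}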
func buildContainerMapAndRunningSetFromRuntime(ctx context.Context, runtimeService internalapi.RuntimeService) (containermap.ContainerMap, sets.Set[string]) {
podSandboxMap := make(map[string]string)
podSandboxList, _ := runtimeService.ListPodSandbox(ctx, nil)
for _, p := range podSandboxList {
podSandboxMap[p.Id] = p.Metadata.Uid
}
runningSet := sets.New[string]()
containerMap := containermap.NewContainerMap()
containerList, _ := runtimeService.ListContainers(ctx, nil)
for _, c := range containerList {
if _, exists := podSandboxMap[c.PodSandboxId]; !exists {
klog.InfoS("No PodSandBox found for the container", "podSandboxId", c.PodSandboxId, "containerName", c.Metadata.Name, "containerId", c.Id)
continue
}
podUID := podSandboxMap[c.PodSandboxId]
containerMap.Add(podUID, c.Metadata.Name, c.Id)
if c.State == runtimeapi.ContainerState_CONTAINER_RUNNING {
klog.V(4).InfoS("Container reported running", "podSandboxId", c.PodSandboxId, "podUID", podUID, "containerName", c.Metadata.Name, "containerId", c.Id)
runningSet.Insert(c.Id)
}
}
return containerMap, runningSet
}

View File

@ -0,0 +1,343 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
libcontainercgroups "github.com/opencontainers/runc/libcontainer/cgroups"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/cm/util"
)
const (
// These limits are defined in the kernel:
// https://github.com/torvalds/linux/blob/0bddd227f3dc55975e2b8dfa7fc6f959b062a2c7/kernel/sched/sched.h#L427-L428
MinShares = 2
MaxShares = 262144
SharesPerCPU = 1024
MilliCPUToCPU = 1000
// 100000 microseconds is equivalent to 100ms
QuotaPeriod = 100000
// 1000 microseconds is equivalent to 1ms
// defined here:
// https://github.com/torvalds/linux/blob/cac03ac368fabff0122853de2422d4e17a32de08/kernel/sched/core.c#L10546
MinQuotaPeriod = 1000
// From the inverse of the conversion in MilliCPUToQuota:
// MinQuotaPeriod * MilliCPUToCPU / QuotaPeriod
MinMilliCPULimit = 10
)
// MilliCPUToQuota converts milliCPU to CFS quota and period values.
// The period parameter and the resulting quota value are expressed in microseconds.
func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) {
// CFS quota is measured in two values:
// - cfs_period_us=100ms (the amount of time to measure usage across given by period)
// - cfs_quota=20ms (the amount of cpu time allowed to be used across a period)
// so in the above example, you are limited to 20% of a single CPU
// for multi-cpu environments, you just scale equivalent amounts
// see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details
if milliCPU == 0 {
return
}
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod) {
period = QuotaPeriod
}
// we then convert your milliCPU to a value normalized over a period
quota = (milliCPU * period) / MilliCPUToCPU
// quota needs to be a minimum of 1ms.
if quota < MinQuotaPeriod {
quota = MinQuotaPeriod
}
return
}
// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
if milliCPU == 0 {
// Docker converts zero milliCPU to unset, which maps to kernel default
// for unset: 1024. Return 2 here to really match kernel default for
// zero milliCPU.
return MinShares
}
// Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding.
shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU
if shares < MinShares {
return MinShares
}
if shares > MaxShares {
return MaxShares
}
return uint64(shares)
}
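// Worked example (illustrative): a container requesting 250m CPU gets
// shares = 250 * 1024 / 1000 = 256; a 500m CPU limit with the default 100ms
// period gets quota = 500 * 100000 / 1000 = 50000us, i.e. half a CPU per period;
// a 5m limit would compute to 500us and be raised to MinQuotaPeriod (1000us).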
// HugePageLimits converts the API representation to a map
// from huge page size (in bytes) to huge page limit (in bytes).
func HugePageLimits(resourceList v1.ResourceList) map[int64]int64 {
hugePageLimits := map[int64]int64{}
for k, v := range resourceList {
if v1helper.IsHugePageResourceName(k) {
pageSize, _ := v1helper.HugePageSizeFromResourceName(k)
if value, exists := hugePageLimits[pageSize.Value()]; exists {
hugePageLimits[pageSize.Value()] = value + v.Value()
} else {
hugePageLimits[pageSize.Value()] = v.Value()
}
}
}
return hugePageLimits
}
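// Illustrative example: a resource list containing hugepages-2Mi: 1Gi yields
// map[2097152] = 1073741824, i.e. the key is the page size in bytes and the
// value is the total limit in bytes; two containers each requesting
// hugepages-2Mi: 512Mi would sum into the same entry.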
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(allocatedPod *v1.Pod, enforceCPULimits bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
podLevelResourcesEnabled := utilfeature.DefaultFeatureGate.Enabled(kubefeatures.PodLevelResources)
// sum requests and limits.
reqs := resource.PodRequests(allocatedPod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !podLevelResourcesEnabled,
UseStatusResources: false,
})
// track if limits were applied for each resource.
memoryLimitsDeclared := true
cpuLimitsDeclared := true
limits := resource.PodLimits(allocatedPod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !podLevelResourcesEnabled,
ContainerFn: func(res v1.ResourceList, containerType resource.ContainerType) {
if res.Cpu().IsZero() {
cpuLimitsDeclared = false
}
if res.Memory().IsZero() {
memoryLimitsDeclared = false
}
},
})
if podLevelResourcesEnabled && resource.IsPodLevelResourcesSet(allocatedPod) {
if !allocatedPod.Spec.Resources.Limits.Cpu().IsZero() {
cpuLimitsDeclared = true
}
if !allocatedPod.Spec.Resources.Limits.Memory().IsZero() {
memoryLimitsDeclared = true
}
}
// map hugepage pagesize (bytes) to limits (bytes)
hugePageLimits := HugePageLimits(reqs)
cpuRequests := int64(0)
cpuLimits := int64(0)
memoryLimits := int64(0)
if request, found := reqs[v1.ResourceCPU]; found {
cpuRequests = request.MilliValue()
}
if limit, found := limits[v1.ResourceCPU]; found {
cpuLimits = limit.MilliValue()
}
if limit, found := limits[v1.ResourceMemory]; found {
memoryLimits = limit.Value()
}
// convert to CFS values
cpuShares := MilliCPUToShares(cpuRequests)
cpuQuota := MilliCPUToQuota(cpuLimits, int64(cpuPeriod))
// quota is not capped when cfs quota is disabled
if !enforceCPULimits {
cpuQuota = int64(-1)
}
// determine the qos class
qosClass := v1qos.GetPodQOS(allocatedPod)
// build the result
result := &ResourceConfig{}
if qosClass == v1.PodQOSGuaranteed {
result.CPUShares = &cpuShares
result.CPUQuota = &cpuQuota
result.CPUPeriod = &cpuPeriod
result.Memory = &memoryLimits
} else if qosClass == v1.PodQOSBurstable {
result.CPUShares = &cpuShares
if cpuLimitsDeclared {
result.CPUQuota = &cpuQuota
result.CPUPeriod = &cpuPeriod
}
if memoryLimitsDeclared {
result.Memory = &memoryLimits
}
} else {
shares := uint64(MinShares)
result.CPUShares = &shares
}
result.HugePageLimit = hugePageLimits
if enforceMemoryQoS {
memoryMin := int64(0)
if request, found := reqs[v1.ResourceMemory]; found {
memoryMin = request.Value()
}
if memoryMin > 0 {
result.Unified = map[string]string{
Cgroup2MemoryMin: strconv.FormatInt(memoryMin, 10),
}
}
}
return result
}
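// Worked example (illustrative, assuming enforceCPULimits is true and cpuPeriod
// is the default 100000us): a Burstable pod with one container requesting
// cpu: 250m / memory: 64Mi and limited to cpu: 500m / memory: 128Mi yields roughly
//   CPUShares = 256 (from the 250m request)
//   CPUQuota  = 50000, CPUPeriod = 100000 (from the 500m limit)
//   Memory    = 134217728 (the 128Mi limit)
// A BestEffort pod (no requests or limits) only gets CPUShares = MinShares (2).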
// getCgroupSubsystemsV1 returns information about the mounted cgroup v1 subsystems
func getCgroupSubsystemsV1() (*CgroupSubsystems, error) {
// get all cgroup mounts.
allCgroups, err := libcontainercgroups.GetCgroupMounts(true)
if err != nil {
return &CgroupSubsystems{}, err
}
if len(allCgroups) == 0 {
return &CgroupSubsystems{}, fmt.Errorf("failed to find cgroup mounts")
}
mountPoints := make(map[string]string, len(allCgroups))
for _, mount := range allCgroups {
// BEFORE: the kubelet used an arbitrary mount point per cgroup subsystem;
// NOW: more deterministic, the kubelet uses the mount point with the shortest path;
// FUTURE: the expected behavior will be clearly documented.
// ref. issue: https://github.com/kubernetes/kubernetes/issues/95488
for _, subsystem := range mount.Subsystems {
previous := mountPoints[subsystem]
if previous == "" || len(mount.Mountpoint) < len(previous) {
mountPoints[subsystem] = mount.Mountpoint
}
}
}
return &CgroupSubsystems{
Mounts: allCgroups,
MountPoints: mountPoints,
}, nil
}
// getCgroupSubsystemsV2 returns information about the enabled cgroup v2 subsystems
func getCgroupSubsystemsV2() (*CgroupSubsystems, error) {
controllers, err := libcontainercgroups.GetAllSubsystems()
if err != nil {
return nil, err
}
mounts := []libcontainercgroups.Mount{}
mountPoints := make(map[string]string, len(controllers))
for _, controller := range controllers {
mountPoints[controller] = util.CgroupRoot
m := libcontainercgroups.Mount{
Mountpoint: util.CgroupRoot,
Root: util.CgroupRoot,
Subsystems: []string{controller},
}
mounts = append(mounts, m)
}
return &CgroupSubsystems{
Mounts: mounts,
MountPoints: mountPoints,
}, nil
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
if libcontainercgroups.IsCgroup2UnifiedMode() {
return getCgroupSubsystemsV2()
}
return getCgroupSubsystemsV1()
}
// getCgroupProcs takes a cgroup directory name as an argument,
// reads through the cgroup's procs file and returns a list of tgids.
// It returns an empty list if the procs file doesn't exist.
func getCgroupProcs(dir string) ([]int, error) {
procsFile := filepath.Join(dir, "cgroup.procs")
f, err := os.Open(procsFile)
if err != nil {
if os.IsNotExist(err) {
// The procsFile does not exist, so no pids are attached to this directory
return []int{}, nil
}
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
out := []int{}
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, fmt.Errorf("unexpected line in %v; could not convert to pid: %v", procsFile, err)
}
out = append(out, pid)
}
}
return out, nil
}
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return podCgroupNamePrefix + string(podUID)
}
// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
nodeAllocatableRoot := ParseCgroupfsToCgroupName(cgroupRoot)
if cgroupsPerQOS {
nodeAllocatableRoot = NewCgroupName(nodeAllocatableRoot, defaultNodeAllocatableCgroupName)
}
if cgroupDriver == "systemd" {
return nodeAllocatableRoot.ToSystemd()
}
return nodeAllocatableRoot.ToCgroupfs()
}
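// Illustrative example: with cgroupRoot "/", cgroupsPerQOS=true and the default
// node-allocatable cgroup name "kubepods", the cgroupfs driver yields "/kubepods",
// while the systemd driver yields the corresponding slice form (roughly
// "/kubepods.slice"; the exact string depends on cgroup name helpers defined elsewhere).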
// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
if kubeletCgroups == "" {
cont, err := getContainer(os.Getpid())
if err != nil {
return "", err
}
return cont, nil
}
return kubeletCgroups, nil
}

View File

@ -0,0 +1,76 @@
//go:build !linux
// +build !linux
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
const (
MinShares = 0
MaxShares = 0
SharesPerCPU = 0
MilliCPUToCPU = 0
QuotaPeriod = 0
MinQuotaPeriod = 0
MinMilliCPULimit = 0
)
// MilliCPUToQuota converts milliCPU and period to CFS quota values.
func MilliCPUToQuota(milliCPU, period int64) int64 {
return 0
}
// MilliCPUToShares converts the milliCPU to CFS shares.
func MilliCPUToShares(milliCPU int64) uint64 {
return 0
}
// ResourceConfigForPod takes the input pod and outputs the cgroup resource config.
func ResourceConfigForPod(pod *v1.Pod, enforceCPULimit bool, cpuPeriod uint64, enforceMemoryQoS bool) *ResourceConfig {
return nil
}
// GetCgroupSubsystems returns information about the mounted cgroup subsystems
func GetCgroupSubsystems() (*CgroupSubsystems, error) {
return nil, nil
}
func getCgroupProcs(dir string) ([]int, error) {
return nil, nil
}
// GetPodCgroupNameSuffix returns the last element of the pod CgroupName identifier
func GetPodCgroupNameSuffix(podUID types.UID) string {
return ""
}
// NodeAllocatableRoot returns the literal cgroup path for the node allocatable cgroup
func NodeAllocatableRoot(cgroupRoot string, cgroupsPerQOS bool, cgroupDriver string) string {
return ""
}
// GetKubeletContainer returns the cgroup the kubelet will use
func GetKubeletContainer(kubeletCgroups string) (string, error) {
return "", nil
}

View File

@ -0,0 +1,56 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
type InternalContainerLifecycle interface {
PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error
PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error
PostStopContainer(containerID string) error
}
// Implements InternalContainerLifecycle interface.
type internalContainerLifecycleImpl struct {
cpuManager cpumanager.Manager
memoryManager memorymanager.Manager
topologyManager topologymanager.Manager
}
func (i *internalContainerLifecycleImpl) PreStartContainer(pod *v1.Pod, container *v1.Container, containerID string) error {
if i.cpuManager != nil {
i.cpuManager.AddContainer(pod, container, containerID)
}
if i.memoryManager != nil {
i.memoryManager.AddContainer(pod, container, containerID)
}
i.topologyManager.AddContainer(pod, container, containerID)
return nil
}
func (i *internalContainerLifecycleImpl) PostStopContainer(containerID string) error {
return i.topologyManager.RemoveContainer(containerID)
}

View File

@ -0,0 +1,51 @@
//go:build linux
// +build linux
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"strconv"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
if i.cpuManager != nil {
allocatedCPUs := i.cpuManager.GetCPUAffinity(string(pod.UID), container.Name)
if !allocatedCPUs.IsEmpty() {
containerConfig.Linux.Resources.CpusetCpus = allocatedCPUs.String()
}
}
if i.memoryManager != nil {
numaNodes := i.memoryManager.GetMemoryNUMANodes(pod, container)
if numaNodes.Len() > 0 {
var affinity []string
for _, numaNode := range sets.List(numaNodes) {
affinity = append(affinity, strconv.Itoa(numaNode))
}
containerConfig.Linux.Resources.CpusetMems = strings.Join(affinity, ",")
}
}
return nil
}
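A small standalone sketch of the string formats written into CpusetCpus and CpusetMems above, using the k8s.io/utils/cpuset package; the CPU and NUMA node IDs are illustrative:

package main

import (
	"fmt"
	"strconv"
	"strings"

	"k8s.io/utils/cpuset"
)

func main() {
	// CPU manager side: an allocated CPU set renders in Linux cpuset list format.
	allocated := cpuset.New(0, 1, 2, 3, 8)
	fmt.Println(allocated.String()) // "0-3,8"

	// Memory manager side: NUMA node IDs are joined into a comma-separated cpuset.mems value.
	numaNodes := []int{0, 1}
	var affinity []string
	for _, n := range numaNodes {
		affinity = append(affinity, strconv.Itoa(n))
	}
	fmt.Println(strings.Join(affinity, ",")) // "0,1"
}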

View File

@ -0,0 +1,29 @@
//go:build !linux && !windows
// +build !linux,!windows
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"k8s.io/api/core/v1"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
return nil
}

View File

@ -0,0 +1,141 @@
//go:build windows
// +build windows
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/winstats"
"k8s.io/utils/cpuset"
)
func (i *internalContainerLifecycleImpl) PreCreateContainer(pod *v1.Pod, container *v1.Container, containerConfig *runtimeapi.ContainerConfig) error {
if !utilfeature.DefaultFeatureGate.Enabled(kubefeatures.WindowsCPUAndMemoryAffinity) {
return nil
}
klog.V(4).Info("PreCreateContainer for Windows")
// retrieve CPU and NUMA affinity from CPU Manager and Memory Manager (if enabled)
var allocatedCPUs cpuset.CPUSet
if i.cpuManager != nil {
allocatedCPUs = i.cpuManager.GetCPUAffinity(string(pod.UID), container.Name)
}
var numaNodes sets.Set[int]
if i.memoryManager != nil {
numaNodes = i.memoryManager.GetMemoryNUMANodes(pod, container)
}
// Gather all CPUs associated with the selected NUMA nodes
var allNumaNodeCPUs []winstats.GroupAffinity
for _, numaNode := range sets.List(numaNodes) {
affinity, err := winstats.GetCPUsforNUMANode(uint16(numaNode))
if err != nil {
return fmt.Errorf("failed to get CPUs for NUMA node %d: %v", numaNode, err)
}
allNumaNodeCPUs = append(allNumaNodeCPUs, *affinity)
}
var finalCPUSet = computeFinalCpuSet(allocatedCPUs, allNumaNodeCPUs)
klog.V(4).InfoS("Setting CPU affinity", "affinity", finalCPUSet, "container", container.Name, "pod", pod.UID)
// Set CPU group affinities in the container config
if finalCPUSet != nil {
var cpusToGroupAffinities []*runtimeapi.WindowsCpuGroupAffinity
for group, mask := range groupMasks(finalCPUSet) {
cpusToGroupAffinities = append(cpusToGroupAffinities, &runtimeapi.WindowsCpuGroupAffinity{
CpuGroup: uint32(group),
CpuMask: uint64(mask),
})
}
containerConfig.Windows.Resources.AffinityCpus = cpusToGroupAffinities
}
// return nil if no CPUs were selected
return nil
}
// computeFinalCpuSet determines the final set of CPUs to use based on the CPU and memory managers
// and is extracted so that it can be tested
func computeFinalCpuSet(allocatedCPUs cpuset.CPUSet, allNumaNodeCPUs []winstats.GroupAffinity) sets.Set[int] {
if !allocatedCPUs.IsEmpty() && len(allNumaNodeCPUs) > 0 {
// Both CPU and memory managers are enabled
numaNodeAffinityCPUSet := computeCPUSet(allNumaNodeCPUs)
cpuManagerAffinityCPUSet := sets.New[int](allocatedCPUs.List()...)
// Determine which set of CPUs to use using the following logic outlined in the KEP:
// Case 1: CPU manager selects more CPUs than those available in the NUMA nodes selected by the memory manager
// Case 2: CPU manager selects fewer CPUs, and they all fall within the CPUs available in the NUMA nodes selected by the memory manager
// Case 3: CPU manager selects fewer CPUs, but some are outside of the CPUs available in the NUMA nodes selected by the memory manager
if cpuManagerAffinityCPUSet.Len() > numaNodeAffinityCPUSet.Len() {
// Case 1, use CPU manager selected CPUs
return cpuManagerAffinityCPUSet
} else if numaNodeAffinityCPUSet.IsSuperset(cpuManagerAffinityCPUSet) {
// Case 2, use CPU manager selected CPUs
return cpuManagerAffinityCPUSet
} else {
// Case 3, merge CPU manager and memory manager selected CPUs
return cpuManagerAffinityCPUSet.Union(numaNodeAffinityCPUSet)
}
} else if !allocatedCPUs.IsEmpty() {
// Only CPU manager is enabled, use CPU manager selected CPUs
return sets.New[int](allocatedCPUs.List()...)
} else if len(allNumaNodeCPUs) > 0 {
// Only memory manager is enabled, use CPUs associated with selected NUMA nodes
return computeCPUSet(allNumaNodeCPUs)
}
return nil
}
// computeCPUSet converts a list of GroupAffinity to a set of CPU IDs
func computeCPUSet(affinities []winstats.GroupAffinity) sets.Set[int] {
cpuSet := sets.New[int]()
for _, affinity := range affinities {
for i := 0; i < 64; i++ {
if (affinity.Mask>>i)&1 == 1 {
cpuID := int(affinity.Group)*64 + i
cpuSet.Insert(cpuID)
}
}
}
return cpuSet
}
// groupMasks converts a set of CPU IDs into group and mask representations
func groupMasks(cpuSet sets.Set[int]) map[int]uint64 {
groupMasks := make(map[int]uint64)
for cpu := range cpuSet {
group := cpu / 64
mask := uint64(1) << (cpu % 64)
groupMasks[group] |= mask
}
return groupMasks
}
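A standalone sketch of the group/mask arithmetic above (the CPU IDs are illustrative; the real code derives them from winstats.GroupAffinity values):

package main

import "fmt"

// groupMasksSketch mirrors the groupMasks logic: Windows addresses CPUs as a
// (processor group, 64-bit mask) pair, so CPU ID 70 lands in group 1 with bit 6 set.
func groupMasksSketch(cpus []int) map[int]uint64 {
	masks := make(map[int]uint64)
	for _, cpu := range cpus {
		masks[cpu/64] |= uint64(1) << (cpu % 64)
	}
	return masks
}

func main() {
	masks := groupMasksSketch([]int{1, 70})
	fmt.Printf("group 0 mask: %b\n", masks[0]) // 10      (bit 1 set)
	fmt.Printf("group 1 mask: %b\n", masks[1]) // 1000000 (bit 6 set)
}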

View File

@ -0,0 +1,94 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
type fakeManager struct {
state state.State
}
func (m *fakeManager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Start()")
return nil
}
func (m *fakeManager) Policy() Policy {
klog.InfoS("Policy()")
return NewPolicyNone()
}
func (m *fakeManager) Allocate(pod *v1.Pod, container *v1.Container) error {
klog.InfoS("Allocate", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
klog.InfoS("Add container", "pod", klog.KObj(pod), "containerName", container.Name, "containerID", containerID)
}
func (m *fakeManager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int] {
klog.InfoS("Get MemoryNUMANodes", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
func (m *fakeManager) RemoveContainer(containerID string) error {
klog.InfoS("RemoveContainer", "containerID", containerID)
return nil
}
func (m *fakeManager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get Topology Hints", "pod", klog.KObj(pod), "containerName", container.Name)
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
klog.InfoS("Get Pod Topology Hints", "pod", klog.KObj(pod))
return map[string][]topologymanager.TopologyHint{}
}
func (m *fakeManager) State() state.Reader {
return m.state
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (m *fakeManager) GetAllocatableMemory() []state.Block {
klog.InfoS("Get Allocatable Memory")
return []state.Block{}
}
// GetMemory returns the memory allocated by a container from NUMA nodes
func (m *fakeManager) GetMemory(podUID, containerName string) []state.Block {
klog.InfoS("Get Memory", "podUID", podUID, "containerName", containerName)
return []state.Block{}
}
// NewFakeManager creates empty/fake memory manager
func NewFakeManager() Manager {
return &fakeManager{
state: state.NewMemoryState(),
}
}

View File

@ -0,0 +1,467 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
"context"
"fmt"
"runtime"
"sync"
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
corev1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
"k8s.io/kubernetes/pkg/kubelet/cm/containermap"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
"k8s.io/kubernetes/pkg/kubelet/config"
"k8s.io/kubernetes/pkg/kubelet/status"
)
// memoryManagerStateFileName is the file name where memory manager stores its state
const memoryManagerStateFileName = "memory_manager_state"
// ActivePodsFunc is a function that returns a list of active pods
type ActivePodsFunc func() []*v1.Pod
type runtimeService interface {
UpdateContainerResources(ctx context.Context, id string, resources *runtimeapi.ContainerResources) error
}
type sourcesReadyStub struct{}
func (s *sourcesReadyStub) AddSource(source string) {}
func (s *sourcesReadyStub) AllReady() bool { return true }
// Manager interface provides methods for Kubelet to manage pod memory.
type Manager interface {
// Start is called during Kubelet initialization.
Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error
// AddContainer adds the mapping between a container ID and its pod UID and container name.
// The mapping is used to remove the memory allocation during container removal.
AddContainer(p *v1.Pod, c *v1.Container, containerID string)
// Allocate is called to pre-allocate memory resources during Pod admission.
// This must be called at some point prior to the AddContainer() call for a container, e.g. at pod admission time.
Allocate(pod *v1.Pod, container *v1.Container) error
// RemoveContainer is called after Kubelet decides to kill or delete a
// container. After this call, any memory allocated to the container is freed.
RemoveContainer(containerID string) error
// State returns a read-only interface to the internal memory manager state.
State() state.Reader
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(*v1.Pod, *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetPodTopologyHints(*v1.Pod) map[string][]topologymanager.TopologyHint
// GetMemoryNUMANodes provides NUMA nodes that are used to allocate the container memory
GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int]
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
GetAllocatableMemory() []state.Block
// GetMemory returns the memory allocated by a container from NUMA nodes
GetMemory(podUID, containerName string) []state.Block
}
type manager struct {
sync.Mutex
policy Policy
// state allows restoring information regarding memory allocation for guaranteed pods
// in the case of a kubelet restart
state state.State
// containerRuntime is the container runtime service interface needed
// to make UpdateContainerResources() calls against the containers.
containerRuntime runtimeService
// activePods is a method for listing active pods on the node
// so all the containers can be updated during call to the removeStaleState.
activePods ActivePodsFunc
// podStatusProvider provides a method for obtaining pod statuses
// and the containerID of their containers
podStatusProvider status.PodStatusProvider
// containerMap provides a mapping from (pod, container) -> containerID
// for all containers in a pod
containerMap containermap.ContainerMap
// sourcesReady provides the readiness of kubelet configuration sources such as apiserver update readiness.
// We use it to determine when we can purge inactive pods from checkpointed state.
sourcesReady config.SourcesReady
// stateFileDirectory holds the directory where the state file for checkpoints is held.
stateFileDirectory string
// allocatableMemory holds the allocatable memory for each NUMA node
allocatableMemory []state.Block
}
var _ Manager = &manager{}
// NewManager returns new instance of the memory manager
func NewManager(policyName string, machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation, stateFileDirectory string, affinity topologymanager.Store) (Manager, error) {
var policy Policy
switch policyType(policyName) {
case policyTypeNone:
policy = NewPolicyNone()
case policyTypeStatic:
if runtime.GOOS == "windows" {
return nil, fmt.Errorf("policy %q is not available on Windows", policyTypeStatic)
}
systemReserved, err := getSystemReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory)
if err != nil {
return nil, err
}
policy, err = NewPolicyStatic(machineInfo, systemReserved, affinity)
if err != nil {
return nil, err
}
case policyTypeBestEffort:
if runtime.GOOS == "windows" {
systemReserved, err := getSystemReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory)
if err != nil {
return nil, err
}
policy, err = NewPolicyBestEffort(machineInfo, systemReserved, affinity)
if err != nil {
return nil, err
}
} else {
return nil, fmt.Errorf("policy %q is not available for platform %q", policyTypeBestEffort, runtime.GOOS)
}
default:
return nil, fmt.Errorf("unknown policy: %q", policyName)
}
manager := &manager{
policy: policy,
stateFileDirectory: stateFileDirectory,
}
manager.sourcesReady = &sourcesReadyStub{}
return manager, nil
}
// Start starts the memory manager under the kubelet and calls policy start
func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesReady, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService, initialContainers containermap.ContainerMap) error {
klog.InfoS("Starting memorymanager", "policy", m.policy.Name())
m.sourcesReady = sourcesReady
m.activePods = activePods
m.podStatusProvider = podStatusProvider
m.containerRuntime = containerRuntime
m.containerMap = initialContainers
stateImpl, err := state.NewCheckpointState(m.stateFileDirectory, memoryManagerStateFileName, m.policy.Name())
if err != nil {
klog.ErrorS(err, "Could not initialize checkpoint manager, please drain node and remove policy state file")
return err
}
m.state = stateImpl
err = m.policy.Start(m.state)
if err != nil {
klog.ErrorS(err, "Policy start error")
return err
}
m.allocatableMemory = m.policy.GetAllocatableMemory(m.state)
return nil
}
// AddContainer saves the value of requested memory for the guaranteed pod under the state and sets memory affinity according to the topology manager
func (m *manager) AddContainer(pod *v1.Pod, container *v1.Container, containerID string) {
m.Lock()
defer m.Unlock()
m.containerMap.Add(string(pod.UID), container.Name, containerID)
// Since we know that each init container always runs to completion before
// the next container starts, we can safely remove references to any previously
// started init containers. This will free up the memory from these init containers
// for use in other pods. If the current container happens to be an init container,
// we skip deletion of it until the next container is added, and this is called again.
for _, initContainer := range pod.Spec.InitContainers {
if initContainer.Name == container.Name {
break
}
// Since a restartable init container remains running for the full
// duration of the pod's lifecycle, we should not remove it from the
// memory manager state.
if podutil.IsRestartableInitContainer(&initContainer) {
continue
}
m.policyRemoveContainerByRef(string(pod.UID), initContainer.Name)
}
}
// GetMemoryNUMANodes provides the NUMA nodes that are used to allocate the container memory
func (m *manager) GetMemoryNUMANodes(pod *v1.Pod, container *v1.Container) sets.Set[int] {
// Get NUMA node affinity of blocks assigned to the container during Allocate()
numaNodes := sets.New[int]()
for _, block := range m.state.GetMemoryBlocks(string(pod.UID), container.Name) {
for _, nodeID := range block.NUMAAffinity {
// avoid node duplication when hugepages and memory blocks are pinned to the same NUMA node
numaNodes.Insert(nodeID)
}
}
if numaNodes.Len() == 0 {
klog.V(5).InfoS("No allocation is available", "pod", klog.KObj(pod), "containerName", container.Name)
return nil
}
klog.InfoS("Memory affinity", "pod", klog.KObj(pod), "containerName", container.Name, "numaNodes", numaNodes)
return numaNodes
}
// Allocate is called to pre-allocate memory resources during Pod admission.
func (m *manager) Allocate(pod *v1.Pod, container *v1.Container) error {
// Garbage collect any stranded resources before allocation
m.removeStaleState()
m.Lock()
defer m.Unlock()
// Call down into the policy to assign this container memory if required.
if err := m.policy.Allocate(m.state, pod, container); err != nil {
klog.ErrorS(err, "Allocate error")
return err
}
return nil
}
// RemoveContainer removes the container from the state
func (m *manager) RemoveContainer(containerID string) error {
m.Lock()
defer m.Unlock()
// if an error occurs it means the container entry no longer exists in the container map
podUID, containerName, err := m.containerMap.GetContainerRef(containerID)
if err != nil {
klog.InfoS("Failed to get container from container map", "containerID", containerID, "err", err)
return nil
}
m.policyRemoveContainerByRef(podUID, containerName)
return nil
}
// State returns the state of the manager
func (m *manager) State() state.Reader {
return m.state
}
// GetPodTopologyHints returns the topology hints for the topology manager
func (m *manager) GetPodTopologyHints(pod *v1.Pod) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetPodTopologyHints(m.state, pod)
}
// GetTopologyHints returns the topology hints for the topology manager
func (m *manager) GetTopologyHints(pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
// Garbage collect any stranded resources before providing TopologyHints
m.removeStaleState()
// Delegate to active policy
return m.policy.GetTopologyHints(m.state, pod, container)
}
// TODO: move the method to the upper level, to re-use it under the CPU and memory managers
func (m *manager) removeStaleState() {
// Only once all sources are ready do we attempt to remove any stale state.
// This ensures that the call to `m.activePods()` below will succeed with
// the actual active pods list.
if !m.sourcesReady.AllReady() {
return
}
// We grab the lock to ensure that no new containers will grab memory blocks while
// executing the code below. Without this lock, it's possible that we end up
// removing state that is newly added by an asynchronous call to
// AddContainer() during the execution of this code.
m.Lock()
defer m.Unlock()
// Get the list of active pods.
activePods := m.activePods()
// Build a list of (podUID, containerName) pairs for all containers in all active Pods.
activeContainers := make(map[string]map[string]struct{})
for _, pod := range activePods {
activeContainers[string(pod.UID)] = make(map[string]struct{})
for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
activeContainers[string(pod.UID)][container.Name] = struct{}{}
}
}
// Loop through the MemoryManager state. Remove any state for containers not
// in the `activeContainers` list built above.
assignments := m.state.GetMemoryAssignments()
for podUID := range assignments {
for containerName := range assignments[podUID] {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
}
}
m.containerMap.Visit(func(podUID, containerName, containerID string) {
if _, ok := activeContainers[podUID][containerName]; !ok {
klog.InfoS("RemoveStaleState removing state", "podUID", podUID, "containerName", containerName)
m.policyRemoveContainerByRef(podUID, containerName)
}
})
}
func (m *manager) policyRemoveContainerByRef(podUID string, containerName string) {
m.policy.RemoveContainer(m.state, podUID, containerName)
m.containerMap.RemoveByContainerRef(podUID, containerName)
}
func getTotalMemoryTypeReserved(machineInfo *cadvisorapi.MachineInfo, reservedMemory []kubeletconfig.MemoryReservation) (map[v1.ResourceName]resource.Quantity, error) {
totalMemoryType := map[v1.ResourceName]resource.Quantity{}
numaNodes := map[int]bool{}
for _, numaNode := range machineInfo.Topology {
numaNodes[numaNode.Id] = true
}
for _, reservation := range reservedMemory {
if !numaNodes[int(reservation.NumaNode)] {
return nil, fmt.Errorf("the reserved memory configuration references a NUMA node %d that does not exist on this machine", reservation.NumaNode)
}
for resourceName, q := range reservation.Limits {
if value, ok := totalMemoryType[resourceName]; ok {
q.Add(value)
}
totalMemoryType[resourceName] = q
}
}
return totalMemoryType, nil
}
func validateReservedMemory(machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation) error {
totalMemoryType, err := getTotalMemoryTypeReserved(machineInfo, reservedMemory)
if err != nil {
return err
}
commonMemoryTypeSet := make(map[v1.ResourceName]bool)
for resourceType := range totalMemoryType {
commonMemoryTypeSet[resourceType] = true
}
for resourceType := range nodeAllocatableReservation {
if !(corev1helper.IsHugePageResourceName(resourceType) || resourceType == v1.ResourceMemory) {
continue
}
commonMemoryTypeSet[resourceType] = true
}
for resourceType := range commonMemoryTypeSet {
nodeAllocatableMemory := resource.NewQuantity(0, resource.DecimalSI)
if memValue, set := nodeAllocatableReservation[resourceType]; set {
nodeAllocatableMemory.Add(memValue)
}
reservedMemory := resource.NewQuantity(0, resource.DecimalSI)
if memValue, set := totalMemoryType[resourceType]; set {
reservedMemory.Add(memValue)
}
if !(*nodeAllocatableMemory).Equal(*reservedMemory) {
return fmt.Errorf("the total amount %q of type %q is not equal to the value %q determined by Node Allocatable feature", reservedMemory.String(), resourceType, nodeAllocatableMemory.String())
}
}
return nil
}
func convertReserved(machineInfo *cadvisorapi.MachineInfo, reservedMemory []kubeletconfig.MemoryReservation) (systemReservedMemory, error) {
reservedMemoryConverted := make(map[int]map[v1.ResourceName]uint64)
for _, node := range machineInfo.Topology {
reservedMemoryConverted[node.Id] = make(map[v1.ResourceName]uint64)
}
for _, reservation := range reservedMemory {
for resourceName, q := range reservation.Limits {
val, success := q.AsInt64()
if !success {
return nil, fmt.Errorf("could not covert a variable of type Quantity to int64")
}
reservedMemoryConverted[int(reservation.NumaNode)][resourceName] = uint64(val)
}
}
return reservedMemoryConverted, nil
}
func getSystemReservedMemory(machineInfo *cadvisorapi.MachineInfo, nodeAllocatableReservation v1.ResourceList, reservedMemory []kubeletconfig.MemoryReservation) (systemReservedMemory, error) {
if err := validateReservedMemory(machineInfo, nodeAllocatableReservation, reservedMemory); err != nil {
return nil, err
}
reservedMemoryConverted, err := convertReserved(machineInfo, reservedMemory)
if err != nil {
return nil, err
}
return reservedMemoryConverted, nil
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (m *manager) GetAllocatableMemory() []state.Block {
return m.allocatableMemory
}
// GetMemory returns the memory allocated by a container from NUMA nodes
func (m *manager) GetMemory(podUID, containerName string) []state.Block {
return m.state.GetMemoryBlocks(podUID, containerName)
}
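As a sketch of the reservation input NewManager expects (configured via the kubelet's reserved memory settings and carried as kubeletconfig.MemoryReservation entries; the NUMA node and quantity below are illustrative):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
)

func main() {
	// One reservation per NUMA node; validateReservedMemory requires the per-resource
	// totals to equal the Node Allocatable reservation for memory and hugepages.
	reserved := []kubeletconfig.MemoryReservation{
		{
			NumaNode: 0,
			Limits: v1.ResourceList{
				v1.ResourceMemory: resource.MustParse("1Gi"),
			},
		},
	}
	for _, r := range reserved {
		fmt.Printf("NUMA node %d reserves %s of memory\n", r.NumaNode, r.Limits.Memory().String())
	}
}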

View File

@ -0,0 +1,46 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// Type defines the policy type
type policyType string
// Policy implements the logic for assigning memory to pod containers.
type Policy interface {
Name() string
Start(s state.State) error
// Allocate call is idempotent
Allocate(s state.State, pod *v1.Pod, container *v1.Container) error
// RemoveContainer call is idempotent
RemoveContainer(s state.State, podUID string, containerName string)
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
GetAllocatableMemory(s state.State) []state.Block
}

View File

@ -0,0 +1,80 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
cadvisorapi "github.com/google/cadvisor/info/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
// On Windows we want to use the same logic as the StaticPolicy to compute the memory topology hints,
// but unlike Linux-based systems, NUMA nodes cannot be directly assigned or guaranteed via Windows APIs
// (the Windows scheduler will use the NUMA node closest to the assigned CPU, therefore respecting the NUMA
// node assignment as a best effort). Because of this we don't want users to specify "StaticPolicy" for the
// memory manager policy via the kubelet configuration. Instead we use the "BestEffort" policy, which reuses
// the StaticPolicy logic and thereby reduces code duplication.
const policyTypeBestEffort policyType = "BestEffort"
// bestEffortPolicy is implementation of the policy interface for the BestEffort policy
type bestEffortPolicy struct {
static *staticPolicy
}
var _ Policy = &bestEffortPolicy{}
func NewPolicyBestEffort(machineInfo *cadvisorapi.MachineInfo, reserved systemReservedMemory, affinity topologymanager.Store) (Policy, error) {
p, err := NewPolicyStatic(machineInfo, reserved, affinity)
if err != nil {
return nil, err
}
return &bestEffortPolicy{
static: p.(*staticPolicy),
}, nil
}
func (p *bestEffortPolicy) Name() string {
return string(policyTypeBestEffort)
}
func (p *bestEffortPolicy) Start(s state.State) error {
return p.static.Start(s)
}
func (p *bestEffortPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Container) (rerr error) {
return p.static.Allocate(s, pod, container)
}
func (p *bestEffortPolicy) RemoveContainer(s state.State, podUID string, containerName string) {
p.static.RemoveContainer(s, podUID, containerName)
}
func (p *bestEffortPolicy) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return p.static.GetPodTopologyHints(s, pod)
}
func (p *bestEffortPolicy) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return p.static.GetTopologyHints(s, pod, container)
}
func (p *bestEffortPolicy) GetAllocatableMemory(s state.State) []state.Block {
return p.static.GetAllocatableMemory(s)
}

View File

@ -0,0 +1,72 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package memorymanager
import (
v1 "k8s.io/api/core/v1"
"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
)
const policyTypeNone policyType = "None"
// none is the implementation of the Policy interface for the none policy; using the
// none policy is the same as disabling memory management
type none struct{}
var _ Policy = &none{}
// NewPolicyNone returns new none policy instance
func NewPolicyNone() Policy {
return &none{}
}
func (p *none) Name() string {
return string(policyTypeNone)
}
func (p *none) Start(s state.State) error {
return nil
}
// Allocate call is idempotent
func (p *none) Allocate(s state.State, pod *v1.Pod, container *v1.Container) error {
return nil
}
// RemoveContainer call is idempotent
func (p *none) RemoveContainer(s state.State, podUID string, containerName string) {
}
// GetTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
func (p *none) GetTopologyHints(s state.State, pod *v1.Pod, container *v1.Container) map[string][]topologymanager.TopologyHint {
return nil
}
// GetPodTopologyHints implements the topologymanager.HintProvider Interface
// and is consulted to achieve NUMA aware resource alignment among this
// and other resource controllers.
func (p *none) GetPodTopologyHints(s state.State, pod *v1.Pod) map[string][]topologymanager.TopologyHint {
return nil
}
// GetAllocatableMemory returns the amount of allocatable memory for each NUMA node
func (p *none) GetAllocatableMemory(s state.State) []state.Block {
return []state.Block{}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,65 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"encoding/json"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/checksum"
)
var _ checkpointmanager.Checkpoint = &MemoryManagerCheckpoint{}
// MemoryManagerCheckpoint struct is used to store memory/pod assignments in a checkpoint
type MemoryManagerCheckpoint struct {
PolicyName string `json:"policyName"`
MachineState NUMANodeMap `json:"machineState"`
Entries ContainerMemoryAssignments `json:"entries,omitempty"`
Checksum checksum.Checksum `json:"checksum"`
}
// NewMemoryManagerCheckpoint returns an instance of Checkpoint
func NewMemoryManagerCheckpoint() *MemoryManagerCheckpoint {
//nolint:staticcheck // unexported-type-in-api user-facing error message
return &MemoryManagerCheckpoint{
Entries: ContainerMemoryAssignments{},
MachineState: NUMANodeMap{},
}
}
// MarshalCheckpoint returns marshalled checkpoint
func (mp *MemoryManagerCheckpoint) MarshalCheckpoint() ([]byte, error) {
// make sure checksum wasn't set before so it doesn't affect output checksum
mp.Checksum = 0
mp.Checksum = checksum.New(mp)
return json.Marshal(*mp)
}
// UnmarshalCheckpoint tries to unmarshal passed bytes to checkpoint
func (mp *MemoryManagerCheckpoint) UnmarshalCheckpoint(blob []byte) error {
return json.Unmarshal(blob, mp)
}
// VerifyChecksum verifies that current checksum of checkpoint is valid
func (mp *MemoryManagerCheckpoint) VerifyChecksum() error {
ck := mp.Checksum
mp.Checksum = 0
err := ck.Verify(mp)
mp.Checksum = ck
return err
}
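The zero-before-hash pattern in MarshalCheckpoint and VerifyChecksum keeps the stored checksum out of its own computation. A minimal standalone sketch of the same pattern, using fnv as a stand-in for the kubelet's checksum package:

package main

import (
	"encoding/json"
	"fmt"
	"hash/fnv"
)

type checkpoint struct {
	Data     string `json:"data"`
	Checksum uint32 `json:"checksum"`
}

// compute hashes the checkpoint with its Checksum field zeroed, so the stored
// value never influences the hash that is supposed to protect it.
func compute(c checkpoint) uint32 {
	c.Checksum = 0
	b, _ := json.Marshal(c)
	h := fnv.New32a()
	h.Write(b)
	return h.Sum32()
}

func main() {
	c := checkpoint{Data: "machine state"}
	c.Checksum = compute(c)
	fmt.Println("valid:", compute(c) == c.Checksum) // true

	c.Data = "tampered"
	fmt.Println("valid:", compute(c) == c.Checksum) // false
}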

View File

@ -0,0 +1,130 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
v1 "k8s.io/api/core/v1"
)
// MemoryTable contains memory information
type MemoryTable struct {
TotalMemSize uint64 `json:"total"`
SystemReserved uint64 `json:"systemReserved"`
Allocatable uint64 `json:"allocatable"`
Reserved uint64 `json:"reserved"`
Free uint64 `json:"free"`
}
// NUMANodeState contains NUMA node related information
type NUMANodeState struct {
// NumberOfAssignments contains the number of memory assignments from this node.
// When a container requires both memory and hugepages, it increases the number of assignments by two.
NumberOfAssignments int `json:"numberOfAssignments"`
// MemoryTable contains NUMA node memory related information
MemoryMap map[v1.ResourceName]*MemoryTable `json:"memoryMap"`
// Cells contains the current NUMA node and all other nodes that are in a group with the current NUMA node.
// This parameter indicates whether the current node is used for multi-NUMA-node memory allocation.
// For example, if some container is pinned to NUMA nodes 0, 1 and 2, each of those nodes will have
// this parameter equal to [0, 1, 2] under the state.
Cells []int `json:"cells"`
}
// NUMANodeMap contains memory information for each NUMA node.
type NUMANodeMap map[int]*NUMANodeState
// Clone returns a copy of NUMANodeMap
func (nm NUMANodeMap) Clone() NUMANodeMap {
clone := make(NUMANodeMap)
for node, s := range nm {
if s == nil {
clone[node] = nil
continue
}
clone[node] = &NUMANodeState{}
clone[node].NumberOfAssignments = s.NumberOfAssignments
clone[node].Cells = append([]int{}, s.Cells...)
if s.MemoryMap == nil {
continue
}
clone[node].MemoryMap = map[v1.ResourceName]*MemoryTable{}
for memoryType, memoryTable := range s.MemoryMap {
clone[node].MemoryMap[memoryType] = &MemoryTable{
Allocatable: memoryTable.Allocatable,
Free: memoryTable.Free,
Reserved: memoryTable.Reserved,
SystemReserved: memoryTable.SystemReserved,
TotalMemSize: memoryTable.TotalMemSize,
}
}
}
return clone
}
// Block is a data structure used to represent a certain amount of memory
type Block struct {
// NUMAAffinity contains the string that represents NUMA affinity bitmask
NUMAAffinity []int `json:"numaAffinity"`
Type v1.ResourceName `json:"type"`
Size uint64 `json:"size"`
}
// ContainerMemoryAssignments stores memory assignments of containers
type ContainerMemoryAssignments map[string]map[string][]Block
// Clone returns a copy of ContainerMemoryAssignments
func (as ContainerMemoryAssignments) Clone() ContainerMemoryAssignments {
clone := make(ContainerMemoryAssignments)
for pod := range as {
clone[pod] = make(map[string][]Block)
for container, blocks := range as[pod] {
clone[pod][container] = append([]Block{}, blocks...)
}
}
return clone
}
// Reader interface used to read current memory/pod assignment state
type Reader interface {
// GetMachineState returns Memory Map stored in the State
GetMachineState() NUMANodeMap
// GetMemoryBlocks returns memory assignments of a container
GetMemoryBlocks(podUID string, containerName string) []Block
// GetMemoryAssignments returns ContainerMemoryAssignments
GetMemoryAssignments() ContainerMemoryAssignments
}
type writer interface {
// SetMachineState stores NUMANodeMap in State
SetMachineState(memoryMap NUMANodeMap)
// SetMemoryBlocks stores memory assignments of a container
SetMemoryBlocks(podUID string, containerName string, blocks []Block)
// SetMemoryAssignments sets ContainerMemoryAssignments by using the passed parameter
SetMemoryAssignments(assignments ContainerMemoryAssignments)
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
Delete(podUID string, containerName string)
// ClearState clears machineState and ContainerMemoryAssignments
ClearState()
}
// State interface provides methods for tracking and setting memory/pod assignment
type State interface {
Reader
writer
}
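A small sketch of how these types fit together, assuming the exported state package shown above; the pod UID, container name and sizes are illustrative:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/kubelet/cm/memorymanager/state"
)

func main() {
	// One container pinned to NUMA node 0 with 1Gi of regular memory and one 2Mi hugepage.
	assignments := state.ContainerMemoryAssignments{
		"pod-uid-1234": {
			"app": []state.Block{
				{NUMAAffinity: []int{0}, Type: v1.ResourceMemory, Size: 1 << 30},
				{NUMAAffinity: []int{0}, Type: v1.ResourceName(v1.ResourceHugePagesPrefix + "2Mi"), Size: 2 << 20},
			},
		},
	}
	for pod, containers := range assignments.Clone() {
		for name, blocks := range containers {
			fmt.Println(pod, name, len(blocks), "blocks")
		}
	}
}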

View File

@ -0,0 +1,184 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"fmt"
"path/filepath"
"sync"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
)
var _ State = &stateCheckpoint{}
type stateCheckpoint struct {
sync.RWMutex
cache State
policyName string
checkpointManager checkpointmanager.CheckpointManager
checkpointName string
}
// NewCheckpointState creates new State for keeping track of memory/pod assignment with checkpoint backend
func NewCheckpointState(stateDir, checkpointName, policyName string) (State, error) {
checkpointManager, err := checkpointmanager.NewCheckpointManager(stateDir)
if err != nil {
return nil, fmt.Errorf("failed to initialize checkpoint manager: %v", err)
}
stateCheckpoint := &stateCheckpoint{
cache: NewMemoryState(),
policyName: policyName,
checkpointManager: checkpointManager,
checkpointName: checkpointName,
}
if err := stateCheckpoint.restoreState(); err != nil {
//nolint:staticcheck // ST1005 user-facing error message
return nil, fmt.Errorf("could not restore state from checkpoint: %v, please drain this node and delete the memory manager checkpoint file %q before restarting Kubelet",
err, filepath.Join(stateDir, checkpointName))
}
return stateCheckpoint, nil
}
// restores state from a checkpoint and creates it if it doesn't exist
func (sc *stateCheckpoint) restoreState() error {
sc.Lock()
defer sc.Unlock()
var err error
checkpoint := NewMemoryManagerCheckpoint()
if err = sc.checkpointManager.GetCheckpoint(sc.checkpointName, checkpoint); err != nil {
if err == errors.ErrCheckpointNotFound {
return sc.storeState()
}
return err
}
if sc.policyName != checkpoint.PolicyName {
return fmt.Errorf("[memorymanager] configured policy %q differs from state checkpoint policy %q", sc.policyName, checkpoint.PolicyName)
}
sc.cache.SetMachineState(checkpoint.MachineState)
sc.cache.SetMemoryAssignments(checkpoint.Entries)
klog.V(2).InfoS("State checkpoint: restored state from checkpoint")
return nil
}
// saves state to a checkpoint, caller is responsible for locking
func (sc *stateCheckpoint) storeState() error {
checkpoint := NewMemoryManagerCheckpoint()
checkpoint.PolicyName = sc.policyName
checkpoint.MachineState = sc.cache.GetMachineState()
checkpoint.Entries = sc.cache.GetMemoryAssignments()
err := sc.checkpointManager.CreateCheckpoint(sc.checkpointName, checkpoint)
if err != nil {
klog.ErrorS(err, "Could not save checkpoint")
return err
}
return nil
}
// GetMachineState returns the NUMA node map stored in the State
func (sc *stateCheckpoint) GetMachineState() NUMANodeMap {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMachineState()
}
// GetMemoryBlocks returns memory assignments of a container
func (sc *stateCheckpoint) GetMemoryBlocks(podUID string, containerName string) []Block {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMemoryBlocks(podUID, containerName)
}
// GetMemoryAssignments returns ContainerMemoryAssignments
func (sc *stateCheckpoint) GetMemoryAssignments() ContainerMemoryAssignments {
sc.RLock()
defer sc.RUnlock()
return sc.cache.GetMemoryAssignments()
}
// SetMachineState stores NUMANodeMap in State
func (sc *stateCheckpoint) SetMachineState(memoryMap NUMANodeMap) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMachineState(memoryMap)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetMemoryBlocks stores memory assignments of a container
func (sc *stateCheckpoint) SetMemoryBlocks(podUID string, containerName string, blocks []Block) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMemoryBlocks(podUID, containerName, blocks)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// SetMemoryAssignments sets ContainerMemoryAssignments by using the passed parameter
func (sc *stateCheckpoint) SetMemoryAssignments(assignments ContainerMemoryAssignments) {
sc.Lock()
defer sc.Unlock()
sc.cache.SetMemoryAssignments(assignments)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
func (sc *stateCheckpoint) Delete(podUID string, containerName string) {
sc.Lock()
defer sc.Unlock()
sc.cache.Delete(podUID, containerName)
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
// ClearState clears machineState and ContainerMemoryAssignments
func (sc *stateCheckpoint) ClearState() {
sc.Lock()
defer sc.Unlock()
sc.cache.ClearState()
err := sc.storeState()
if err != nil {
klog.InfoS("Store state to checkpoint error", "err", err)
}
}
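A minimal usage sketch of the checkpoint-backed store above, assuming a writable kubelet state directory and the Static policy; the directory, checkpoint name, pod UID, and container name below are placeholders for illustration, not values taken from this file.

// exampleCheckpointStateUsage is illustrative only; it is not part of the original file.
func exampleCheckpointStateUsage() error {
	// The state directory and checkpoint name are assumptions for this sketch.
	st, err := NewCheckpointState("/var/lib/kubelet/memorymanager", "memory_manager_state", "Static")
	if err != nil {
		return err
	}
	// Record one container's memory blocks; the mutation is also persisted via storeState.
	// v1 here refers to k8s.io/api/core/v1.
	st.SetMemoryBlocks("pod-uid-1234", "app", []Block{
		{NUMAAffinity: []int{0}, Type: v1.ResourceMemory, Size: 512 * 1024 * 1024},
	})
	// A kubelet restart with the same policy name would restore this assignment in restoreState.
	blocks := st.GetMemoryBlocks("pod-uid-1234", "app")
	klog.InfoS("Current blocks for container", "count", len(blocks))
	return nil
}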

View File

@ -0,0 +1,123 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package state
import (
"sync"
"k8s.io/klog/v2"
)
type stateMemory struct {
sync.RWMutex
assignments ContainerMemoryAssignments
machineState NUMANodeMap
}
var _ State = &stateMemory{}
// NewMemoryState creates a new State for keeping track of memory/pod assignments
func NewMemoryState() State {
klog.InfoS("Initializing new in-memory state store")
return &stateMemory{
assignments: ContainerMemoryAssignments{},
machineState: NUMANodeMap{},
}
}
// GetMachineState returns the NUMA node memory map stored in the State
func (s *stateMemory) GetMachineState() NUMANodeMap {
s.RLock()
defer s.RUnlock()
return s.machineState.Clone()
}
// GetMemoryBlocks returns memory assignments of a container
func (s *stateMemory) GetMemoryBlocks(podUID string, containerName string) []Block {
s.RLock()
defer s.RUnlock()
if res, ok := s.assignments[podUID][containerName]; ok {
return append([]Block{}, res...)
}
return nil
}
// GetMemoryAssignments returns ContainerMemoryAssignments
func (s *stateMemory) GetMemoryAssignments() ContainerMemoryAssignments {
s.RLock()
defer s.RUnlock()
return s.assignments.Clone()
}
// SetMachineState stores NUMANodeMap in State
func (s *stateMemory) SetMachineState(nodeMap NUMANodeMap) {
s.Lock()
defer s.Unlock()
s.machineState = nodeMap.Clone()
klog.InfoS("Updated machine memory state")
}
// SetMemoryBlocks stores the memory assignments of a container
func (s *stateMemory) SetMemoryBlocks(podUID string, containerName string, blocks []Block) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
s.assignments[podUID] = map[string][]Block{}
}
s.assignments[podUID][containerName] = append([]Block{}, blocks...)
klog.InfoS("Updated memory state", "podUID", podUID, "containerName", containerName)
}
// SetMemoryAssignments replaces the stored ContainerMemoryAssignments with the passed value
func (s *stateMemory) SetMemoryAssignments(assignments ContainerMemoryAssignments) {
s.Lock()
defer s.Unlock()
s.assignments = assignments.Clone()
}
// Delete deletes corresponding Blocks from ContainerMemoryAssignments
func (s *stateMemory) Delete(podUID string, containerName string) {
s.Lock()
defer s.Unlock()
if _, ok := s.assignments[podUID]; !ok {
return
}
delete(s.assignments[podUID], containerName)
if len(s.assignments[podUID]) == 0 {
delete(s.assignments, podUID)
}
klog.V(2).InfoS("Deleted memory assignment", "podUID", podUID, "containerName", containerName)
}
// ClearState clears machineState and ContainerMemoryAssignments
func (s *stateMemory) ClearState() {
s.Lock()
defer s.Unlock()
s.machineState = NUMANodeMap{}
s.assignments = make(ContainerMemoryAssignments)
klog.V(2).InfoS("Cleared state")
}
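A short sketch of the in-memory store used on its own, as the checkpoint-backed store does internally through its cache field; the pod UID and container name are placeholders.

// exampleInMemoryStateUsage is illustrative only; it is not part of the original file.
func exampleInMemoryStateUsage() {
	s := NewMemoryState()
	s.SetMemoryBlocks("pod-uid-5678", "sidecar", []Block{
		// v1 here refers to k8s.io/api/core/v1.
		{NUMAAffinity: []int{0, 1}, Type: v1.ResourceMemory, Size: 1 << 30},
	})
	// Getters hand back copies, so callers cannot mutate the store through the returned values.
	assignments := s.GetMemoryAssignments()
	klog.InfoS("Assignments snapshot", "pods", len(assignments))
	s.Delete("pod-uid-5678", "sidecar")
}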

View File

@ -0,0 +1,328 @@
//go:build linux
// +build linux
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cm
import (
"errors"
"fmt"
"strconv"
"strings"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
kubefeatures "k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)
const (
defaultNodeAllocatableCgroupName = "kubepods"
)
// createNodeAllocatableCgroups creates the Node Allocatable cgroup when the CgroupsPerQOS flag is set to true
func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
nodeAllocatable := cm.internalCapacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
nc := cm.NodeConfig.NodeAllocatableConfig
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
}
cgroupConfig := &CgroupConfig{
Name: cm.cgroupRoot,
// The default limits for cpu shares can be very low, which can lead to CPU starvation for pods.
ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
}
if cm.cgroupManager.Exists(cgroupConfig.Name) {
return nil
}
if err := cm.cgroupManager.Create(cgroupConfig); err != nil {
klog.ErrorS(err, "Failed to create cgroup", "cgroupName", cm.cgroupRoot)
return err
}
return nil
}
// enforceNodeAllocatableCgroups enforces Node Allocatable cgroup settings.
func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
nc := cm.NodeConfig.NodeAllocatableConfig
// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm.internalCapacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
}
klog.V(4).InfoS("Attempting to enforce Node Allocatable", "config", nc)
cgroupConfig := &CgroupConfig{
Name: cm.cgroupRoot,
ResourceParameters: cm.getCgroupConfig(nodeAllocatable, false),
}
// Using ObjectReference for events as the node may not be cached; refer to #42701 for details.
nodeRef := nodeRefFromNode(cm.nodeInfo.Name)
// If Node Allocatable is enforced on a node that has not been drained or is updated on an existing node to a lower value,
// existing memory usage across pods might be higher than current Node Allocatable Memory Limits.
// Pod Evictions are expected to bring down memory usage to below Node Allocatable limits.
// Until evictions happen, retry cgroup updates.
// Update limits on non root cgroup-root to be safe since the default limits for CPU can be too low.
// Check if cgroupRoot is set to a non-empty value (empty would be the root container)
if len(cm.cgroupRoot) > 0 {
go func() {
for {
err := cm.cgroupManager.Update(cgroupConfig)
if err == nil {
cm.recorder.Event(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated Node Allocatable limit across pods")
return
}
message := fmt.Sprintf("Failed to update Node Allocatable Limits %q: %v", cm.cgroupRoot, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
time.Sleep(time.Minute)
}
}()
}
// Now apply kube reserved and system reserved limits if required.
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedEnforcementKey) {
klog.V(2).InfoS("Enforcing system reserved on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, false); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedEnforcementKey) {
klog.V(2).InfoS("Enforcing kube reserved on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, false); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.SystemReservedCompressibleEnforcementKey) {
klog.V(2).InfoS("Enforcing system reserved compressible on cgroup", "cgroupName", nc.SystemReservedCgroupName, "limits", nc.SystemReserved)
if err := cm.enforceExistingCgroup(nc.SystemReservedCgroupName, nc.SystemReserved, true); err != nil {
message := fmt.Sprintf("Failed to enforce System Reserved Compressible Cgroup Limits on %q: %v", nc.SystemReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on system reserved cgroup %v", nc.SystemReservedCgroupName)
}
if nc.EnforceNodeAllocatable.Has(kubetypes.KubeReservedCompressibleEnforcementKey) {
klog.V(2).InfoS("Enforcing kube reserved compressible on cgroup", "cgroupName", nc.KubeReservedCgroupName, "limits", nc.KubeReserved)
if err := cm.enforceExistingCgroup(nc.KubeReservedCgroupName, nc.KubeReserved, true); err != nil {
message := fmt.Sprintf("Failed to enforce Kube Reserved Compressible Cgroup Limits on %q: %v", nc.KubeReservedCgroupName, err)
cm.recorder.Event(nodeRef, v1.EventTypeWarning, events.FailedNodeAllocatableEnforcement, message)
return errors.New(message)
}
cm.recorder.Eventf(nodeRef, v1.EventTypeNormal, events.SuccessfulNodeAllocatableEnforcement, "Updated limits on kube reserved cgroup %v", nc.KubeReservedCgroupName)
}
return nil
}
// enforceExistingCgroup updates the limits `rl` on the existing cgroup `cName` using the `cgroupManager` interface.
func (cm *containerManagerImpl) enforceExistingCgroup(cNameStr string, rl v1.ResourceList, compressibleResources bool) error {
cName := cm.cgroupManager.CgroupName(cNameStr)
rp := cm.getCgroupConfig(rl, compressibleResources)
if rp == nil {
return fmt.Errorf("%q cgroup is not configured properly", cName)
}
// Enforce MemoryQoS for cgroups of kube-reserved/system-reserved. For more information,
// see https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2570-memory-qos
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.MemoryQoS) {
if rp.Memory != nil {
if rp.Unified == nil {
rp.Unified = make(map[string]string)
}
rp.Unified[Cgroup2MemoryMin] = strconv.FormatInt(*rp.Memory, 10)
}
}
cgroupConfig := &CgroupConfig{
Name: cName,
ResourceParameters: rp,
}
klog.V(4).InfoS("Enforcing limits on cgroup", "cgroupName", cName, "cpuShares", cgroupConfig.ResourceParameters.CPUShares, "memory", cgroupConfig.ResourceParameters.Memory, "pidsLimit", cgroupConfig.ResourceParameters.PidsLimit)
if err := cm.cgroupManager.Validate(cgroupConfig.Name); err != nil {
return err
}
if err := cm.cgroupManager.Update(cgroupConfig); err != nil {
return err
}
return nil
}
// getCgroupConfig returns a ResourceConfig object that can be used to create or update cgroups via CgroupManager interface.
func (cm *containerManagerImpl) getCgroupConfig(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
rc := getCgroupConfigInternal(rl, compressibleResourcesOnly)
if rc == nil {
return nil
}
// In the case of a None policy, cgroupv2 and systemd cgroup manager, we must make sure systemd is aware of the cpuset cgroup.
// By default, systemd will not create it, as we've not chosen to delegate it, and we haven't included it in the Apply() request.
// However, this causes a bug where kubelet restarts unnecessarily (cpuset cgroup is created in the cgroupfs, but systemd
// doesn't know about it and deletes it, and then kubelet doesn't continue because the cgroup isn't configured as expected).
// An alternative is to delegate the `cpuset` cgroup to the kubelet, but that would require some plumbing in libcontainer,
// and this is sufficient.
// Only do so on None policy, as Static policy will do its own updating of the cpuset.
// Please see the comment on policy none's GetAllocatableCPUs
if cm.cpuManager.GetAllocatableCPUs().IsEmpty() {
rc.CPUSet = cm.cpuManager.GetAllCPUs()
}
return rc
}
// getCgroupConfigInternal contains the pieces of getCgroupConfig that don't require the cm object.
// It exists so the logic can be unit tested without constructing a full containerManager.
func getCgroupConfigInternal(rl v1.ResourceList, compressibleResourcesOnly bool) *ResourceConfig {
// TODO(vishh): Set CPU Quota if necessary.
if rl == nil {
return nil
}
var rc ResourceConfig
setCompressibleResources := func() {
if q, exists := rl[v1.ResourceCPU]; exists {
// CPU is defined in milli-cores.
val := MilliCPUToShares(q.MilliValue())
rc.CPUShares = &val
}
}
// Only return compressible resources
if compressibleResourcesOnly {
setCompressibleResources()
} else {
if q, exists := rl[v1.ResourceMemory]; exists {
// Memory is defined in bytes.
val := q.Value()
rc.Memory = &val
}
setCompressibleResources()
if q, exists := rl[pidlimit.PIDs]; exists {
val := q.Value()
rc.PidsLimit = &val
}
rc.HugePageLimit = HugePageLimits(rl)
}
return &rc
}
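An illustrative sketch, not part of the original file, of what getCgroupConfigInternal produces for a given ResourceList; the quantities are assumptions chosen for the example.

// exampleGetCgroupConfigInternal is illustrative only; it is not part of the original file.
func exampleGetCgroupConfigInternal() {
	rl := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("500m"),
		v1.ResourceMemory: resource.MustParse("1Gi"),
	}
	// Full config: CPU shares derived from 500 milli-cores plus a 1Gi memory limit in bytes.
	full := getCgroupConfigInternal(rl, false)
	// Compressible-only config: just CPU shares; memory limits are deliberately left unset.
	cpuOnly := getCgroupConfigInternal(rl, true)
	klog.InfoS("Example cgroup configs",
		"cpuShares", *full.CPUShares,
		"memoryBytes", *full.Memory,
		"compressibleOnlyMemoryIsNil", cpuOnly.Memory == nil)
}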
// GetNodeAllocatableAbsolute returns the absolute value of Node Allocatable which is primarily useful for enforcement.
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) GetNodeAllocatableAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
}
func (cm *containerManagerImpl) getNodeAllocatableAbsoluteImpl(capacity v1.ResourceList) v1.ResourceList {
result := make(v1.ResourceList)
for k, v := range capacity {
value := v.DeepCopy()
if cm.NodeConfig.SystemReserved != nil {
value.Sub(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Sub(cm.NodeConfig.KubeReserved[k])
}
if value.Sign() < 0 {
// Negative Allocatable resources don't make sense.
value.Set(0)
}
result[k] = value
}
return result
}
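A worked sketch, not from the original file, of the subtraction getNodeAllocatableAbsoluteImpl performs for a single resource; the quantities are assumptions.

// exampleAllocatableMemoryArithmetic is illustrative only; it is not part of the original file.
func exampleAllocatableMemoryArithmetic() {
	capacity := resource.MustParse("8Gi")     // 8589934592 bytes
	capacity.Sub(resource.MustParse("500Mi")) // system-reserved
	capacity.Sub(resource.MustParse("1Gi"))   // kube-reserved
	// 8589934592 - 524288000 - 1073741824 = 6991904768 bytes of allocatable memory.
	klog.InfoS("Example allocatable memory", "bytes", capacity.Value())
}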
// getNodeAllocatableInternalAbsolute is similar to getNodeAllocatableAbsolute except that
// it also includes internal resources (currently process IDs). It is intended for setting
// up top level cgroups only.
func (cm *containerManagerImpl) getNodeAllocatableInternalAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.internalCapacity)
}
// GetNodeAllocatableReservation returns the amount of compute and storage resources that have to be reserved on this node, i.e. withheld from scheduling.
func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
result := make(v1.ResourceList)
for k := range cm.capacity {
value := resource.NewQuantity(0, resource.DecimalSI)
if cm.NodeConfig.SystemReserved != nil {
value.Add(cm.NodeConfig.SystemReserved[k])
}
if cm.NodeConfig.KubeReserved != nil {
value.Add(cm.NodeConfig.KubeReserved[k])
}
if evictionReservation != nil {
value.Add(evictionReservation[k])
}
if !value.IsZero() {
result[k] = *value
}
}
return result
}
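Conversely, a sketch, not part of the original file, of the sum GetNodeAllocatableReservation reports for memory, assuming 500Mi system-reserved, 1Gi kube-reserved, and a 100Mi hard-eviction threshold.

// exampleAllocatableReservationArithmetic is illustrative only; it is not part of the original file.
func exampleAllocatableReservationArithmetic() {
	reservation := resource.NewQuantity(0, resource.DecimalSI)
	reservation.Add(resource.MustParse("500Mi")) // system-reserved
	reservation.Add(resource.MustParse("1Gi"))   // kube-reserved
	reservation.Add(resource.MustParse("100Mi")) // memory hard-eviction threshold
	// 524288000 + 1073741824 + 104857600 = 1702887424 bytes withheld from scheduling.
	klog.InfoS("Example memory reservation", "bytes", reservation.Value())
}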
// validateNodeAllocatable ensures that the user-specified Node Allocatable configuration doesn't reserve more than the node's capacity.
// Returns an error if the configuration is invalid, nil otherwise.
func (cm *containerManagerImpl) validateNodeAllocatable() error {
var errors []string
nar := cm.GetNodeAllocatableReservation()
for k, v := range nar {
value := cm.capacity[k].DeepCopy()
value.Sub(v)
if value.Sign() < 0 {
errors = append(errors, fmt.Sprintf("Resource %q has a reservation of %v but capacity of %v. Expected capacity >= reservation.", k, v, cm.capacity[k]))
}
}
if len(errors) > 0 {
return fmt.Errorf("invalid Node Allocatable configuration. %s", strings.Join(errors, " "))
}
return nil
}
// Using ObjectReference for events as the node may not be cached; refer to #42701 for details.
func nodeRefFromNode(nodeName string) *v1.ObjectReference {
return &v1.ObjectReference{
Kind: "Node",
Name: nodeName,
UID: types.UID(nodeName),
Namespace: "",
}
}

Some files were not shown because too many files have changed in this diff.