build: move e2e dependencies into e2e/go.mod

Several packages are only used while running the e2e suite. These
packages are less important to update, as they cannot influence the
final executable that is part of the Ceph-CSI container image.

By moving these dependencies out of the main Ceph-CSI go.mod, it is
easier to identify whether a reported CVE affects Ceph-CSI itself or
only the test suite (as is the case for most Kubernetes CVEs).

Signed-off-by: Niels de Vos <ndevos@ibm.com>
Author: Niels de Vos
Date: 2025-03-04 08:57:28 +01:00
Committed by: mergify[bot]
Parent: 15da101b1b
Commit: bec6090996
8047 changed files with 1407827 additions and 3453 deletions
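
As an illustration of the split (not part of the diff below), the e2e-only
dependencies now live in their own module under e2e/. A minimal sketch of
such an e2e/go.mod, with a hypothetical module path and placeholder versions:

    module github.com/ceph/ceph-csi/e2e // hypothetical path of the test-only module

    go 1.23 // placeholder toolchain version

    require (
        // e2e-only dependencies; versions are placeholders
        github.com/onsi/ginkgo/v2 v2.22.0
        k8s.io/kubernetes v1.32.0
    )

With the modules separated, running "go mod why -m k8s.io/kubernetes" from the
repository root versus from e2e/ shows directly whether a flagged module can
reach the Ceph-CSI binary at all.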

e2e/vendor/k8s.io/kubernetes/pkg/scheduler/OWNERS (8 lines, generated, vendored, new file)

@@ -0,0 +1,8 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-scheduling-maintainers
reviewers:
- sig-scheduling
labels:
- sig/scheduling


@@ -0,0 +1,11 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- api-approvers
reviewers:
- api-reviewers
- sig-scheduling-api-reviewers
- sig-scheduling-api-approvers
labels:
- kind/api-change
- sig/scheduling


@@ -0,0 +1,20 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// +k8s:deepcopy-gen=package
// +groupName=kubescheduler.config.k8s.io
package config // import "k8s.io/kubernetes/pkg/scheduler/apis/config"


@@ -0,0 +1,50 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
)
// GroupName is the group name used in this package
const GroupName = "kubescheduler.config.k8s.io"
// SchemeGroupVersion is group version used to register these objects
var SchemeGroupVersion = schema.GroupVersion{Group: GroupName, Version: runtime.APIVersionInternal}
var (
// SchemeBuilder is the scheme builder with scheme init functions to run for this API package
SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
// AddToScheme is a global function that registers this API group & version to a scheme
AddToScheme = SchemeBuilder.AddToScheme
)
// addKnownTypes registers known types to the given scheme
func addKnownTypes(scheme *runtime.Scheme) error {
scheme.AddKnownTypes(SchemeGroupVersion,
&KubeSchedulerConfiguration{},
&DefaultPreemptionArgs{},
&InterPodAffinityArgs{},
&NodeResourcesFitArgs{},
&PodTopologySpreadArgs{},
&VolumeBindingArgs{},
&NodeResourcesBalancedAllocationArgs{},
&NodeAffinityArgs{},
)
return nil
}


@@ -0,0 +1,46 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheme
import (
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/serializer"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
config "k8s.io/kubernetes/pkg/scheduler/apis/config"
configv1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"
)
var (
// Scheme is the runtime.Scheme to which all kubescheduler api types are registered.
Scheme = runtime.NewScheme()
// Codecs provides access to encoding and decoding for the scheme.
Codecs = serializer.NewCodecFactory(Scheme, serializer.EnableStrict)
)
func init() {
AddToScheme(Scheme)
}
// AddToScheme builds the kubescheduler scheme using all known versions of the kubescheduler api.
func AddToScheme(scheme *runtime.Scheme) {
utilruntime.Must(config.AddToScheme(scheme))
utilruntime.Must(configv1.AddToScheme(scheme))
utilruntime.Must(scheme.SetVersionPriority(
configv1.SchemeGroupVersion,
))
}


@@ -0,0 +1,336 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
"math"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
componentbaseconfig "k8s.io/component-base/config"
)
const (
// DefaultKubeSchedulerPort is the default port for the scheduler status server.
// May be overridden by a flag at startup.
DefaultKubeSchedulerPort = 10259
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// KubeSchedulerConfiguration configures a scheduler
type KubeSchedulerConfiguration struct {
// TypeMeta contains the API version and kind. In kube-scheduler, after
// conversion from the versioned KubeSchedulerConfiguration type to this
// internal type, we set the APIVersion field to the scheme group/version of
// the type we converted from. This is done in cmd/kube-scheduler in two
// places: (1) when loading config from a file, (2) generating the default
// config. Based on the versioned type set in this field, we make decisions;
// for example (1) during validation to check for usage of removed plugins,
// (2) writing config to a file, (3) initialising the scheduler.
metav1.TypeMeta
// Parallelism defines the amount of parallelism in algorithms for scheduling a Pods. Must be greater than 0. Defaults to 16
Parallelism int32
// LeaderElection defines the configuration of leader election client.
LeaderElection componentbaseconfig.LeaderElectionConfiguration
// ClientConnection specifies the kubeconfig file and client connection
// settings for the proxy server to use when communicating with the apiserver.
ClientConnection componentbaseconfig.ClientConnectionConfiguration
// DebuggingConfiguration holds configuration for Debugging related features
// TODO: We might wanna make this a substruct like Debugging componentbaseconfig.DebuggingConfiguration
componentbaseconfig.DebuggingConfiguration
// PercentageOfNodesToScore is the percentage of all nodes that once found feasible
// for running a pod, the scheduler stops its search for more feasible nodes in
// the cluster. This helps improve scheduler's performance. Scheduler always tries to find
// at least "minFeasibleNodesToFind" feasible nodes no matter what the value of this flag is.
// Example: if the cluster size is 500 nodes and the value of this flag is 30,
// then scheduler stops finding further feasible nodes once it finds 150 feasible ones.
// When the value is 0, default percentage (5%--50% based on the size of the cluster) of the
// nodes will be scored. It is overridden by profile level PercentageOfNodesToScore.
PercentageOfNodesToScore *int32
// PodInitialBackoffSeconds is the initial backoff for unschedulable pods.
// If specified, it must be greater than 0. If this value is null, the default value (1s)
// will be used.
PodInitialBackoffSeconds int64
// PodMaxBackoffSeconds is the max backoff for unschedulable pods.
// If specified, it must be greater than or equal to podInitialBackoffSeconds. If this value is null,
// the default value (10s) will be used.
PodMaxBackoffSeconds int64
// Profiles are scheduling profiles that kube-scheduler supports. Pods can
// choose to be scheduled under a particular profile by setting its associated
// scheduler name. Pods that don't specify any scheduler name are scheduled
// with the "default-scheduler" profile, if present here.
Profiles []KubeSchedulerProfile
// Extenders are the list of scheduler extenders, each holding the values of how to communicate
// with the extender. These extenders are shared by all scheduler profiles.
Extenders []Extender
// DelayCacheUntilActive specifies when to start caching. If this is true and leader election is enabled,
// the scheduler will wait to fill informer caches until it is the leader. Doing so will have slower
// failover with the benefit of lower memory overhead while waiting to become leader.
// Defaults to false.
DelayCacheUntilActive bool
}
// KubeSchedulerProfile is a scheduling profile.
type KubeSchedulerProfile struct {
// SchedulerName is the name of the scheduler associated to this profile.
// If SchedulerName matches with the pod's "spec.schedulerName", then the pod
// is scheduled with this profile.
SchedulerName string
// PercentageOfNodesToScore is the percentage of all nodes that once found feasible
// for running a pod, the scheduler stops its search for more feasible nodes in
// the cluster. This helps improve scheduler's performance. Scheduler always tries to find
// at least "minFeasibleNodesToFind" feasible nodes no matter what the value of this flag is.
// Example: if the cluster size is 500 nodes and the value of this flag is 30,
// then scheduler stops finding further feasible nodes once it finds 150 feasible ones.
// When the value is 0, default percentage (5%--50% based on the size of the cluster) of the
// nodes will be scored. It will override global PercentageOfNodesToScore. If it is empty,
// global PercentageOfNodesToScore will be used.
PercentageOfNodesToScore *int32
// Plugins specify the set of plugins that should be enabled or disabled.
// Enabled plugins are the ones that should be enabled in addition to the
// default plugins. Disabled plugins are any of the default plugins that
// should be disabled.
// When no enabled or disabled plugin is specified for an extension point,
// default plugins for that extension point will be used if there is any.
// If a QueueSort plugin is specified, the same QueueSort Plugin and
// PluginConfig must be specified for all profiles.
Plugins *Plugins
// PluginConfig is an optional set of custom plugin arguments for each plugin.
// Omitting config args for a plugin is equivalent to using the default config
// for that plugin.
PluginConfig []PluginConfig
}
// Plugins include multiple extension points. When specified, the list of plugins for
// a particular extension point are the only ones enabled. If an extension point is
// omitted from the config, then the default set of plugins is used for that extension point.
// Enabled plugins are called in the order specified here, after default plugins. If they need to
// be invoked before default plugins, default plugins must be disabled and re-enabled here in desired order.
type Plugins struct {
// PreEnqueue is a list of plugins that should be invoked before adding pods to the scheduling queue.
PreEnqueue PluginSet
// QueueSort is a list of plugins that should be invoked when sorting pods in the scheduling queue.
QueueSort PluginSet
// PreFilter is a list of plugins that should be invoked at "PreFilter" extension point of the scheduling framework.
PreFilter PluginSet
// Filter is a list of plugins that should be invoked when filtering out nodes that cannot run the Pod.
Filter PluginSet
// PostFilter is a list of plugins that are invoked after filtering phase, but only when no feasible nodes were found for the pod.
PostFilter PluginSet
// PreScore is a list of plugins that are invoked before scoring.
PreScore PluginSet
// Score is a list of plugins that should be invoked when ranking nodes that have passed the filtering phase.
Score PluginSet
// Reserve is a list of plugins invoked when reserving/unreserving resources
// after a node is assigned to run the pod.
Reserve PluginSet
// Permit is a list of plugins that control binding of a Pod. These plugins can prevent or delay binding of a Pod.
Permit PluginSet
// PreBind is a list of plugins that should be invoked before a pod is bound.
PreBind PluginSet
// Bind is a list of plugins that should be invoked at "Bind" extension point of the scheduling framework.
// The scheduler call these plugins in order. Scheduler skips the rest of these plugins as soon as one returns success.
Bind PluginSet
// PostBind is a list of plugins that should be invoked after a pod is successfully bound.
PostBind PluginSet
// MultiPoint is a simplified config field for enabling plugins for all valid extension points
MultiPoint PluginSet
}
// PluginSet specifies enabled and disabled plugins for an extension point.
// If an array is empty, missing, or nil, default plugins at that extension point will be used.
type PluginSet struct {
// Enabled specifies plugins that should be enabled in addition to default plugins.
// These are called after default plugins and in the same order specified here.
Enabled []Plugin
// Disabled specifies default plugins that should be disabled.
// When all default plugins need to be disabled, an array containing only one "*" should be provided.
Disabled []Plugin
}
// Plugin specifies a plugin name and its weight when applicable. Weight is used only for Score plugins.
type Plugin struct {
// Name defines the name of plugin
Name string
// Weight defines the weight of plugin, only used for Score plugins.
Weight int32
}
// PluginConfig specifies arguments that should be passed to a plugin at the time of initialization.
// A plugin that is invoked at multiple extension points is initialized once. Args can have arbitrary structure.
// It is up to the plugin to process these Args.
type PluginConfig struct {
// Name defines the name of plugin being configured
Name string
// Args defines the arguments passed to the plugins at the time of initialization. Args can have arbitrary structure.
Args runtime.Object
}
/*
* NOTE: The following variables and methods are intentionally left out of the staging mirror.
*/
const (
// DefaultPercentageOfNodesToScore defines the percentage of nodes of all nodes
// that once found feasible, the scheduler stops looking for more nodes.
// A value of 0 means adaptive, meaning the scheduler figures out a proper default.
DefaultPercentageOfNodesToScore = 0
// MaxCustomPriorityScore is the max score UtilizationShapePoint expects.
MaxCustomPriorityScore int64 = 10
// MaxTotalScore is the maximum total score.
MaxTotalScore int64 = math.MaxInt64
// MaxWeight defines the max weight value allowed for custom PriorityPolicy
MaxWeight = MaxTotalScore / MaxCustomPriorityScore
)
// Names returns the list of enabled plugin names.
func (p *Plugins) Names() []string {
if p == nil {
return nil
}
extensions := []PluginSet{
p.PreEnqueue,
p.PreFilter,
p.Filter,
p.PostFilter,
p.Reserve,
p.PreScore,
p.Score,
p.PreBind,
p.Bind,
p.PostBind,
p.Permit,
p.QueueSort,
}
n := sets.New[string]()
for _, e := range extensions {
for _, pg := range e.Enabled {
n.Insert(pg.Name)
}
}
return sets.List(n)
}
// Extender holds the parameters used to communicate with the extender. If a verb is unspecified/empty,
// it is assumed that the extender chose not to provide that extension.
type Extender struct {
// URLPrefix at which the extender is available
URLPrefix string
// Verb for the filter call, empty if not supported. This verb is appended to the URLPrefix when issuing the filter call to extender.
FilterVerb string
// Verb for the preempt call, empty if not supported. This verb is appended to the URLPrefix when issuing the preempt call to extender.
PreemptVerb string
// Verb for the prioritize call, empty if not supported. This verb is appended to the URLPrefix when issuing the prioritize call to extender.
PrioritizeVerb string
// The numeric multiplier for the node scores that the prioritize call generates.
// The weight should be a positive integer
Weight int64
// Verb for the bind call, empty if not supported. This verb is appended to the URLPrefix when issuing the bind call to extender.
// If this method is implemented by the extender, it is the extender's responsibility to bind the pod to apiserver. Only one extender
// can implement this function.
BindVerb string
// EnableHTTPS specifies whether https should be used to communicate with the extender
EnableHTTPS bool
// TLSConfig specifies the transport layer security config
TLSConfig *ExtenderTLSConfig
// HTTPTimeout specifies the timeout duration for a call to the extender. Filter timeout fails the scheduling of the pod. Prioritize
// timeout is ignored, k8s/other extenders priorities are used to select the node.
HTTPTimeout metav1.Duration
// NodeCacheCapable specifies that the extender is capable of caching node information,
// so the scheduler should only send minimal information about the eligible nodes
// assuming that the extender already cached full details of all nodes in the cluster
NodeCacheCapable bool
// ManagedResources is a list of extended resources that are managed by
// this extender.
// - A pod will be sent to the extender on the Filter, Prioritize and Bind
// (if the extender is the binder) phases iff the pod requests at least
// one of the extended resources in this list. If empty or unspecified,
// all pods will be sent to this extender.
// - If IgnoredByScheduler is set to true for a resource, kube-scheduler
// will skip checking the resource in predicates.
// +optional
ManagedResources []ExtenderManagedResource
// Ignorable specifies if the extender is ignorable, i.e. scheduling should not
// fail when the extender returns an error or is not reachable.
Ignorable bool
}
// ExtenderManagedResource describes the arguments of extended resources
// managed by an extender.
type ExtenderManagedResource struct {
// Name is the extended resource name.
Name string
// IgnoredByScheduler indicates whether kube-scheduler should ignore this
// resource when applying predicates.
IgnoredByScheduler bool
}
// ExtenderTLSConfig contains settings to enable TLS with extender
type ExtenderTLSConfig struct {
// Server should be accessed without verifying the TLS certificate. For testing only.
Insecure bool
// ServerName is passed to the server for SNI and is used in the client to check server
// certificates against. If ServerName is empty, the hostname used to contact the
// server is used.
ServerName string
// Server requires TLS client certificate authentication
CertFile string
// Server requires TLS client certificate authentication
KeyFile string
// Trusted root certificates for server
CAFile string
// CertData holds PEM-encoded bytes (typically read from a client certificate file).
// CertData takes precedence over CertFile
CertData []byte
// KeyData holds PEM-encoded bytes (typically read from a client certificate key file).
// KeyData takes precedence over KeyFile
KeyData []byte `datapolicy:"security-key"`
// CAData holds PEM-encoded bytes (typically read from a root certificates bundle).
// CAData takes precedence over CAFile
CAData []byte
}


@@ -0,0 +1,218 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package config
import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// DefaultPreemptionArgs holds arguments used to configure the
// DefaultPreemption plugin.
type DefaultPreemptionArgs struct {
metav1.TypeMeta
// MinCandidateNodesPercentage is the minimum number of candidates to
// shortlist when dry running preemption as a percentage of number of nodes.
// Must be in the range [0, 100]. Defaults to 10% of the cluster size if
// unspecified.
MinCandidateNodesPercentage int32
// MinCandidateNodesAbsolute is the absolute minimum number of candidates to
// shortlist. The likely number of candidates enumerated for dry running
// preemption is given by the formula:
// numCandidates = max(numNodes * minCandidateNodesPercentage, minCandidateNodesAbsolute)
// We say "likely" because there are other factors such as PDB violations
// that play a role in the number of candidates shortlisted. Must be at least
// 0 nodes. Defaults to 100 nodes if unspecified.
MinCandidateNodesAbsolute int32
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// InterPodAffinityArgs holds arguments used to configure the InterPodAffinity plugin.
type InterPodAffinityArgs struct {
metav1.TypeMeta
// HardPodAffinityWeight is the scoring weight for existing pods with a
// matching hard affinity to the incoming pod.
HardPodAffinityWeight int32
// IgnorePreferredTermsOfExistingPods configures the scheduler to ignore existing pods' preferred affinity
// rules when scoring candidate nodes, unless the incoming pod has inter-pod affinities.
IgnorePreferredTermsOfExistingPods bool
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// NodeResourcesFitArgs holds arguments used to configure the NodeResourcesFit plugin.
type NodeResourcesFitArgs struct {
metav1.TypeMeta
// IgnoredResources is the list of resources that NodeResources fit filter
// should ignore.
IgnoredResources []string
// IgnoredResourceGroups defines the list of resource groups that NodeResources fit filter should ignore.
// e.g. if group is ["example.com"], it will ignore all resource names that begin
// with "example.com", such as "example.com/aaa" and "example.com/bbb".
// A resource group name can't contain '/'.
IgnoredResourceGroups []string
// ScoringStrategy selects the node resource scoring strategy.
ScoringStrategy *ScoringStrategy
}
// PodTopologySpreadConstraintsDefaulting defines how to set default constraints
// for the PodTopologySpread plugin.
type PodTopologySpreadConstraintsDefaulting string
const (
// SystemDefaulting instructs to use the kubernetes defined default.
SystemDefaulting PodTopologySpreadConstraintsDefaulting = "System"
// ListDefaulting instructs to use the config provided default.
ListDefaulting PodTopologySpreadConstraintsDefaulting = "List"
)
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// PodTopologySpreadArgs holds arguments used to configure the PodTopologySpread plugin.
type PodTopologySpreadArgs struct {
metav1.TypeMeta
// DefaultConstraints defines topology spread constraints to be applied to
// Pods that don't define any in `pod.spec.topologySpreadConstraints`.
// `.defaultConstraints[*].labelSelectors` must be empty, as they are
// deduced from the Pod's membership to Services, ReplicationControllers,
// ReplicaSets or StatefulSets.
// When not empty, .defaultingType must be "List".
DefaultConstraints []v1.TopologySpreadConstraint
// DefaultingType determines how .defaultConstraints are deduced. Can be one
// of "System" or "List".
//
// - "System": Use kubernetes defined constraints that spread Pods among
// Nodes and Zones.
// - "List": Use constraints defined in .defaultConstraints.
//
// Defaults to "System".
// +optional
DefaultingType PodTopologySpreadConstraintsDefaulting
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// NodeResourcesBalancedAllocationArgs holds arguments used to configure NodeResourcesBalancedAllocation plugin.
type NodeResourcesBalancedAllocationArgs struct {
metav1.TypeMeta
// Resources to be considered when scoring.
// The default resource set includes "cpu" and "memory", only valid weight is 1.
Resources []ResourceSpec
}
// UtilizationShapePoint represents a single point of a priority function shape.
type UtilizationShapePoint struct {
// Utilization (x axis). Valid values are 0 to 100. Fully utilized node maps to 100.
Utilization int32
// Score assigned to a given utilization (y axis). Valid values are 0 to 10.
Score int32
}
// ResourceSpec represents single resource.
type ResourceSpec struct {
// Name of the resource.
Name string
// Weight of the resource.
Weight int64
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// VolumeBindingArgs holds arguments used to configure the VolumeBinding plugin.
type VolumeBindingArgs struct {
metav1.TypeMeta
// BindTimeoutSeconds is the timeout in seconds in volume binding operation.
// Value must be non-negative integer. The value zero indicates no waiting.
// If this value is nil, the default value will be used.
BindTimeoutSeconds int64
// Shape specifies the points defining the score function shape, which is
// used to score nodes based on the utilization of statically provisioned
// PVs. The utilization is calculated by dividing the total requested
// storage of the pod by the total capacity of feasible PVs on each node.
// Each point contains utilization (ranges from 0 to 100) and its
// associated score (ranges from 0 to 10). You can turn the priority by
// specifying different scores for different utilization numbers.
// The default shape points are:
// 1) 0 for 0 utilization
// 2) 10 for 100 utilization
// All points must be sorted in increasing order by utilization.
// +featureGate=VolumeCapacityPriority
// +optional
Shape []UtilizationShapePoint
}
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
// NodeAffinityArgs holds arguments to configure the NodeAffinity plugin.
type NodeAffinityArgs struct {
metav1.TypeMeta
// AddedAffinity is applied to all Pods additionally to the NodeAffinity
// specified in the PodSpec. That is, Nodes need to satisfy AddedAffinity
// AND .spec.NodeAffinity. AddedAffinity is empty by default (all Nodes
// match).
// When AddedAffinity is used, some Pods with affinity requirements that match
// a specific Node (such as Daemonset Pods) might remain unschedulable.
AddedAffinity *v1.NodeAffinity
}
// ScoringStrategyType the type of scoring strategy used in NodeResourcesFit plugin.
type ScoringStrategyType string
const (
// LeastAllocated strategy prioritizes nodes with least allocated resources.
LeastAllocated ScoringStrategyType = "LeastAllocated"
// MostAllocated strategy prioritizes nodes with most allocated resources.
MostAllocated ScoringStrategyType = "MostAllocated"
// RequestedToCapacityRatio strategy allows specifying a custom shape function
// to score nodes based on the request to capacity ratio.
RequestedToCapacityRatio ScoringStrategyType = "RequestedToCapacityRatio"
)
// ScoringStrategy define ScoringStrategyType for node resource plugin
type ScoringStrategy struct {
// Type selects which strategy to run.
Type ScoringStrategyType
// Resources to consider when scoring.
// The default resource set includes "cpu" and "memory" with an equal weight.
// Allowed weights go from 1 to 100.
// Weight defaults to 1 if not specified or explicitly set to 0.
Resources []ResourceSpec
// Arguments specific to RequestedToCapacityRatio strategy.
RequestedToCapacityRatio *RequestedToCapacityRatioParam
}
// RequestedToCapacityRatioParam define RequestedToCapacityRatio parameters
type RequestedToCapacityRatioParam struct {
// Shape is a list of points defining the scoring function shape.
Shape []UtilizationShapePoint
}


@@ -0,0 +1,107 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"fmt"
"sync"
"k8s.io/apimachinery/pkg/conversion"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
v1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
)
var (
// pluginArgConversionScheme is a scheme with internal and v1 registered,
// used for defaulting/converting typed PluginConfig Args.
// Access via getPluginArgConversionScheme()
pluginArgConversionScheme *runtime.Scheme
initPluginArgConversionScheme sync.Once
)
func GetPluginArgConversionScheme() *runtime.Scheme {
initPluginArgConversionScheme.Do(func() {
// set up the scheme used for plugin arg conversion
pluginArgConversionScheme = runtime.NewScheme()
utilruntime.Must(AddToScheme(pluginArgConversionScheme))
utilruntime.Must(config.AddToScheme(pluginArgConversionScheme))
})
return pluginArgConversionScheme
}
func Convert_v1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfiguration(in *v1.KubeSchedulerConfiguration, out *config.KubeSchedulerConfiguration, s conversion.Scope) error {
if err := autoConvert_v1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfiguration(in, out, s); err != nil {
return err
}
return convertToInternalPluginConfigArgs(out)
}
// convertToInternalPluginConfigArgs converts PluginConfig#Args into internal
// types using a scheme, after applying defaults.
func convertToInternalPluginConfigArgs(out *config.KubeSchedulerConfiguration) error {
scheme := GetPluginArgConversionScheme()
for i := range out.Profiles {
prof := &out.Profiles[i]
for j := range prof.PluginConfig {
args := prof.PluginConfig[j].Args
if args == nil {
continue
}
if _, isUnknown := args.(*runtime.Unknown); isUnknown {
continue
}
internalArgs, err := scheme.ConvertToVersion(args, config.SchemeGroupVersion)
if err != nil {
return fmt.Errorf("converting .Profiles[%d].PluginConfig[%d].Args into internal type: %w", i, j, err)
}
prof.PluginConfig[j].Args = internalArgs
}
}
return nil
}
func Convert_config_KubeSchedulerConfiguration_To_v1_KubeSchedulerConfiguration(in *config.KubeSchedulerConfiguration, out *v1.KubeSchedulerConfiguration, s conversion.Scope) error {
if err := autoConvert_config_KubeSchedulerConfiguration_To_v1_KubeSchedulerConfiguration(in, out, s); err != nil {
return err
}
return convertToExternalPluginConfigArgs(out)
}
// convertToExternalPluginConfigArgs converts PluginConfig#Args into
// external (versioned) types using a scheme.
func convertToExternalPluginConfigArgs(out *v1.KubeSchedulerConfiguration) error {
scheme := GetPluginArgConversionScheme()
for i := range out.Profiles {
for j := range out.Profiles[i].PluginConfig {
args := out.Profiles[i].PluginConfig[j].Args
if args.Object == nil {
continue
}
if _, isUnknown := args.Object.(*runtime.Unknown); isUnknown {
continue
}
externalArgs, err := scheme.ConvertToVersion(args.Object, SchemeGroupVersion)
if err != nil {
return err
}
out.Profiles[i].PluginConfig[j].Args.Object = externalArgs
}
}
return nil
}


@@ -0,0 +1,157 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
v1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/utils/ptr"
)
// getDefaultPlugins returns the default set of plugins.
func getDefaultPlugins() *v1.Plugins {
plugins := &v1.Plugins{
MultiPoint: v1.PluginSet{
Enabled: []v1.Plugin{
{Name: names.SchedulingGates},
{Name: names.PrioritySort},
{Name: names.NodeUnschedulable},
{Name: names.NodeName},
{Name: names.TaintToleration, Weight: ptr.To[int32](3)},
{Name: names.NodeAffinity, Weight: ptr.To[int32](2)},
{Name: names.NodePorts},
{Name: names.NodeResourcesFit, Weight: ptr.To[int32](1)},
{Name: names.VolumeRestrictions},
{Name: names.NodeVolumeLimits},
{Name: names.VolumeBinding},
{Name: names.VolumeZone},
{Name: names.PodTopologySpread, Weight: ptr.To[int32](2)},
{Name: names.InterPodAffinity, Weight: ptr.To[int32](2)},
{Name: names.DefaultPreemption},
{Name: names.NodeResourcesBalancedAllocation, Weight: ptr.To[int32](1)},
{Name: names.ImageLocality, Weight: ptr.To[int32](1)},
{Name: names.DefaultBinder},
},
},
}
applyFeatureGates(plugins)
return plugins
}
func applyFeatureGates(config *v1.Plugins) {
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
// This plugin should come before DefaultPreemption because if
// there is a problem with a Pod and PostFilter gets called to
// resolve the problem, it is better to first deallocate an
// idle ResourceClaim than it is to evict some Pod that might
// be doing useful work.
for i := range config.MultiPoint.Enabled {
if config.MultiPoint.Enabled[i].Name == names.DefaultPreemption {
extended := make([]v1.Plugin, 0, len(config.MultiPoint.Enabled)+1)
extended = append(extended, config.MultiPoint.Enabled[:i]...)
extended = append(extended, v1.Plugin{Name: names.DynamicResources})
extended = append(extended, config.MultiPoint.Enabled[i:]...)
config.MultiPoint.Enabled = extended
break
}
}
}
}
// mergePlugins merges the custom set into the given default one, handling disabled sets.
func mergePlugins(logger klog.Logger, defaultPlugins, customPlugins *v1.Plugins) *v1.Plugins {
if customPlugins == nil {
return defaultPlugins
}
defaultPlugins.MultiPoint = mergePluginSet(logger, defaultPlugins.MultiPoint, customPlugins.MultiPoint)
defaultPlugins.PreEnqueue = mergePluginSet(logger, defaultPlugins.PreEnqueue, customPlugins.PreEnqueue)
defaultPlugins.QueueSort = mergePluginSet(logger, defaultPlugins.QueueSort, customPlugins.QueueSort)
defaultPlugins.PreFilter = mergePluginSet(logger, defaultPlugins.PreFilter, customPlugins.PreFilter)
defaultPlugins.Filter = mergePluginSet(logger, defaultPlugins.Filter, customPlugins.Filter)
defaultPlugins.PostFilter = mergePluginSet(logger, defaultPlugins.PostFilter, customPlugins.PostFilter)
defaultPlugins.PreScore = mergePluginSet(logger, defaultPlugins.PreScore, customPlugins.PreScore)
defaultPlugins.Score = mergePluginSet(logger, defaultPlugins.Score, customPlugins.Score)
defaultPlugins.Reserve = mergePluginSet(logger, defaultPlugins.Reserve, customPlugins.Reserve)
defaultPlugins.Permit = mergePluginSet(logger, defaultPlugins.Permit, customPlugins.Permit)
defaultPlugins.PreBind = mergePluginSet(logger, defaultPlugins.PreBind, customPlugins.PreBind)
defaultPlugins.Bind = mergePluginSet(logger, defaultPlugins.Bind, customPlugins.Bind)
defaultPlugins.PostBind = mergePluginSet(logger, defaultPlugins.PostBind, customPlugins.PostBind)
return defaultPlugins
}
type pluginIndex struct {
index int
plugin v1.Plugin
}
func mergePluginSet(logger klog.Logger, defaultPluginSet, customPluginSet v1.PluginSet) v1.PluginSet {
disabledPlugins := sets.New[string]()
enabledCustomPlugins := make(map[string]pluginIndex)
// replacedPluginIndex is a set of index of plugins, which have replaced the default plugins.
replacedPluginIndex := sets.New[int]()
var disabled []v1.Plugin
for _, disabledPlugin := range customPluginSet.Disabled {
// if the user is manually disabling any (or all, with "*") default plugins for an extension point,
// we need to track that so that the MultiPoint extension logic in the framework can know to skip
// inserting unspecified default plugins to this point.
disabled = append(disabled, v1.Plugin{Name: disabledPlugin.Name})
disabledPlugins.Insert(disabledPlugin.Name)
}
// With MultiPoint, we may now have some disabledPlugins in the default registry
// For example, we enable PluginX with Filter+Score through MultiPoint but disable its Score plugin by default.
for _, disabledPlugin := range defaultPluginSet.Disabled {
disabled = append(disabled, v1.Plugin{Name: disabledPlugin.Name})
disabledPlugins.Insert(disabledPlugin.Name)
}
for index, enabledPlugin := range customPluginSet.Enabled {
enabledCustomPlugins[enabledPlugin.Name] = pluginIndex{index, enabledPlugin}
}
var enabledPlugins []v1.Plugin
if !disabledPlugins.Has("*") {
for _, defaultEnabledPlugin := range defaultPluginSet.Enabled {
if disabledPlugins.Has(defaultEnabledPlugin.Name) {
continue
}
// The default plugin is explicitly re-configured, update the default plugin accordingly.
if customPlugin, ok := enabledCustomPlugins[defaultEnabledPlugin.Name]; ok {
logger.Info("Default plugin is explicitly re-configured; overriding", "plugin", defaultEnabledPlugin.Name)
// Update the default plugin in place to preserve order.
defaultEnabledPlugin = customPlugin.plugin
replacedPluginIndex.Insert(customPlugin.index)
}
enabledPlugins = append(enabledPlugins, defaultEnabledPlugin)
}
}
// Append all the custom plugins which haven't replaced any default plugins.
// Note: duplicated custom plugins will still be appended here.
// If so, the instantiation of scheduler framework will detect it and abort.
for index, plugin := range customPluginSet.Enabled {
if !replacedPluginIndex.Has(index) {
enabledPlugins = append(enabledPlugins, plugin)
}
}
return v1.PluginSet{Enabled: enabledPlugins, Disabled: disabled}
}


@@ -0,0 +1,244 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apiserver/pkg/util/feature"
componentbaseconfigv1alpha1 "k8s.io/component-base/config/v1alpha1"
"k8s.io/klog/v2"
configv1 "k8s.io/kube-scheduler/config/v1"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/utils/ptr"
)
var defaultResourceSpec = []configv1.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
}
func addDefaultingFuncs(scheme *runtime.Scheme) error {
return RegisterDefaults(scheme)
}
func pluginsNames(p *configv1.Plugins) []string {
if p == nil {
return nil
}
extensions := []configv1.PluginSet{
p.MultiPoint,
p.PreFilter,
p.Filter,
p.PostFilter,
p.Reserve,
p.PreScore,
p.Score,
p.PreBind,
p.Bind,
p.PostBind,
p.Permit,
p.PreEnqueue,
p.QueueSort,
}
n := sets.New[string]()
for _, e := range extensions {
for _, pg := range e.Enabled {
n.Insert(pg.Name)
}
}
return sets.List(n)
}
func setDefaults_KubeSchedulerProfile(logger klog.Logger, prof *configv1.KubeSchedulerProfile) {
// Set default plugins.
prof.Plugins = mergePlugins(logger, getDefaultPlugins(), prof.Plugins)
// Set default plugin configs.
scheme := GetPluginArgConversionScheme()
existingConfigs := sets.New[string]()
for j := range prof.PluginConfig {
existingConfigs.Insert(prof.PluginConfig[j].Name)
args := prof.PluginConfig[j].Args.Object
if _, isUnknown := args.(*runtime.Unknown); isUnknown {
continue
}
scheme.Default(args)
}
// Append default configs for plugins that didn't have one explicitly set.
for _, name := range pluginsNames(prof.Plugins) {
if existingConfigs.Has(name) {
continue
}
gvk := configv1.SchemeGroupVersion.WithKind(name + "Args")
args, err := scheme.New(gvk)
if err != nil {
// This plugin is out-of-tree or doesn't require configuration.
continue
}
scheme.Default(args)
args.GetObjectKind().SetGroupVersionKind(gvk)
prof.PluginConfig = append(prof.PluginConfig, configv1.PluginConfig{
Name: name,
Args: runtime.RawExtension{Object: args},
})
}
}
// SetDefaults_KubeSchedulerConfiguration sets additional defaults
func SetDefaults_KubeSchedulerConfiguration(obj *configv1.KubeSchedulerConfiguration) {
logger := klog.TODO() // called by generated code that doesn't pass a logger. See #115724
if obj.Parallelism == nil {
obj.Parallelism = ptr.To[int32](16)
}
if len(obj.Profiles) == 0 {
obj.Profiles = append(obj.Profiles, configv1.KubeSchedulerProfile{})
}
// Only apply a default scheduler name when there is a single profile.
// Validation will ensure that every profile has a non-empty unique name.
if len(obj.Profiles) == 1 && obj.Profiles[0].SchedulerName == nil {
obj.Profiles[0].SchedulerName = ptr.To(v1.DefaultSchedulerName)
}
// Add the default set of plugins and apply the configuration.
for i := range obj.Profiles {
prof := &obj.Profiles[i]
setDefaults_KubeSchedulerProfile(logger, prof)
}
if obj.PercentageOfNodesToScore == nil {
obj.PercentageOfNodesToScore = ptr.To[int32](config.DefaultPercentageOfNodesToScore)
}
if len(obj.LeaderElection.ResourceLock) == 0 {
// Use lease-based leader election to reduce cost.
// We migrated for EndpointsLease lock in 1.17 and starting in 1.20 we
// migrated to Lease lock.
obj.LeaderElection.ResourceLock = "leases"
}
if len(obj.LeaderElection.ResourceNamespace) == 0 {
obj.LeaderElection.ResourceNamespace = configv1.SchedulerDefaultLockObjectNamespace
}
if len(obj.LeaderElection.ResourceName) == 0 {
obj.LeaderElection.ResourceName = configv1.SchedulerDefaultLockObjectName
}
if len(obj.ClientConnection.ContentType) == 0 {
obj.ClientConnection.ContentType = "application/vnd.kubernetes.protobuf"
}
// Scheduler has an opinion about QPS/Burst, setting specific defaults for itself, instead of generic settings.
if obj.ClientConnection.QPS == 0.0 {
obj.ClientConnection.QPS = 50.0
}
if obj.ClientConnection.Burst == 0 {
obj.ClientConnection.Burst = 100
}
// Use the default LeaderElectionConfiguration options
componentbaseconfigv1alpha1.RecommendedDefaultLeaderElectionConfiguration(&obj.LeaderElection)
if obj.PodInitialBackoffSeconds == nil {
obj.PodInitialBackoffSeconds = ptr.To[int64](1)
}
if obj.PodMaxBackoffSeconds == nil {
obj.PodMaxBackoffSeconds = ptr.To[int64](10)
}
// Enable profiling by default in the scheduler
if obj.EnableProfiling == nil {
obj.EnableProfiling = ptr.To(true)
}
// Enable contention profiling by default if profiling is enabled
if *obj.EnableProfiling && obj.EnableContentionProfiling == nil {
obj.EnableContentionProfiling = ptr.To(true)
}
}
func SetDefaults_DefaultPreemptionArgs(obj *configv1.DefaultPreemptionArgs) {
if obj.MinCandidateNodesPercentage == nil {
obj.MinCandidateNodesPercentage = ptr.To[int32](10)
}
if obj.MinCandidateNodesAbsolute == nil {
obj.MinCandidateNodesAbsolute = ptr.To[int32](100)
}
}
func SetDefaults_InterPodAffinityArgs(obj *configv1.InterPodAffinityArgs) {
if obj.HardPodAffinityWeight == nil {
obj.HardPodAffinityWeight = ptr.To[int32](1)
}
}
func SetDefaults_VolumeBindingArgs(obj *configv1.VolumeBindingArgs) {
if obj.BindTimeoutSeconds == nil {
obj.BindTimeoutSeconds = ptr.To[int64](600)
}
if len(obj.Shape) == 0 && feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority) {
obj.Shape = []configv1.UtilizationShapePoint{
{
Utilization: 0,
Score: 0,
},
{
Utilization: 100,
Score: int32(config.MaxCustomPriorityScore),
},
}
}
}
func SetDefaults_NodeResourcesBalancedAllocationArgs(obj *configv1.NodeResourcesBalancedAllocationArgs) {
if len(obj.Resources) == 0 {
obj.Resources = defaultResourceSpec
return
}
// If the weight is not set or it is explicitly set to 0, then apply the default weight(1) instead.
for i := range obj.Resources {
if obj.Resources[i].Weight == 0 {
obj.Resources[i].Weight = 1
}
}
}
func SetDefaults_PodTopologySpreadArgs(obj *configv1.PodTopologySpreadArgs) {
if obj.DefaultingType == "" {
obj.DefaultingType = configv1.SystemDefaulting
}
}
func SetDefaults_NodeResourcesFitArgs(obj *configv1.NodeResourcesFitArgs) {
if obj.ScoringStrategy == nil {
obj.ScoringStrategy = &configv1.ScoringStrategy{
Type: configv1.ScoringStrategyType(config.LeastAllocated),
Resources: defaultResourceSpec,
}
}
if len(obj.ScoringStrategy.Resources) == 0 {
// If no resources specified, use the default set.
obj.ScoringStrategy.Resources = append(obj.ScoringStrategy.Resources, defaultResourceSpec...)
}
for i := range obj.ScoringStrategy.Resources {
if obj.ScoringStrategy.Resources[i].Weight == 0 {
obj.ScoringStrategy.Resources[i].Weight = 1
}
}
}


@@ -0,0 +1,24 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// +k8s:deepcopy-gen=package
// +k8s:conversion-gen=k8s.io/kubernetes/pkg/scheduler/apis/config
// +k8s:conversion-gen-external-types=k8s.io/kube-scheduler/config/v1
// +k8s:defaulter-gen=TypeMeta
// +k8s:defaulter-gen-input=k8s.io/kube-scheduler/config/v1
// +groupName=kubescheduler.config.k8s.io
package v1 // import "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"


@@ -0,0 +1,42 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package v1
import (
v1 "k8s.io/kube-scheduler/config/v1"
)
// GroupName is the group name used in this package
const GroupName = v1.GroupName
// SchemeGroupVersion is group version used to register these objects
var SchemeGroupVersion = v1.SchemeGroupVersion
var (
// localSchemeBuilder extends the SchemeBuilder instance with the external types. In this package,
// defaulting and conversion init funcs are registered as well.
localSchemeBuilder = &v1.SchemeBuilder
// AddToScheme is a global function that registers this API group & version to a scheme
AddToScheme = localSchemeBuilder.AddToScheme
)
func init() {
// We only register manually written functions here. The registration of the
// generated functions takes place in the generated files. The separation
// makes the code compile even when the generated files are missing.
localSchemeBuilder.Register(addDefaultingFuncs)
}


@@ -0,0 +1,946 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by conversion-gen. DO NOT EDIT.
package v1
import (
unsafe "unsafe"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
conversion "k8s.io/apimachinery/pkg/conversion"
runtime "k8s.io/apimachinery/pkg/runtime"
v1alpha1 "k8s.io/component-base/config/v1alpha1"
configv1 "k8s.io/kube-scheduler/config/v1"
config "k8s.io/kubernetes/pkg/scheduler/apis/config"
)
func init() {
localSchemeBuilder.Register(RegisterConversions)
}
// RegisterConversions adds conversion functions to the given scheme.
// Public to allow building arbitrary schemes.
func RegisterConversions(s *runtime.Scheme) error {
if err := s.AddGeneratedConversionFunc((*configv1.DefaultPreemptionArgs)(nil), (*config.DefaultPreemptionArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(a.(*configv1.DefaultPreemptionArgs), b.(*config.DefaultPreemptionArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.DefaultPreemptionArgs)(nil), (*configv1.DefaultPreemptionArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_DefaultPreemptionArgs_To_v1_DefaultPreemptionArgs(a.(*config.DefaultPreemptionArgs), b.(*configv1.DefaultPreemptionArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.Extender)(nil), (*config.Extender)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_Extender_To_config_Extender(a.(*configv1.Extender), b.(*config.Extender), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.Extender)(nil), (*configv1.Extender)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_Extender_To_v1_Extender(a.(*config.Extender), b.(*configv1.Extender), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.ExtenderManagedResource)(nil), (*config.ExtenderManagedResource)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_ExtenderManagedResource_To_config_ExtenderManagedResource(a.(*configv1.ExtenderManagedResource), b.(*config.ExtenderManagedResource), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.ExtenderManagedResource)(nil), (*configv1.ExtenderManagedResource)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_ExtenderManagedResource_To_v1_ExtenderManagedResource(a.(*config.ExtenderManagedResource), b.(*configv1.ExtenderManagedResource), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.ExtenderTLSConfig)(nil), (*config.ExtenderTLSConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_ExtenderTLSConfig_To_config_ExtenderTLSConfig(a.(*configv1.ExtenderTLSConfig), b.(*config.ExtenderTLSConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.ExtenderTLSConfig)(nil), (*configv1.ExtenderTLSConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_ExtenderTLSConfig_To_v1_ExtenderTLSConfig(a.(*config.ExtenderTLSConfig), b.(*configv1.ExtenderTLSConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.InterPodAffinityArgs)(nil), (*config.InterPodAffinityArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(a.(*configv1.InterPodAffinityArgs), b.(*config.InterPodAffinityArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.InterPodAffinityArgs)(nil), (*configv1.InterPodAffinityArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_InterPodAffinityArgs_To_v1_InterPodAffinityArgs(a.(*config.InterPodAffinityArgs), b.(*configv1.InterPodAffinityArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.KubeSchedulerProfile)(nil), (*config.KubeSchedulerProfile)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(a.(*configv1.KubeSchedulerProfile), b.(*config.KubeSchedulerProfile), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.KubeSchedulerProfile)(nil), (*configv1.KubeSchedulerProfile)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile(a.(*config.KubeSchedulerProfile), b.(*configv1.KubeSchedulerProfile), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.NodeAffinityArgs)(nil), (*config.NodeAffinityArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_NodeAffinityArgs_To_config_NodeAffinityArgs(a.(*configv1.NodeAffinityArgs), b.(*config.NodeAffinityArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.NodeAffinityArgs)(nil), (*configv1.NodeAffinityArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_NodeAffinityArgs_To_v1_NodeAffinityArgs(a.(*config.NodeAffinityArgs), b.(*configv1.NodeAffinityArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.NodeResourcesBalancedAllocationArgs)(nil), (*config.NodeResourcesBalancedAllocationArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_NodeResourcesBalancedAllocationArgs_To_config_NodeResourcesBalancedAllocationArgs(a.(*configv1.NodeResourcesBalancedAllocationArgs), b.(*config.NodeResourcesBalancedAllocationArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.NodeResourcesBalancedAllocationArgs)(nil), (*configv1.NodeResourcesBalancedAllocationArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_NodeResourcesBalancedAllocationArgs_To_v1_NodeResourcesBalancedAllocationArgs(a.(*config.NodeResourcesBalancedAllocationArgs), b.(*configv1.NodeResourcesBalancedAllocationArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.NodeResourcesFitArgs)(nil), (*config.NodeResourcesFitArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_NodeResourcesFitArgs_To_config_NodeResourcesFitArgs(a.(*configv1.NodeResourcesFitArgs), b.(*config.NodeResourcesFitArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.NodeResourcesFitArgs)(nil), (*configv1.NodeResourcesFitArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_NodeResourcesFitArgs_To_v1_NodeResourcesFitArgs(a.(*config.NodeResourcesFitArgs), b.(*configv1.NodeResourcesFitArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.Plugin)(nil), (*config.Plugin)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_Plugin_To_config_Plugin(a.(*configv1.Plugin), b.(*config.Plugin), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.Plugin)(nil), (*configv1.Plugin)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_Plugin_To_v1_Plugin(a.(*config.Plugin), b.(*configv1.Plugin), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.PluginConfig)(nil), (*config.PluginConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PluginConfig_To_config_PluginConfig(a.(*configv1.PluginConfig), b.(*config.PluginConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.PluginConfig)(nil), (*configv1.PluginConfig)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_PluginConfig_To_v1_PluginConfig(a.(*config.PluginConfig), b.(*configv1.PluginConfig), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.PluginSet)(nil), (*config.PluginSet)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PluginSet_To_config_PluginSet(a.(*configv1.PluginSet), b.(*config.PluginSet), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.PluginSet)(nil), (*configv1.PluginSet)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_PluginSet_To_v1_PluginSet(a.(*config.PluginSet), b.(*configv1.PluginSet), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.Plugins)(nil), (*config.Plugins)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_Plugins_To_config_Plugins(a.(*configv1.Plugins), b.(*config.Plugins), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.Plugins)(nil), (*configv1.Plugins)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_Plugins_To_v1_Plugins(a.(*config.Plugins), b.(*configv1.Plugins), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.PodTopologySpreadArgs)(nil), (*config.PodTopologySpreadArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_PodTopologySpreadArgs_To_config_PodTopologySpreadArgs(a.(*configv1.PodTopologySpreadArgs), b.(*config.PodTopologySpreadArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.PodTopologySpreadArgs)(nil), (*configv1.PodTopologySpreadArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_PodTopologySpreadArgs_To_v1_PodTopologySpreadArgs(a.(*config.PodTopologySpreadArgs), b.(*configv1.PodTopologySpreadArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.RequestedToCapacityRatioParam)(nil), (*config.RequestedToCapacityRatioParam)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_RequestedToCapacityRatioParam_To_config_RequestedToCapacityRatioParam(a.(*configv1.RequestedToCapacityRatioParam), b.(*config.RequestedToCapacityRatioParam), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.RequestedToCapacityRatioParam)(nil), (*configv1.RequestedToCapacityRatioParam)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_RequestedToCapacityRatioParam_To_v1_RequestedToCapacityRatioParam(a.(*config.RequestedToCapacityRatioParam), b.(*configv1.RequestedToCapacityRatioParam), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.ResourceSpec)(nil), (*config.ResourceSpec)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_ResourceSpec_To_config_ResourceSpec(a.(*configv1.ResourceSpec), b.(*config.ResourceSpec), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.ResourceSpec)(nil), (*configv1.ResourceSpec)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_ResourceSpec_To_v1_ResourceSpec(a.(*config.ResourceSpec), b.(*configv1.ResourceSpec), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.ScoringStrategy)(nil), (*config.ScoringStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_ScoringStrategy_To_config_ScoringStrategy(a.(*configv1.ScoringStrategy), b.(*config.ScoringStrategy), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.ScoringStrategy)(nil), (*configv1.ScoringStrategy)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_ScoringStrategy_To_v1_ScoringStrategy(a.(*config.ScoringStrategy), b.(*configv1.ScoringStrategy), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.UtilizationShapePoint)(nil), (*config.UtilizationShapePoint)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_UtilizationShapePoint_To_config_UtilizationShapePoint(a.(*configv1.UtilizationShapePoint), b.(*config.UtilizationShapePoint), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.UtilizationShapePoint)(nil), (*configv1.UtilizationShapePoint)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_UtilizationShapePoint_To_v1_UtilizationShapePoint(a.(*config.UtilizationShapePoint), b.(*configv1.UtilizationShapePoint), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*configv1.VolumeBindingArgs)(nil), (*config.VolumeBindingArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_VolumeBindingArgs_To_config_VolumeBindingArgs(a.(*configv1.VolumeBindingArgs), b.(*config.VolumeBindingArgs), scope)
}); err != nil {
return err
}
if err := s.AddGeneratedConversionFunc((*config.VolumeBindingArgs)(nil), (*configv1.VolumeBindingArgs)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_VolumeBindingArgs_To_v1_VolumeBindingArgs(a.(*config.VolumeBindingArgs), b.(*configv1.VolumeBindingArgs), scope)
}); err != nil {
return err
}
if err := s.AddConversionFunc((*config.KubeSchedulerConfiguration)(nil), (*configv1.KubeSchedulerConfiguration)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_config_KubeSchedulerConfiguration_To_v1_KubeSchedulerConfiguration(a.(*config.KubeSchedulerConfiguration), b.(*configv1.KubeSchedulerConfiguration), scope)
}); err != nil {
return err
}
if err := s.AddConversionFunc((*configv1.KubeSchedulerConfiguration)(nil), (*config.KubeSchedulerConfiguration)(nil), func(a, b interface{}, scope conversion.Scope) error {
return Convert_v1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfiguration(a.(*configv1.KubeSchedulerConfiguration), b.(*config.KubeSchedulerConfiguration), scope)
}); err != nil {
return err
}
return nil
}
func autoConvert_v1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in *configv1.DefaultPreemptionArgs, out *config.DefaultPreemptionArgs, s conversion.Scope) error {
if err := metav1.Convert_Pointer_int32_To_int32(&in.MinCandidateNodesPercentage, &out.MinCandidateNodesPercentage, s); err != nil {
return err
}
if err := metav1.Convert_Pointer_int32_To_int32(&in.MinCandidateNodesAbsolute, &out.MinCandidateNodesAbsolute, s); err != nil {
return err
}
return nil
}
// Convert_v1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs is an autogenerated conversion function.
func Convert_v1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in *configv1.DefaultPreemptionArgs, out *config.DefaultPreemptionArgs, s conversion.Scope) error {
return autoConvert_v1_DefaultPreemptionArgs_To_config_DefaultPreemptionArgs(in, out, s)
}
func autoConvert_config_DefaultPreemptionArgs_To_v1_DefaultPreemptionArgs(in *config.DefaultPreemptionArgs, out *configv1.DefaultPreemptionArgs, s conversion.Scope) error {
if err := metav1.Convert_int32_To_Pointer_int32(&in.MinCandidateNodesPercentage, &out.MinCandidateNodesPercentage, s); err != nil {
return err
}
if err := metav1.Convert_int32_To_Pointer_int32(&in.MinCandidateNodesAbsolute, &out.MinCandidateNodesAbsolute, s); err != nil {
return err
}
return nil
}
// Convert_config_DefaultPreemptionArgs_To_v1_DefaultPreemptionArgs is an autogenerated conversion function.
func Convert_config_DefaultPreemptionArgs_To_v1_DefaultPreemptionArgs(in *config.DefaultPreemptionArgs, out *configv1.DefaultPreemptionArgs, s conversion.Scope) error {
return autoConvert_config_DefaultPreemptionArgs_To_v1_DefaultPreemptionArgs(in, out, s)
}
func autoConvert_v1_Extender_To_config_Extender(in *configv1.Extender, out *config.Extender, s conversion.Scope) error {
out.URLPrefix = in.URLPrefix
out.FilterVerb = in.FilterVerb
out.PreemptVerb = in.PreemptVerb
out.PrioritizeVerb = in.PrioritizeVerb
out.Weight = in.Weight
out.BindVerb = in.BindVerb
out.EnableHTTPS = in.EnableHTTPS
out.TLSConfig = (*config.ExtenderTLSConfig)(unsafe.Pointer(in.TLSConfig))
out.HTTPTimeout = in.HTTPTimeout
out.NodeCacheCapable = in.NodeCacheCapable
out.ManagedResources = *(*[]config.ExtenderManagedResource)(unsafe.Pointer(&in.ManagedResources))
out.Ignorable = in.Ignorable
return nil
}
// Convert_v1_Extender_To_config_Extender is an autogenerated conversion function.
func Convert_v1_Extender_To_config_Extender(in *configv1.Extender, out *config.Extender, s conversion.Scope) error {
return autoConvert_v1_Extender_To_config_Extender(in, out, s)
}
func autoConvert_config_Extender_To_v1_Extender(in *config.Extender, out *configv1.Extender, s conversion.Scope) error {
out.URLPrefix = in.URLPrefix
out.FilterVerb = in.FilterVerb
out.PreemptVerb = in.PreemptVerb
out.PrioritizeVerb = in.PrioritizeVerb
out.Weight = in.Weight
out.BindVerb = in.BindVerb
out.EnableHTTPS = in.EnableHTTPS
out.TLSConfig = (*configv1.ExtenderTLSConfig)(unsafe.Pointer(in.TLSConfig))
out.HTTPTimeout = in.HTTPTimeout
out.NodeCacheCapable = in.NodeCacheCapable
out.ManagedResources = *(*[]configv1.ExtenderManagedResource)(unsafe.Pointer(&in.ManagedResources))
out.Ignorable = in.Ignorable
return nil
}
// Convert_config_Extender_To_v1_Extender is an autogenerated conversion function.
func Convert_config_Extender_To_v1_Extender(in *config.Extender, out *configv1.Extender, s conversion.Scope) error {
return autoConvert_config_Extender_To_v1_Extender(in, out, s)
}
func autoConvert_v1_ExtenderManagedResource_To_config_ExtenderManagedResource(in *configv1.ExtenderManagedResource, out *config.ExtenderManagedResource, s conversion.Scope) error {
out.Name = in.Name
out.IgnoredByScheduler = in.IgnoredByScheduler
return nil
}
// Convert_v1_ExtenderManagedResource_To_config_ExtenderManagedResource is an autogenerated conversion function.
func Convert_v1_ExtenderManagedResource_To_config_ExtenderManagedResource(in *configv1.ExtenderManagedResource, out *config.ExtenderManagedResource, s conversion.Scope) error {
return autoConvert_v1_ExtenderManagedResource_To_config_ExtenderManagedResource(in, out, s)
}
func autoConvert_config_ExtenderManagedResource_To_v1_ExtenderManagedResource(in *config.ExtenderManagedResource, out *configv1.ExtenderManagedResource, s conversion.Scope) error {
out.Name = in.Name
out.IgnoredByScheduler = in.IgnoredByScheduler
return nil
}
// Convert_config_ExtenderManagedResource_To_v1_ExtenderManagedResource is an autogenerated conversion function.
func Convert_config_ExtenderManagedResource_To_v1_ExtenderManagedResource(in *config.ExtenderManagedResource, out *configv1.ExtenderManagedResource, s conversion.Scope) error {
return autoConvert_config_ExtenderManagedResource_To_v1_ExtenderManagedResource(in, out, s)
}
func autoConvert_v1_ExtenderTLSConfig_To_config_ExtenderTLSConfig(in *configv1.ExtenderTLSConfig, out *config.ExtenderTLSConfig, s conversion.Scope) error {
out.Insecure = in.Insecure
out.ServerName = in.ServerName
out.CertFile = in.CertFile
out.KeyFile = in.KeyFile
out.CAFile = in.CAFile
out.CertData = *(*[]byte)(unsafe.Pointer(&in.CertData))
out.KeyData = *(*[]byte)(unsafe.Pointer(&in.KeyData))
out.CAData = *(*[]byte)(unsafe.Pointer(&in.CAData))
return nil
}
// Convert_v1_ExtenderTLSConfig_To_config_ExtenderTLSConfig is an autogenerated conversion function.
func Convert_v1_ExtenderTLSConfig_To_config_ExtenderTLSConfig(in *configv1.ExtenderTLSConfig, out *config.ExtenderTLSConfig, s conversion.Scope) error {
return autoConvert_v1_ExtenderTLSConfig_To_config_ExtenderTLSConfig(in, out, s)
}
func autoConvert_config_ExtenderTLSConfig_To_v1_ExtenderTLSConfig(in *config.ExtenderTLSConfig, out *configv1.ExtenderTLSConfig, s conversion.Scope) error {
out.Insecure = in.Insecure
out.ServerName = in.ServerName
out.CertFile = in.CertFile
out.KeyFile = in.KeyFile
out.CAFile = in.CAFile
out.CertData = *(*[]byte)(unsafe.Pointer(&in.CertData))
out.KeyData = *(*[]byte)(unsafe.Pointer(&in.KeyData))
out.CAData = *(*[]byte)(unsafe.Pointer(&in.CAData))
return nil
}
// Convert_config_ExtenderTLSConfig_To_v1_ExtenderTLSConfig is an autogenerated conversion function.
func Convert_config_ExtenderTLSConfig_To_v1_ExtenderTLSConfig(in *config.ExtenderTLSConfig, out *configv1.ExtenderTLSConfig, s conversion.Scope) error {
return autoConvert_config_ExtenderTLSConfig_To_v1_ExtenderTLSConfig(in, out, s)
}
func autoConvert_v1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(in *configv1.InterPodAffinityArgs, out *config.InterPodAffinityArgs, s conversion.Scope) error {
if err := metav1.Convert_Pointer_int32_To_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil {
return err
}
out.IgnorePreferredTermsOfExistingPods = in.IgnorePreferredTermsOfExistingPods
return nil
}
// Convert_v1_InterPodAffinityArgs_To_config_InterPodAffinityArgs is an autogenerated conversion function.
func Convert_v1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(in *configv1.InterPodAffinityArgs, out *config.InterPodAffinityArgs, s conversion.Scope) error {
return autoConvert_v1_InterPodAffinityArgs_To_config_InterPodAffinityArgs(in, out, s)
}
func autoConvert_config_InterPodAffinityArgs_To_v1_InterPodAffinityArgs(in *config.InterPodAffinityArgs, out *configv1.InterPodAffinityArgs, s conversion.Scope) error {
if err := metav1.Convert_int32_To_Pointer_int32(&in.HardPodAffinityWeight, &out.HardPodAffinityWeight, s); err != nil {
return err
}
out.IgnorePreferredTermsOfExistingPods = in.IgnorePreferredTermsOfExistingPods
return nil
}
// Convert_config_InterPodAffinityArgs_To_v1_InterPodAffinityArgs is an autogenerated conversion function.
func Convert_config_InterPodAffinityArgs_To_v1_InterPodAffinityArgs(in *config.InterPodAffinityArgs, out *configv1.InterPodAffinityArgs, s conversion.Scope) error {
return autoConvert_config_InterPodAffinityArgs_To_v1_InterPodAffinityArgs(in, out, s)
}
func autoConvert_v1_KubeSchedulerConfiguration_To_config_KubeSchedulerConfiguration(in *configv1.KubeSchedulerConfiguration, out *config.KubeSchedulerConfiguration, s conversion.Scope) error {
if err := metav1.Convert_Pointer_int32_To_int32(&in.Parallelism, &out.Parallelism, s); err != nil {
return err
}
if err := v1alpha1.Convert_v1alpha1_LeaderElectionConfiguration_To_config_LeaderElectionConfiguration(&in.LeaderElection, &out.LeaderElection, s); err != nil {
return err
}
if err := v1alpha1.Convert_v1alpha1_ClientConnectionConfiguration_To_config_ClientConnectionConfiguration(&in.ClientConnection, &out.ClientConnection, s); err != nil {
return err
}
if err := v1alpha1.Convert_v1alpha1_DebuggingConfiguration_To_config_DebuggingConfiguration(&in.DebuggingConfiguration, &out.DebuggingConfiguration, s); err != nil {
return err
}
out.PercentageOfNodesToScore = (*int32)(unsafe.Pointer(in.PercentageOfNodesToScore))
if err := metav1.Convert_Pointer_int64_To_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil {
return err
}
if err := metav1.Convert_Pointer_int64_To_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil {
return err
}
if in.Profiles != nil {
in, out := &in.Profiles, &out.Profiles
*out = make([]config.KubeSchedulerProfile, len(*in))
for i := range *in {
if err := Convert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Profiles = nil
}
out.Extenders = *(*[]config.Extender)(unsafe.Pointer(&in.Extenders))
out.DelayCacheUntilActive = in.DelayCacheUntilActive
return nil
}
func autoConvert_config_KubeSchedulerConfiguration_To_v1_KubeSchedulerConfiguration(in *config.KubeSchedulerConfiguration, out *configv1.KubeSchedulerConfiguration, s conversion.Scope) error {
if err := metav1.Convert_int32_To_Pointer_int32(&in.Parallelism, &out.Parallelism, s); err != nil {
return err
}
if err := v1alpha1.Convert_config_LeaderElectionConfiguration_To_v1alpha1_LeaderElectionConfiguration(&in.LeaderElection, &out.LeaderElection, s); err != nil {
return err
}
if err := v1alpha1.Convert_config_ClientConnectionConfiguration_To_v1alpha1_ClientConnectionConfiguration(&in.ClientConnection, &out.ClientConnection, s); err != nil {
return err
}
if err := v1alpha1.Convert_config_DebuggingConfiguration_To_v1alpha1_DebuggingConfiguration(&in.DebuggingConfiguration, &out.DebuggingConfiguration, s); err != nil {
return err
}
out.PercentageOfNodesToScore = (*int32)(unsafe.Pointer(in.PercentageOfNodesToScore))
if err := metav1.Convert_int64_To_Pointer_int64(&in.PodInitialBackoffSeconds, &out.PodInitialBackoffSeconds, s); err != nil {
return err
}
if err := metav1.Convert_int64_To_Pointer_int64(&in.PodMaxBackoffSeconds, &out.PodMaxBackoffSeconds, s); err != nil {
return err
}
if in.Profiles != nil {
in, out := &in.Profiles, &out.Profiles
*out = make([]configv1.KubeSchedulerProfile, len(*in))
for i := range *in {
if err := Convert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Profiles = nil
}
out.Extenders = *(*[]configv1.Extender)(unsafe.Pointer(&in.Extenders))
out.DelayCacheUntilActive = in.DelayCacheUntilActive
return nil
}
func autoConvert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(in *configv1.KubeSchedulerProfile, out *config.KubeSchedulerProfile, s conversion.Scope) error {
if err := metav1.Convert_Pointer_string_To_string(&in.SchedulerName, &out.SchedulerName, s); err != nil {
return err
}
out.PercentageOfNodesToScore = (*int32)(unsafe.Pointer(in.PercentageOfNodesToScore))
if in.Plugins != nil {
in, out := &in.Plugins, &out.Plugins
*out = new(config.Plugins)
if err := Convert_v1_Plugins_To_config_Plugins(*in, *out, s); err != nil {
return err
}
} else {
out.Plugins = nil
}
if in.PluginConfig != nil {
in, out := &in.PluginConfig, &out.PluginConfig
*out = make([]config.PluginConfig, len(*in))
for i := range *in {
if err := Convert_v1_PluginConfig_To_config_PluginConfig(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.PluginConfig = nil
}
return nil
}
// Convert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile is an autogenerated conversion function.
func Convert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(in *configv1.KubeSchedulerProfile, out *config.KubeSchedulerProfile, s conversion.Scope) error {
return autoConvert_v1_KubeSchedulerProfile_To_config_KubeSchedulerProfile(in, out, s)
}
func autoConvert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile(in *config.KubeSchedulerProfile, out *configv1.KubeSchedulerProfile, s conversion.Scope) error {
if err := metav1.Convert_string_To_Pointer_string(&in.SchedulerName, &out.SchedulerName, s); err != nil {
return err
}
out.PercentageOfNodesToScore = (*int32)(unsafe.Pointer(in.PercentageOfNodesToScore))
if in.Plugins != nil {
in, out := &in.Plugins, &out.Plugins
*out = new(configv1.Plugins)
if err := Convert_config_Plugins_To_v1_Plugins(*in, *out, s); err != nil {
return err
}
} else {
out.Plugins = nil
}
if in.PluginConfig != nil {
in, out := &in.PluginConfig, &out.PluginConfig
*out = make([]configv1.PluginConfig, len(*in))
for i := range *in {
if err := Convert_config_PluginConfig_To_v1_PluginConfig(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.PluginConfig = nil
}
return nil
}
// Convert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile is an autogenerated conversion function.
func Convert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile(in *config.KubeSchedulerProfile, out *configv1.KubeSchedulerProfile, s conversion.Scope) error {
return autoConvert_config_KubeSchedulerProfile_To_v1_KubeSchedulerProfile(in, out, s)
}
func autoConvert_v1_NodeAffinityArgs_To_config_NodeAffinityArgs(in *configv1.NodeAffinityArgs, out *config.NodeAffinityArgs, s conversion.Scope) error {
out.AddedAffinity = (*corev1.NodeAffinity)(unsafe.Pointer(in.AddedAffinity))
return nil
}
// Convert_v1_NodeAffinityArgs_To_config_NodeAffinityArgs is an autogenerated conversion function.
func Convert_v1_NodeAffinityArgs_To_config_NodeAffinityArgs(in *configv1.NodeAffinityArgs, out *config.NodeAffinityArgs, s conversion.Scope) error {
return autoConvert_v1_NodeAffinityArgs_To_config_NodeAffinityArgs(in, out, s)
}
func autoConvert_config_NodeAffinityArgs_To_v1_NodeAffinityArgs(in *config.NodeAffinityArgs, out *configv1.NodeAffinityArgs, s conversion.Scope) error {
out.AddedAffinity = (*corev1.NodeAffinity)(unsafe.Pointer(in.AddedAffinity))
return nil
}
// Convert_config_NodeAffinityArgs_To_v1_NodeAffinityArgs is an autogenerated conversion function.
func Convert_config_NodeAffinityArgs_To_v1_NodeAffinityArgs(in *config.NodeAffinityArgs, out *configv1.NodeAffinityArgs, s conversion.Scope) error {
return autoConvert_config_NodeAffinityArgs_To_v1_NodeAffinityArgs(in, out, s)
}
func autoConvert_v1_NodeResourcesBalancedAllocationArgs_To_config_NodeResourcesBalancedAllocationArgs(in *configv1.NodeResourcesBalancedAllocationArgs, out *config.NodeResourcesBalancedAllocationArgs, s conversion.Scope) error {
out.Resources = *(*[]config.ResourceSpec)(unsafe.Pointer(&in.Resources))
return nil
}
// Convert_v1_NodeResourcesBalancedAllocationArgs_To_config_NodeResourcesBalancedAllocationArgs is an autogenerated conversion function.
func Convert_v1_NodeResourcesBalancedAllocationArgs_To_config_NodeResourcesBalancedAllocationArgs(in *configv1.NodeResourcesBalancedAllocationArgs, out *config.NodeResourcesBalancedAllocationArgs, s conversion.Scope) error {
return autoConvert_v1_NodeResourcesBalancedAllocationArgs_To_config_NodeResourcesBalancedAllocationArgs(in, out, s)
}
func autoConvert_config_NodeResourcesBalancedAllocationArgs_To_v1_NodeResourcesBalancedAllocationArgs(in *config.NodeResourcesBalancedAllocationArgs, out *configv1.NodeResourcesBalancedAllocationArgs, s conversion.Scope) error {
out.Resources = *(*[]configv1.ResourceSpec)(unsafe.Pointer(&in.Resources))
return nil
}
// Convert_config_NodeResourcesBalancedAllocationArgs_To_v1_NodeResourcesBalancedAllocationArgs is an autogenerated conversion function.
func Convert_config_NodeResourcesBalancedAllocationArgs_To_v1_NodeResourcesBalancedAllocationArgs(in *config.NodeResourcesBalancedAllocationArgs, out *configv1.NodeResourcesBalancedAllocationArgs, s conversion.Scope) error {
return autoConvert_config_NodeResourcesBalancedAllocationArgs_To_v1_NodeResourcesBalancedAllocationArgs(in, out, s)
}
func autoConvert_v1_NodeResourcesFitArgs_To_config_NodeResourcesFitArgs(in *configv1.NodeResourcesFitArgs, out *config.NodeResourcesFitArgs, s conversion.Scope) error {
out.IgnoredResources = *(*[]string)(unsafe.Pointer(&in.IgnoredResources))
out.IgnoredResourceGroups = *(*[]string)(unsafe.Pointer(&in.IgnoredResourceGroups))
out.ScoringStrategy = (*config.ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy))
return nil
}
// Convert_v1_NodeResourcesFitArgs_To_config_NodeResourcesFitArgs is an autogenerated conversion function.
func Convert_v1_NodeResourcesFitArgs_To_config_NodeResourcesFitArgs(in *configv1.NodeResourcesFitArgs, out *config.NodeResourcesFitArgs, s conversion.Scope) error {
return autoConvert_v1_NodeResourcesFitArgs_To_config_NodeResourcesFitArgs(in, out, s)
}
func autoConvert_config_NodeResourcesFitArgs_To_v1_NodeResourcesFitArgs(in *config.NodeResourcesFitArgs, out *configv1.NodeResourcesFitArgs, s conversion.Scope) error {
out.IgnoredResources = *(*[]string)(unsafe.Pointer(&in.IgnoredResources))
out.IgnoredResourceGroups = *(*[]string)(unsafe.Pointer(&in.IgnoredResourceGroups))
out.ScoringStrategy = (*configv1.ScoringStrategy)(unsafe.Pointer(in.ScoringStrategy))
return nil
}
// Convert_config_NodeResourcesFitArgs_To_v1_NodeResourcesFitArgs is an autogenerated conversion function.
func Convert_config_NodeResourcesFitArgs_To_v1_NodeResourcesFitArgs(in *config.NodeResourcesFitArgs, out *configv1.NodeResourcesFitArgs, s conversion.Scope) error {
return autoConvert_config_NodeResourcesFitArgs_To_v1_NodeResourcesFitArgs(in, out, s)
}
func autoConvert_v1_Plugin_To_config_Plugin(in *configv1.Plugin, out *config.Plugin, s conversion.Scope) error {
out.Name = in.Name
if err := metav1.Convert_Pointer_int32_To_int32(&in.Weight, &out.Weight, s); err != nil {
return err
}
return nil
}
// Convert_v1_Plugin_To_config_Plugin is an autogenerated conversion function.
func Convert_v1_Plugin_To_config_Plugin(in *configv1.Plugin, out *config.Plugin, s conversion.Scope) error {
return autoConvert_v1_Plugin_To_config_Plugin(in, out, s)
}
func autoConvert_config_Plugin_To_v1_Plugin(in *config.Plugin, out *configv1.Plugin, s conversion.Scope) error {
out.Name = in.Name
if err := metav1.Convert_int32_To_Pointer_int32(&in.Weight, &out.Weight, s); err != nil {
return err
}
return nil
}
// Convert_config_Plugin_To_v1_Plugin is an autogenerated conversion function.
func Convert_config_Plugin_To_v1_Plugin(in *config.Plugin, out *configv1.Plugin, s conversion.Scope) error {
return autoConvert_config_Plugin_To_v1_Plugin(in, out, s)
}
func autoConvert_v1_PluginConfig_To_config_PluginConfig(in *configv1.PluginConfig, out *config.PluginConfig, s conversion.Scope) error {
out.Name = in.Name
if err := runtime.Convert_runtime_RawExtension_To_runtime_Object(&in.Args, &out.Args, s); err != nil {
return err
}
return nil
}
// Convert_v1_PluginConfig_To_config_PluginConfig is an autogenerated conversion function.
func Convert_v1_PluginConfig_To_config_PluginConfig(in *configv1.PluginConfig, out *config.PluginConfig, s conversion.Scope) error {
return autoConvert_v1_PluginConfig_To_config_PluginConfig(in, out, s)
}
func autoConvert_config_PluginConfig_To_v1_PluginConfig(in *config.PluginConfig, out *configv1.PluginConfig, s conversion.Scope) error {
out.Name = in.Name
if err := runtime.Convert_runtime_Object_To_runtime_RawExtension(&in.Args, &out.Args, s); err != nil {
return err
}
return nil
}
// Convert_config_PluginConfig_To_v1_PluginConfig is an autogenerated conversion function.
func Convert_config_PluginConfig_To_v1_PluginConfig(in *config.PluginConfig, out *configv1.PluginConfig, s conversion.Scope) error {
return autoConvert_config_PluginConfig_To_v1_PluginConfig(in, out, s)
}
func autoConvert_v1_PluginSet_To_config_PluginSet(in *configv1.PluginSet, out *config.PluginSet, s conversion.Scope) error {
if in.Enabled != nil {
in, out := &in.Enabled, &out.Enabled
*out = make([]config.Plugin, len(*in))
for i := range *in {
if err := Convert_v1_Plugin_To_config_Plugin(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Enabled = nil
}
if in.Disabled != nil {
in, out := &in.Disabled, &out.Disabled
*out = make([]config.Plugin, len(*in))
for i := range *in {
if err := Convert_v1_Plugin_To_config_Plugin(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Disabled = nil
}
return nil
}
// Convert_v1_PluginSet_To_config_PluginSet is an autogenerated conversion function.
func Convert_v1_PluginSet_To_config_PluginSet(in *configv1.PluginSet, out *config.PluginSet, s conversion.Scope) error {
return autoConvert_v1_PluginSet_To_config_PluginSet(in, out, s)
}
func autoConvert_config_PluginSet_To_v1_PluginSet(in *config.PluginSet, out *configv1.PluginSet, s conversion.Scope) error {
if in.Enabled != nil {
in, out := &in.Enabled, &out.Enabled
*out = make([]configv1.Plugin, len(*in))
for i := range *in {
if err := Convert_config_Plugin_To_v1_Plugin(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Enabled = nil
}
if in.Disabled != nil {
in, out := &in.Disabled, &out.Disabled
*out = make([]configv1.Plugin, len(*in))
for i := range *in {
if err := Convert_config_Plugin_To_v1_Plugin(&(*in)[i], &(*out)[i], s); err != nil {
return err
}
}
} else {
out.Disabled = nil
}
return nil
}
// Convert_config_PluginSet_To_v1_PluginSet is an autogenerated conversion function.
func Convert_config_PluginSet_To_v1_PluginSet(in *config.PluginSet, out *configv1.PluginSet, s conversion.Scope) error {
return autoConvert_config_PluginSet_To_v1_PluginSet(in, out, s)
}
func autoConvert_v1_Plugins_To_config_Plugins(in *configv1.Plugins, out *config.Plugins, s conversion.Scope) error {
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PreEnqueue, &out.PreEnqueue, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.QueueSort, &out.QueueSort, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PreFilter, &out.PreFilter, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.Filter, &out.Filter, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PostFilter, &out.PostFilter, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PreScore, &out.PreScore, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.Score, &out.Score, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.Reserve, &out.Reserve, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.Permit, &out.Permit, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PreBind, &out.PreBind, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.Bind, &out.Bind, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.PostBind, &out.PostBind, s); err != nil {
return err
}
if err := Convert_v1_PluginSet_To_config_PluginSet(&in.MultiPoint, &out.MultiPoint, s); err != nil {
return err
}
return nil
}
// Convert_v1_Plugins_To_config_Plugins is an autogenerated conversion function.
func Convert_v1_Plugins_To_config_Plugins(in *configv1.Plugins, out *config.Plugins, s conversion.Scope) error {
return autoConvert_v1_Plugins_To_config_Plugins(in, out, s)
}
func autoConvert_config_Plugins_To_v1_Plugins(in *config.Plugins, out *configv1.Plugins, s conversion.Scope) error {
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PreEnqueue, &out.PreEnqueue, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.QueueSort, &out.QueueSort, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PreFilter, &out.PreFilter, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.Filter, &out.Filter, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PostFilter, &out.PostFilter, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PreScore, &out.PreScore, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.Score, &out.Score, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.Reserve, &out.Reserve, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.Permit, &out.Permit, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PreBind, &out.PreBind, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.Bind, &out.Bind, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.PostBind, &out.PostBind, s); err != nil {
return err
}
if err := Convert_config_PluginSet_To_v1_PluginSet(&in.MultiPoint, &out.MultiPoint, s); err != nil {
return err
}
return nil
}
// Convert_config_Plugins_To_v1_Plugins is an autogenerated conversion function.
func Convert_config_Plugins_To_v1_Plugins(in *config.Plugins, out *configv1.Plugins, s conversion.Scope) error {
return autoConvert_config_Plugins_To_v1_Plugins(in, out, s)
}
func autoConvert_v1_PodTopologySpreadArgs_To_config_PodTopologySpreadArgs(in *configv1.PodTopologySpreadArgs, out *config.PodTopologySpreadArgs, s conversion.Scope) error {
out.DefaultConstraints = *(*[]corev1.TopologySpreadConstraint)(unsafe.Pointer(&in.DefaultConstraints))
out.DefaultingType = config.PodTopologySpreadConstraintsDefaulting(in.DefaultingType)
return nil
}
// Convert_v1_PodTopologySpreadArgs_To_config_PodTopologySpreadArgs is an autogenerated conversion function.
func Convert_v1_PodTopologySpreadArgs_To_config_PodTopologySpreadArgs(in *configv1.PodTopologySpreadArgs, out *config.PodTopologySpreadArgs, s conversion.Scope) error {
return autoConvert_v1_PodTopologySpreadArgs_To_config_PodTopologySpreadArgs(in, out, s)
}
func autoConvert_config_PodTopologySpreadArgs_To_v1_PodTopologySpreadArgs(in *config.PodTopologySpreadArgs, out *configv1.PodTopologySpreadArgs, s conversion.Scope) error {
out.DefaultConstraints = *(*[]corev1.TopologySpreadConstraint)(unsafe.Pointer(&in.DefaultConstraints))
out.DefaultingType = configv1.PodTopologySpreadConstraintsDefaulting(in.DefaultingType)
return nil
}
// Convert_config_PodTopologySpreadArgs_To_v1_PodTopologySpreadArgs is an autogenerated conversion function.
func Convert_config_PodTopologySpreadArgs_To_v1_PodTopologySpreadArgs(in *config.PodTopologySpreadArgs, out *configv1.PodTopologySpreadArgs, s conversion.Scope) error {
return autoConvert_config_PodTopologySpreadArgs_To_v1_PodTopologySpreadArgs(in, out, s)
}
func autoConvert_v1_RequestedToCapacityRatioParam_To_config_RequestedToCapacityRatioParam(in *configv1.RequestedToCapacityRatioParam, out *config.RequestedToCapacityRatioParam, s conversion.Scope) error {
out.Shape = *(*[]config.UtilizationShapePoint)(unsafe.Pointer(&in.Shape))
return nil
}
// Convert_v1_RequestedToCapacityRatioParam_To_config_RequestedToCapacityRatioParam is an autogenerated conversion function.
func Convert_v1_RequestedToCapacityRatioParam_To_config_RequestedToCapacityRatioParam(in *configv1.RequestedToCapacityRatioParam, out *config.RequestedToCapacityRatioParam, s conversion.Scope) error {
return autoConvert_v1_RequestedToCapacityRatioParam_To_config_RequestedToCapacityRatioParam(in, out, s)
}
func autoConvert_config_RequestedToCapacityRatioParam_To_v1_RequestedToCapacityRatioParam(in *config.RequestedToCapacityRatioParam, out *configv1.RequestedToCapacityRatioParam, s conversion.Scope) error {
out.Shape = *(*[]configv1.UtilizationShapePoint)(unsafe.Pointer(&in.Shape))
return nil
}
// Convert_config_RequestedToCapacityRatioParam_To_v1_RequestedToCapacityRatioParam is an autogenerated conversion function.
func Convert_config_RequestedToCapacityRatioParam_To_v1_RequestedToCapacityRatioParam(in *config.RequestedToCapacityRatioParam, out *configv1.RequestedToCapacityRatioParam, s conversion.Scope) error {
return autoConvert_config_RequestedToCapacityRatioParam_To_v1_RequestedToCapacityRatioParam(in, out, s)
}
func autoConvert_v1_ResourceSpec_To_config_ResourceSpec(in *configv1.ResourceSpec, out *config.ResourceSpec, s conversion.Scope) error {
out.Name = in.Name
out.Weight = in.Weight
return nil
}
// Convert_v1_ResourceSpec_To_config_ResourceSpec is an autogenerated conversion function.
func Convert_v1_ResourceSpec_To_config_ResourceSpec(in *configv1.ResourceSpec, out *config.ResourceSpec, s conversion.Scope) error {
return autoConvert_v1_ResourceSpec_To_config_ResourceSpec(in, out, s)
}
func autoConvert_config_ResourceSpec_To_v1_ResourceSpec(in *config.ResourceSpec, out *configv1.ResourceSpec, s conversion.Scope) error {
out.Name = in.Name
out.Weight = in.Weight
return nil
}
// Convert_config_ResourceSpec_To_v1_ResourceSpec is an autogenerated conversion function.
func Convert_config_ResourceSpec_To_v1_ResourceSpec(in *config.ResourceSpec, out *configv1.ResourceSpec, s conversion.Scope) error {
return autoConvert_config_ResourceSpec_To_v1_ResourceSpec(in, out, s)
}
func autoConvert_v1_ScoringStrategy_To_config_ScoringStrategy(in *configv1.ScoringStrategy, out *config.ScoringStrategy, s conversion.Scope) error {
out.Type = config.ScoringStrategyType(in.Type)
out.Resources = *(*[]config.ResourceSpec)(unsafe.Pointer(&in.Resources))
out.RequestedToCapacityRatio = (*config.RequestedToCapacityRatioParam)(unsafe.Pointer(in.RequestedToCapacityRatio))
return nil
}
// Convert_v1_ScoringStrategy_To_config_ScoringStrategy is an autogenerated conversion function.
func Convert_v1_ScoringStrategy_To_config_ScoringStrategy(in *configv1.ScoringStrategy, out *config.ScoringStrategy, s conversion.Scope) error {
return autoConvert_v1_ScoringStrategy_To_config_ScoringStrategy(in, out, s)
}
func autoConvert_config_ScoringStrategy_To_v1_ScoringStrategy(in *config.ScoringStrategy, out *configv1.ScoringStrategy, s conversion.Scope) error {
out.Type = configv1.ScoringStrategyType(in.Type)
out.Resources = *(*[]configv1.ResourceSpec)(unsafe.Pointer(&in.Resources))
out.RequestedToCapacityRatio = (*configv1.RequestedToCapacityRatioParam)(unsafe.Pointer(in.RequestedToCapacityRatio))
return nil
}
// Convert_config_ScoringStrategy_To_v1_ScoringStrategy is an autogenerated conversion function.
func Convert_config_ScoringStrategy_To_v1_ScoringStrategy(in *config.ScoringStrategy, out *configv1.ScoringStrategy, s conversion.Scope) error {
return autoConvert_config_ScoringStrategy_To_v1_ScoringStrategy(in, out, s)
}
func autoConvert_v1_UtilizationShapePoint_To_config_UtilizationShapePoint(in *configv1.UtilizationShapePoint, out *config.UtilizationShapePoint, s conversion.Scope) error {
out.Utilization = in.Utilization
out.Score = in.Score
return nil
}
// Convert_v1_UtilizationShapePoint_To_config_UtilizationShapePoint is an autogenerated conversion function.
func Convert_v1_UtilizationShapePoint_To_config_UtilizationShapePoint(in *configv1.UtilizationShapePoint, out *config.UtilizationShapePoint, s conversion.Scope) error {
return autoConvert_v1_UtilizationShapePoint_To_config_UtilizationShapePoint(in, out, s)
}
func autoConvert_config_UtilizationShapePoint_To_v1_UtilizationShapePoint(in *config.UtilizationShapePoint, out *configv1.UtilizationShapePoint, s conversion.Scope) error {
out.Utilization = in.Utilization
out.Score = in.Score
return nil
}
// Convert_config_UtilizationShapePoint_To_v1_UtilizationShapePoint is an autogenerated conversion function.
func Convert_config_UtilizationShapePoint_To_v1_UtilizationShapePoint(in *config.UtilizationShapePoint, out *configv1.UtilizationShapePoint, s conversion.Scope) error {
return autoConvert_config_UtilizationShapePoint_To_v1_UtilizationShapePoint(in, out, s)
}
func autoConvert_v1_VolumeBindingArgs_To_config_VolumeBindingArgs(in *configv1.VolumeBindingArgs, out *config.VolumeBindingArgs, s conversion.Scope) error {
if err := metav1.Convert_Pointer_int64_To_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil {
return err
}
out.Shape = *(*[]config.UtilizationShapePoint)(unsafe.Pointer(&in.Shape))
return nil
}
// Convert_v1_VolumeBindingArgs_To_config_VolumeBindingArgs is an autogenerated conversion function.
func Convert_v1_VolumeBindingArgs_To_config_VolumeBindingArgs(in *configv1.VolumeBindingArgs, out *config.VolumeBindingArgs, s conversion.Scope) error {
return autoConvert_v1_VolumeBindingArgs_To_config_VolumeBindingArgs(in, out, s)
}
func autoConvert_config_VolumeBindingArgs_To_v1_VolumeBindingArgs(in *config.VolumeBindingArgs, out *configv1.VolumeBindingArgs, s conversion.Scope) error {
if err := metav1.Convert_int64_To_Pointer_int64(&in.BindTimeoutSeconds, &out.BindTimeoutSeconds, s); err != nil {
return err
}
out.Shape = *(*[]configv1.UtilizationShapePoint)(unsafe.Pointer(&in.Shape))
return nil
}
// Convert_config_VolumeBindingArgs_To_v1_VolumeBindingArgs is an autogenerated conversion function.
func Convert_config_VolumeBindingArgs_To_v1_VolumeBindingArgs(in *config.VolumeBindingArgs, out *configv1.VolumeBindingArgs, s conversion.Scope) error {
return autoConvert_config_VolumeBindingArgs_To_v1_VolumeBindingArgs(in, out, s)
}
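
The registration calls at the top of this file wire each generated Convert_* function into a runtime.Scheme, so callers normally go through the scheme rather than invoking the functions directly. Below is a minimal sketch of that usage; it is not part of the vendored file, the import aliases are assumptions, and Scheme.Convert is expected to dispatch to the matching generated function for the registered type pair.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/runtime"
    configv1 "k8s.io/kube-scheduler/config/v1"
    "k8s.io/kubernetes/pkg/scheduler/apis/config"
    schedconfigv1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"
)

func main() {
    scheme := runtime.NewScheme()
    // RegisterConversions is the generated function whose registrations appear above.
    if err := schedconfigv1.RegisterConversions(scheme); err != nil {
        panic(err)
    }

    in := &configv1.Plugin{Name: "NodeResourcesFit"}
    out := &config.Plugin{}
    // Dispatches to Convert_v1_Plugin_To_config_Plugin; a nil Weight pointer becomes 0.
    if err := scheme.Convert(in, out, nil); err != nil {
        panic(err)
    }
    fmt.Println(out.Name)
}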

View File

@ -0,0 +1,22 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package v1

View File

@ -0,0 +1,73 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by defaulter-gen. DO NOT EDIT.
package v1
import (
runtime "k8s.io/apimachinery/pkg/runtime"
configv1 "k8s.io/kube-scheduler/config/v1"
)
// RegisterDefaults adds defaulters functions to the given scheme.
// Public to allow building arbitrary schemes.
// All generated defaulters are covering - they call all nested defaulters.
func RegisterDefaults(scheme *runtime.Scheme) error {
scheme.AddTypeDefaultingFunc(&configv1.DefaultPreemptionArgs{}, func(obj interface{}) { SetObjectDefaults_DefaultPreemptionArgs(obj.(*configv1.DefaultPreemptionArgs)) })
scheme.AddTypeDefaultingFunc(&configv1.InterPodAffinityArgs{}, func(obj interface{}) { SetObjectDefaults_InterPodAffinityArgs(obj.(*configv1.InterPodAffinityArgs)) })
scheme.AddTypeDefaultingFunc(&configv1.KubeSchedulerConfiguration{}, func(obj interface{}) {
SetObjectDefaults_KubeSchedulerConfiguration(obj.(*configv1.KubeSchedulerConfiguration))
})
scheme.AddTypeDefaultingFunc(&configv1.NodeResourcesBalancedAllocationArgs{}, func(obj interface{}) {
SetObjectDefaults_NodeResourcesBalancedAllocationArgs(obj.(*configv1.NodeResourcesBalancedAllocationArgs))
})
scheme.AddTypeDefaultingFunc(&configv1.NodeResourcesFitArgs{}, func(obj interface{}) { SetObjectDefaults_NodeResourcesFitArgs(obj.(*configv1.NodeResourcesFitArgs)) })
scheme.AddTypeDefaultingFunc(&configv1.PodTopologySpreadArgs{}, func(obj interface{}) { SetObjectDefaults_PodTopologySpreadArgs(obj.(*configv1.PodTopologySpreadArgs)) })
scheme.AddTypeDefaultingFunc(&configv1.VolumeBindingArgs{}, func(obj interface{}) { SetObjectDefaults_VolumeBindingArgs(obj.(*configv1.VolumeBindingArgs)) })
return nil
}
func SetObjectDefaults_DefaultPreemptionArgs(in *configv1.DefaultPreemptionArgs) {
SetDefaults_DefaultPreemptionArgs(in)
}
func SetObjectDefaults_InterPodAffinityArgs(in *configv1.InterPodAffinityArgs) {
SetDefaults_InterPodAffinityArgs(in)
}
func SetObjectDefaults_KubeSchedulerConfiguration(in *configv1.KubeSchedulerConfiguration) {
SetDefaults_KubeSchedulerConfiguration(in)
}
func SetObjectDefaults_NodeResourcesBalancedAllocationArgs(in *configv1.NodeResourcesBalancedAllocationArgs) {
SetDefaults_NodeResourcesBalancedAllocationArgs(in)
}
func SetObjectDefaults_NodeResourcesFitArgs(in *configv1.NodeResourcesFitArgs) {
SetDefaults_NodeResourcesFitArgs(in)
}
func SetObjectDefaults_PodTopologySpreadArgs(in *configv1.PodTopologySpreadArgs) {
SetDefaults_PodTopologySpreadArgs(in)
}
func SetObjectDefaults_VolumeBindingArgs(in *configv1.VolumeBindingArgs) {
SetDefaults_VolumeBindingArgs(in)
}
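
As a companion to RegisterDefaults above, here is a minimal sketch (not part of the vendored file; aliases and the printed check are illustrative) of how the generated defaulters are typically applied: register them on a scheme, then call Scheme.Default on a versioned object so the matching SetObjectDefaults_* function runs.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/runtime"
    configv1 "k8s.io/kube-scheduler/config/v1"
    schedconfigv1 "k8s.io/kubernetes/pkg/scheduler/apis/config/v1"
)

func main() {
    scheme := runtime.NewScheme()
    if err := schedconfigv1.RegisterDefaults(scheme); err != nil {
        panic(err)
    }

    args := &configv1.DefaultPreemptionArgs{}
    // Dispatches to SetObjectDefaults_DefaultPreemptionArgs registered above.
    scheme.Default(args)
    // The previously nil pointer fields are expected to be filled in by the defaulter.
    fmt.Println(args.MinCandidateNodesPercentage != nil)
}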

View File

@ -0,0 +1,296 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validation
import (
"fmt"
"reflect"
v1 "k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/runtime"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/validation"
"k8s.io/apimachinery/pkg/util/validation/field"
componentbasevalidation "k8s.io/component-base/config/validation"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
)
// ValidateKubeSchedulerConfiguration ensures validation of the KubeSchedulerConfiguration struct
func ValidateKubeSchedulerConfiguration(cc *config.KubeSchedulerConfiguration) utilerrors.Aggregate {
var errs []error
errs = append(errs, componentbasevalidation.ValidateClientConnectionConfiguration(&cc.ClientConnection, field.NewPath("clientConnection")).ToAggregate())
errs = append(errs, componentbasevalidation.ValidateLeaderElectionConfiguration(&cc.LeaderElection, field.NewPath("leaderElection")).ToAggregate())
// TODO: This can be removed when ResourceLock is not available
// Only ResourceLock values with leases are allowed
if cc.LeaderElection.LeaderElect && cc.LeaderElection.ResourceLock != "leases" {
leaderElectionPath := field.NewPath("leaderElection")
errs = append(errs, field.Invalid(leaderElectionPath.Child("resourceLock"), cc.LeaderElection.ResourceLock, `resourceLock value must be "leases"`))
}
profilesPath := field.NewPath("profiles")
if cc.Parallelism <= 0 {
errs = append(errs, field.Invalid(field.NewPath("parallelism"), cc.Parallelism, "should be an integer value greater than zero"))
}
if len(cc.Profiles) == 0 {
errs = append(errs, field.Required(profilesPath, ""))
} else {
existingProfiles := make(map[string]int, len(cc.Profiles))
for i := range cc.Profiles {
profile := &cc.Profiles[i]
path := profilesPath.Index(i)
errs = append(errs, validateKubeSchedulerProfile(path, cc.APIVersion, profile)...)
if idx, ok := existingProfiles[profile.SchedulerName]; ok {
errs = append(errs, field.Duplicate(path.Child("schedulerName"), profilesPath.Index(idx).Child("schedulerName")))
}
existingProfiles[profile.SchedulerName] = i
}
errs = append(errs, validateCommonQueueSort(profilesPath, cc.Profiles)...)
}
errs = append(errs, validatePercentageOfNodesToScore(field.NewPath("percentageOfNodesToScore"), cc.PercentageOfNodesToScore))
if cc.PodInitialBackoffSeconds <= 0 {
errs = append(errs, field.Invalid(field.NewPath("podInitialBackoffSeconds"),
cc.PodInitialBackoffSeconds, "must be greater than 0"))
}
if cc.PodMaxBackoffSeconds < cc.PodInitialBackoffSeconds {
errs = append(errs, field.Invalid(field.NewPath("podMaxBackoffSeconds"),
cc.PodMaxBackoffSeconds, "must be greater than or equal to PodInitialBackoffSeconds"))
}
errs = append(errs, validateExtenders(field.NewPath("extenders"), cc.Extenders)...)
return utilerrors.Flatten(utilerrors.NewAggregate(errs))
}
func validatePercentageOfNodesToScore(path *field.Path, percentageOfNodesToScore *int32) error {
if percentageOfNodesToScore != nil {
if *percentageOfNodesToScore < 0 || *percentageOfNodesToScore > 100 {
return field.Invalid(path, *percentageOfNodesToScore, "not in valid range [0-100]")
}
}
return nil
}
type invalidPlugins struct {
schemeGroupVersion string
plugins []string
}
// invalidPluginsByVersion maintains a list of removed/deprecated plugins in each version.
// Remember to add an entry to that list when creating a new component config
// version (even if the list of invalid plugins is empty).
var invalidPluginsByVersion = []invalidPlugins{
{
schemeGroupVersion: v1.SchemeGroupVersion.String(),
plugins: []string{
"AzureDiskLimits",
"CinderLimits",
"EBSLimits",
"GCEPDLimits",
},
},
}
// isPluginInvalid checks if a given plugin was removed/deprecated in the given component
// config version or earlier.
func isPluginInvalid(apiVersion string, name string) (bool, string) {
for _, dp := range invalidPluginsByVersion {
for _, plugin := range dp.plugins {
if name == plugin {
return true, dp.schemeGroupVersion
}
}
if apiVersion == dp.schemeGroupVersion {
break
}
}
return false, ""
}
func validatePluginSetForInvalidPlugins(path *field.Path, apiVersion string, ps config.PluginSet) []error {
var errs []error
for i, plugin := range ps.Enabled {
if invalid, invalidVersion := isPluginInvalid(apiVersion, plugin.Name); invalid {
errs = append(errs, field.Invalid(path.Child("enabled").Index(i), plugin.Name, fmt.Sprintf("was invalid in version %q (KubeSchedulerConfiguration is version %q)", invalidVersion, apiVersion)))
}
}
return errs
}
func validateKubeSchedulerProfile(path *field.Path, apiVersion string, profile *config.KubeSchedulerProfile) []error {
var errs []error
if len(profile.SchedulerName) == 0 {
errs = append(errs, field.Required(path.Child("schedulerName"), ""))
}
errs = append(errs, validatePercentageOfNodesToScore(path.Child("percentageOfNodesToScore"), profile.PercentageOfNodesToScore))
errs = append(errs, validatePluginConfig(path, apiVersion, profile)...)
return errs
}
func validatePluginConfig(path *field.Path, apiVersion string, profile *config.KubeSchedulerProfile) []error {
var errs []error
m := map[string]interface{}{
"DefaultPreemption": ValidateDefaultPreemptionArgs,
"InterPodAffinity": ValidateInterPodAffinityArgs,
"NodeAffinity": ValidateNodeAffinityArgs,
"NodeResourcesBalancedAllocation": ValidateNodeResourcesBalancedAllocationArgs,
"NodeResourcesFitArgs": ValidateNodeResourcesFitArgs,
"PodTopologySpread": ValidatePodTopologySpreadArgs,
"VolumeBinding": ValidateVolumeBindingArgs,
}
if profile.Plugins != nil {
stagesToPluginSet := map[string]config.PluginSet{
"preEnqueue": profile.Plugins.PreEnqueue,
"queueSort": profile.Plugins.QueueSort,
"preFilter": profile.Plugins.PreFilter,
"filter": profile.Plugins.Filter,
"postFilter": profile.Plugins.PostFilter,
"preScore": profile.Plugins.PreScore,
"score": profile.Plugins.Score,
"reserve": profile.Plugins.Reserve,
"permit": profile.Plugins.Permit,
"preBind": profile.Plugins.PreBind,
"bind": profile.Plugins.Bind,
"postBind": profile.Plugins.PostBind,
}
pluginsPath := path.Child("plugins")
for s, p := range stagesToPluginSet {
errs = append(errs, validatePluginSetForInvalidPlugins(
pluginsPath.Child(s), apiVersion, p)...)
}
}
seenPluginConfig := sets.New[string]()
for i := range profile.PluginConfig {
pluginConfigPath := path.Child("pluginConfig").Index(i)
name := profile.PluginConfig[i].Name
args := profile.PluginConfig[i].Args
if seenPluginConfig.Has(name) {
errs = append(errs, field.Duplicate(pluginConfigPath, name))
} else {
seenPluginConfig.Insert(name)
}
if invalid, invalidVersion := isPluginInvalid(apiVersion, name); invalid {
errs = append(errs, field.Invalid(pluginConfigPath, name, fmt.Sprintf("was invalid in version %q (KubeSchedulerConfiguration is version %q)", invalidVersion, apiVersion)))
} else if validateFunc, ok := m[name]; ok {
// type mismatch, no need to validate the `args`.
if reflect.TypeOf(args) != reflect.ValueOf(validateFunc).Type().In(1) {
errs = append(errs, field.Invalid(pluginConfigPath.Child("args"), args, "has to match plugin args"))
} else {
in := []reflect.Value{reflect.ValueOf(pluginConfigPath.Child("args")), reflect.ValueOf(args)}
res := reflect.ValueOf(validateFunc).Call(in)
// It's possible that the validation function returns an Aggregate; just append it here and it will be flattened at the end of CC validation.
if res[0].Interface() != nil {
errs = append(errs, res[0].Interface().(error))
}
}
}
}
return errs
}
func validateCommonQueueSort(path *field.Path, profiles []config.KubeSchedulerProfile) []error {
var errs []error
var canon config.PluginSet
var queueSortName string
var queueSortArgs runtime.Object
if profiles[0].Plugins != nil {
canon = profiles[0].Plugins.QueueSort
if len(profiles[0].Plugins.QueueSort.Enabled) != 0 {
queueSortName = profiles[0].Plugins.QueueSort.Enabled[0].Name
}
length := len(profiles[0].Plugins.QueueSort.Enabled)
if length > 1 {
errs = append(errs, field.Invalid(path.Index(0).Child("plugins", "queueSort", "Enabled"), length, "only one queue sort plugin can be enabled"))
}
}
for _, cfg := range profiles[0].PluginConfig {
if len(queueSortName) > 0 && cfg.Name == queueSortName {
queueSortArgs = cfg.Args
}
}
for i := 1; i < len(profiles); i++ {
var curr config.PluginSet
if profiles[i].Plugins != nil {
curr = profiles[i].Plugins.QueueSort
}
if !apiequality.Semantic.DeepEqual(canon, curr) {
errs = append(errs, field.Invalid(path.Index(i).Child("plugins", "queueSort"), curr, "queueSort must be the same for all profiles"))
}
for _, cfg := range profiles[i].PluginConfig {
if cfg.Name == queueSortName && !apiequality.Semantic.DeepEqual(queueSortArgs, cfg.Args) {
errs = append(errs, field.Invalid(path.Index(i).Child("pluginConfig", "args"), cfg.Args, "queueSort must be the same for all profiles"))
}
}
}
return errs
}
// validateExtenders validates the configured extenders for the Scheduler
func validateExtenders(fldPath *field.Path, extenders []config.Extender) []error {
var errs []error
binders := 0
extenderManagedResources := sets.New[string]()
for i, extender := range extenders {
path := fldPath.Index(i)
if len(extender.PrioritizeVerb) > 0 && extender.Weight <= 0 {
errs = append(errs, field.Invalid(path.Child("weight"),
extender.Weight, "must have a positive weight applied to it"))
}
if extender.BindVerb != "" {
binders++
}
for j, resource := range extender.ManagedResources {
managedResourcesPath := path.Child("managedResources").Index(j)
validationErrors := validateExtendedResourceName(managedResourcesPath.Child("name"), v1.ResourceName(resource.Name))
errs = append(errs, validationErrors...)
if extenderManagedResources.Has(resource.Name) {
errs = append(errs, field.Invalid(managedResourcesPath.Child("name"),
resource.Name, "duplicate extender managed resource name"))
}
extenderManagedResources.Insert(resource.Name)
}
}
if binders > 1 {
errs = append(errs, field.Invalid(fldPath, fmt.Sprintf("found %d extenders implementing bind", binders), "only one extender can implement bind"))
}
return errs
}
// validateExtendedResourceName checks whether the specified name is a valid
// extended resource name.
func validateExtendedResourceName(path *field.Path, name v1.ResourceName) []error {
var validationErrors []error
for _, msg := range validation.IsQualifiedName(string(name)) {
validationErrors = append(validationErrors, field.Invalid(path, name, msg))
}
if len(validationErrors) != 0 {
return validationErrors
}
if !v1helper.IsExtendedResourceName(name) {
validationErrors = append(validationErrors, field.Invalid(path, string(name), "is an invalid extended resource name"))
}
return validationErrors
}
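
To show how the validation entry point above fits together, a minimal usage sketch follows; it is not part of the vendored file, and the field values are illustrative assumptions rather than recommended settings.

package main

import (
    "fmt"

    "k8s.io/kubernetes/pkg/scheduler/apis/config"
    "k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
)

func main() {
    cc := &config.KubeSchedulerConfiguration{
        Parallelism:              16,
        PodInitialBackoffSeconds: 1,
        PodMaxBackoffSeconds:     10,
        Profiles: []config.KubeSchedulerProfile{
            {SchedulerName: "default-scheduler"},
        },
    }
    // With LeaderElect left false, the leases-only resourceLock check is skipped.
    // Returns an aggregated field error list; nil means the configuration passed.
    if errs := validation.ValidateKubeSchedulerConfiguration(cc); errs != nil {
        fmt.Println(errs)
    }
}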

View File

@ -0,0 +1,329 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package validation
import (
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
metav1validation "k8s.io/apimachinery/pkg/apis/meta/v1/validation"
"k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/validation/field"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
)
// supportedScoringStrategyTypes has to be a set of strings for use with field.Unsupported
var supportedScoringStrategyTypes = sets.New(
string(config.LeastAllocated),
string(config.MostAllocated),
string(config.RequestedToCapacityRatio),
)
// ValidateDefaultPreemptionArgs validates that DefaultPreemptionArgs are correct.
func ValidateDefaultPreemptionArgs(path *field.Path, args *config.DefaultPreemptionArgs) error {
var allErrs field.ErrorList
percentagePath := path.Child("minCandidateNodesPercentage")
absolutePath := path.Child("minCandidateNodesAbsolute")
if err := validateMinCandidateNodesPercentage(args.MinCandidateNodesPercentage, percentagePath); err != nil {
allErrs = append(allErrs, err)
}
if err := validateMinCandidateNodesAbsolute(args.MinCandidateNodesAbsolute, absolutePath); err != nil {
allErrs = append(allErrs, err)
}
if args.MinCandidateNodesPercentage == 0 && args.MinCandidateNodesAbsolute == 0 {
allErrs = append(allErrs,
field.Invalid(percentagePath, args.MinCandidateNodesPercentage, "cannot be zero at the same time as minCandidateNodesAbsolute"),
field.Invalid(absolutePath, args.MinCandidateNodesAbsolute, "cannot be zero at the same time as minCandidateNodesPercentage"))
}
return allErrs.ToAggregate()
}
// validateMinCandidateNodesPercentage validates that
// minCandidateNodesPercentage is within the allowed range.
func validateMinCandidateNodesPercentage(minCandidateNodesPercentage int32, p *field.Path) *field.Error {
if minCandidateNodesPercentage < 0 || minCandidateNodesPercentage > 100 {
return field.Invalid(p, minCandidateNodesPercentage, "not in valid range [0, 100]")
}
return nil
}
// validateMinCandidateNodesAbsolute validates that minCandidateNodesAbsolute
// is within the allowed range.
func validateMinCandidateNodesAbsolute(minCandidateNodesAbsolute int32, p *field.Path) *field.Error {
if minCandidateNodesAbsolute < 0 {
return field.Invalid(p, minCandidateNodesAbsolute, "not in valid range [0, inf)")
}
return nil
}
// ValidateInterPodAffinityArgs validates that InterPodAffinityArgs are correct.
func ValidateInterPodAffinityArgs(path *field.Path, args *config.InterPodAffinityArgs) error {
return validateHardPodAffinityWeight(path.Child("hardPodAffinityWeight"), args.HardPodAffinityWeight)
}
// validateHardPodAffinityWeight validates that weight is within allowed range.
func validateHardPodAffinityWeight(path *field.Path, w int32) error {
const (
minHardPodAffinityWeight = 0
maxHardPodAffinityWeight = 100
)
if w < minHardPodAffinityWeight || w > maxHardPodAffinityWeight {
msg := fmt.Sprintf("not in valid range [%d, %d]", minHardPodAffinityWeight, maxHardPodAffinityWeight)
return field.Invalid(path, w, msg)
}
return nil
}
// ValidatePodTopologySpreadArgs validates that PodTopologySpreadArgs are correct.
// It replicates the validation from pkg/apis/core/validation.validateTopologySpreadConstraints
// with an additional check for .labelSelector to be nil.
func ValidatePodTopologySpreadArgs(path *field.Path, args *config.PodTopologySpreadArgs) error {
var allErrs field.ErrorList
if err := validateDefaultingType(path.Child("defaultingType"), args.DefaultingType, args.DefaultConstraints); err != nil {
allErrs = append(allErrs, err)
}
defaultConstraintsPath := path.Child("defaultConstraints")
for i, c := range args.DefaultConstraints {
p := defaultConstraintsPath.Index(i)
if c.MaxSkew <= 0 {
f := p.Child("maxSkew")
allErrs = append(allErrs, field.Invalid(f, c.MaxSkew, "not in valid range (0, inf)"))
}
allErrs = append(allErrs, validateTopologyKey(p.Child("topologyKey"), c.TopologyKey)...)
if err := validateWhenUnsatisfiable(p.Child("whenUnsatisfiable"), c.WhenUnsatisfiable); err != nil {
allErrs = append(allErrs, err)
}
if c.LabelSelector != nil {
f := field.Forbidden(p.Child("labelSelector"), "constraint must not define a selector, as it is deduced for each pod")
allErrs = append(allErrs, f)
}
if err := validateConstraintNotRepeat(defaultConstraintsPath, args.DefaultConstraints, i); err != nil {
allErrs = append(allErrs, err)
}
}
if len(allErrs) == 0 {
return nil
}
return allErrs.ToAggregate()
}
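
Editor's note (illustration, not part of the vendored file): a sketch of arguments that satisfy every rule above; the topology key and the validation import path are assumptions. Switching DefaultingType to config.SystemDefaulting while keeping DefaultConstraints non-empty would be rejected by validateDefaultingType.

package main

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/validation/field"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
)

func main() {
	args := &config.PodTopologySpreadArgs{
		DefaultingType: config.ListDefaulting,
		DefaultConstraints: []v1.TopologySpreadConstraint{{
			MaxSkew:           1,
			TopologyKey:       "topology.kubernetes.io/zone",
			WhenUnsatisfiable: v1.ScheduleAnyway,
			// LabelSelector must stay nil; it is deduced per pod.
		}},
	}
	_ = validation.ValidatePodTopologySpreadArgs(field.NewPath("podTopologySpreadArgs"), args) // nil
}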
func validateDefaultingType(p *field.Path, v config.PodTopologySpreadConstraintsDefaulting, constraints []v1.TopologySpreadConstraint) *field.Error {
if v != config.SystemDefaulting && v != config.ListDefaulting {
return field.NotSupported(p, v, []string{string(config.SystemDefaulting), string(config.ListDefaulting)})
}
if v == config.SystemDefaulting && len(constraints) > 0 {
return field.Invalid(p, v, "when .defaultConstraints are not empty")
}
return nil
}
func validateTopologyKey(p *field.Path, v string) field.ErrorList {
var allErrs field.ErrorList
if len(v) == 0 {
allErrs = append(allErrs, field.Required(p, "can not be empty"))
} else {
allErrs = append(allErrs, metav1validation.ValidateLabelName(v, p)...)
}
return allErrs
}
func validateWhenUnsatisfiable(p *field.Path, v v1.UnsatisfiableConstraintAction) *field.Error {
supportedScheduleActions := sets.New(string(v1.DoNotSchedule), string(v1.ScheduleAnyway))
if len(v) == 0 {
return field.Required(p, "can not be empty")
}
if !supportedScheduleActions.Has(string(v)) {
return field.NotSupported(p, v, sets.List(supportedScheduleActions))
}
return nil
}
func validateConstraintNotRepeat(path *field.Path, constraints []v1.TopologySpreadConstraint, idx int) *field.Error {
c := &constraints[idx]
for i := range constraints[:idx] {
other := &constraints[i]
if c.TopologyKey == other.TopologyKey && c.WhenUnsatisfiable == other.WhenUnsatisfiable {
return field.Duplicate(path.Index(idx), fmt.Sprintf("{%v, %v}", c.TopologyKey, c.WhenUnsatisfiable))
}
}
return nil
}
func validateFunctionShape(shape []config.UtilizationShapePoint, path *field.Path) field.ErrorList {
const (
minUtilization = 0
maxUtilization = 100
minScore = 0
maxScore = int32(config.MaxCustomPriorityScore)
)
var allErrs field.ErrorList
if len(shape) == 0 {
allErrs = append(allErrs, field.Required(path, "at least one point must be specified"))
return allErrs
}
for i := 1; i < len(shape); i++ {
if shape[i-1].Utilization >= shape[i].Utilization {
allErrs = append(allErrs, field.Invalid(path.Index(i).Child("utilization"), shape[i].Utilization, "utilization values must be sorted in increasing order"))
break
}
}
for i, point := range shape {
if point.Utilization < minUtilization || point.Utilization > maxUtilization {
msg := fmt.Sprintf("not in valid range [%d, %d]", minUtilization, maxUtilization)
allErrs = append(allErrs, field.Invalid(path.Index(i).Child("utilization"), point.Utilization, msg))
}
if point.Score < minScore || point.Score > maxScore {
msg := fmt.Sprintf("not in valid range [%d, %d]", minScore, maxScore)
allErrs = append(allErrs, field.Invalid(path.Index(i).Child("score"), point.Score, msg))
}
}
return allErrs
}
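
Editor's note (illustration, not part of the vendored file): validateFunctionShape is unexported, so a sketch of a shape literal that satisfies its rules — non-empty, utilization strictly increasing within [0, 100], scores within [0, MaxCustomPriorityScore]. Such a shape reaches the validator through VolumeBindingArgs.Shape or ScoringStrategy.RequestedToCapacityRatio.Shape; the chosen points are arbitrary.

package main

import "k8s.io/kubernetes/pkg/scheduler/apis/config"

func main() {
	shape := []config.UtilizationShapePoint{
		{Utilization: 0, Score: 0},
		{Utilization: 50, Score: 5},
		{Utilization: 100, Score: 10},
	}
	_ = shape
}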
func validateResources(resources []config.ResourceSpec, p *field.Path) field.ErrorList {
var allErrs field.ErrorList
for i, resource := range resources {
if resource.Weight <= 0 || resource.Weight > 100 {
msg := fmt.Sprintf("resource weight of %v not in valid range (0, 100]", resource.Name)
allErrs = append(allErrs, field.Invalid(p.Index(i).Child("weight"), resource.Weight, msg))
}
}
return allErrs
}
// ValidateNodeResourcesBalancedAllocationArgs validates that NodeResourcesBalancedAllocationArgs are set correctly.
func ValidateNodeResourcesBalancedAllocationArgs(path *field.Path, args *config.NodeResourcesBalancedAllocationArgs) error {
var allErrs field.ErrorList
seenResources := sets.New[string]()
for i, resource := range args.Resources {
if seenResources.Has(resource.Name) {
allErrs = append(allErrs, field.Duplicate(path.Child("resources").Index(i).Child("name"), resource.Name))
} else {
seenResources.Insert(resource.Name)
}
if resource.Weight != 1 {
allErrs = append(allErrs, field.Invalid(path.Child("resources").Index(i).Child("weight"), resource.Weight, "must be 1"))
}
}
return allErrs.ToAggregate()
}
// ValidateNodeAffinityArgs validates that NodeAffinityArgs are correct.
func ValidateNodeAffinityArgs(path *field.Path, args *config.NodeAffinityArgs) error {
if args.AddedAffinity == nil {
return nil
}
affinity := args.AddedAffinity
var errs []error
if ns := affinity.RequiredDuringSchedulingIgnoredDuringExecution; ns != nil {
_, err := nodeaffinity.NewNodeSelector(ns, field.WithPath(path.Child("addedAffinity", "requiredDuringSchedulingIgnoredDuringExecution")))
if err != nil {
errs = append(errs, err)
}
}
// TODO: Add validation for requiredDuringSchedulingRequiredDuringExecution when it gets added to the API.
if terms := affinity.PreferredDuringSchedulingIgnoredDuringExecution; len(terms) != 0 {
_, err := nodeaffinity.NewPreferredSchedulingTerms(terms, field.WithPath(path.Child("addedAffinity", "preferredDuringSchedulingIgnoredDuringExecution")))
if err != nil {
errs = append(errs, err)
}
}
return errors.Flatten(errors.NewAggregate(errs))
}
// VolumeBindingArgsValidationOptions contains the different settings for validation.
type VolumeBindingArgsValidationOptions struct {
AllowVolumeCapacityPriority bool
}
// ValidateVolumeBindingArgs validates that VolumeBindingArgs are set correctly.
func ValidateVolumeBindingArgs(path *field.Path, args *config.VolumeBindingArgs) error {
return ValidateVolumeBindingArgsWithOptions(path, args, VolumeBindingArgsValidationOptions{
AllowVolumeCapacityPriority: utilfeature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
})
}
// ValidateVolumeBindingArgsWithOptions validates that VolumeBindingArgs are set correctly, honoring the scheduler features captured in VolumeBindingArgsValidationOptions.
func ValidateVolumeBindingArgsWithOptions(path *field.Path, args *config.VolumeBindingArgs, opts VolumeBindingArgsValidationOptions) error {
var allErrs field.ErrorList
if args.BindTimeoutSeconds < 0 {
allErrs = append(allErrs, field.Invalid(path.Child("bindTimeoutSeconds"), args.BindTimeoutSeconds, "invalid BindTimeoutSeconds, should not be a negative value"))
}
if opts.AllowVolumeCapacityPriority {
allErrs = append(allErrs, validateFunctionShape(args.Shape, path.Child("shape"))...)
} else if args.Shape != nil {
// When the feature is off, return an error if the config is not nil.
// This prevents unexpected configuration from taking effect when the
// feature turns on in the future.
allErrs = append(allErrs, field.Invalid(path.Child("shape"), args.Shape, "unexpected field `shape`, remove it or turn on the feature gate VolumeCapacityPriority"))
}
return allErrs.ToAggregate()
}
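
Editor's note (illustration, not part of the vendored file): a sketch showing how the feature-gate option changes the outcome for the same arguments; the timeout, shape points, and validation import path are assumptions.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/validation/field"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
)

func main() {
	args := &config.VolumeBindingArgs{
		BindTimeoutSeconds: 600,
		Shape: []config.UtilizationShapePoint{
			{Utilization: 0, Score: 0},
			{Utilization: 100, Score: 10},
		},
	}
	opts := validation.VolumeBindingArgsValidationOptions{AllowVolumeCapacityPriority: true}
	fmt.Println(validation.ValidateVolumeBindingArgsWithOptions(field.NewPath("volumeBindingArgs"), args, opts)) // <nil>
	opts.AllowVolumeCapacityPriority = false
	fmt.Println(validation.ValidateVolumeBindingArgsWithOptions(field.NewPath("volumeBindingArgs"), args, opts)) // shape rejected while the gate is off
}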
func ValidateNodeResourcesFitArgs(path *field.Path, args *config.NodeResourcesFitArgs) error {
var allErrs field.ErrorList
resPath := path.Child("ignoredResources")
for i, res := range args.IgnoredResources {
path := resPath.Index(i)
if errs := metav1validation.ValidateLabelName(res, path); len(errs) != 0 {
allErrs = append(allErrs, errs...)
}
}
groupPath := path.Child("ignoredResourceGroups")
for i, group := range args.IgnoredResourceGroups {
path := groupPath.Index(i)
if strings.Contains(group, "/") {
allErrs = append(allErrs, field.Invalid(path, group, "resource group name can't contain '/'"))
}
if errs := metav1validation.ValidateLabelName(group, path); len(errs) != 0 {
allErrs = append(allErrs, errs...)
}
}
strategyPath := path.Child("scoringStrategy")
if args.ScoringStrategy != nil {
if !supportedScoringStrategyTypes.Has(string(args.ScoringStrategy.Type)) {
allErrs = append(allErrs, field.NotSupported(strategyPath.Child("type"), args.ScoringStrategy.Type, sets.List(supportedScoringStrategyTypes)))
}
allErrs = append(allErrs, validateResources(args.ScoringStrategy.Resources, strategyPath.Child("resources"))...)
if args.ScoringStrategy.RequestedToCapacityRatio != nil {
allErrs = append(allErrs, validateFunctionShape(args.ScoringStrategy.RequestedToCapacityRatio.Shape, strategyPath.Child("shape"))...)
}
}
if len(allErrs) == 0 {
return nil
}
return allErrs.ToAggregate()
}
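
Editor's note (illustration, not part of the vendored file): a sketch of NodeResourcesFitArgs that exercises the scoring-strategy checks above; the resource weights, shape points, and validation import path are assumptions.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/validation/field"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
)

func main() {
	args := &config.NodeResourcesFitArgs{
		ScoringStrategy: &config.ScoringStrategy{
			Type:      config.RequestedToCapacityRatio,
			Resources: []config.ResourceSpec{{Name: "cpu", Weight: 1}, {Name: "memory", Weight: 1}},
			RequestedToCapacityRatio: &config.RequestedToCapacityRatioParam{
				Shape: []config.UtilizationShapePoint{{Utilization: 0, Score: 10}, {Utilization: 100, Score: 0}},
			},
		},
	}
	fmt.Println(validation.ValidateNodeResourcesFitArgs(field.NewPath("nodeResourcesFitArgs"), args)) // <nil>
}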

View File

@ -0,0 +1,562 @@
//go:build !ignore_autogenerated
// +build !ignore_autogenerated
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Code generated by deepcopy-gen. DO NOT EDIT.
package config
import (
v1 "k8s.io/api/core/v1"
runtime "k8s.io/apimachinery/pkg/runtime"
)
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *DefaultPreemptionArgs) DeepCopyInto(out *DefaultPreemptionArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DefaultPreemptionArgs.
func (in *DefaultPreemptionArgs) DeepCopy() *DefaultPreemptionArgs {
if in == nil {
return nil
}
out := new(DefaultPreemptionArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *DefaultPreemptionArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Extender) DeepCopyInto(out *Extender) {
*out = *in
if in.TLSConfig != nil {
in, out := &in.TLSConfig, &out.TLSConfig
*out = new(ExtenderTLSConfig)
(*in).DeepCopyInto(*out)
}
out.HTTPTimeout = in.HTTPTimeout
if in.ManagedResources != nil {
in, out := &in.ManagedResources, &out.ManagedResources
*out = make([]ExtenderManagedResource, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Extender.
func (in *Extender) DeepCopy() *Extender {
if in == nil {
return nil
}
out := new(Extender)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ExtenderManagedResource) DeepCopyInto(out *ExtenderManagedResource) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderManagedResource.
func (in *ExtenderManagedResource) DeepCopy() *ExtenderManagedResource {
if in == nil {
return nil
}
out := new(ExtenderManagedResource)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ExtenderTLSConfig) DeepCopyInto(out *ExtenderTLSConfig) {
*out = *in
if in.CertData != nil {
in, out := &in.CertData, &out.CertData
*out = make([]byte, len(*in))
copy(*out, *in)
}
if in.KeyData != nil {
in, out := &in.KeyData, &out.KeyData
*out = make([]byte, len(*in))
copy(*out, *in)
}
if in.CAData != nil {
in, out := &in.CAData, &out.CAData
*out = make([]byte, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ExtenderTLSConfig.
func (in *ExtenderTLSConfig) DeepCopy() *ExtenderTLSConfig {
if in == nil {
return nil
}
out := new(ExtenderTLSConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *InterPodAffinityArgs) DeepCopyInto(out *InterPodAffinityArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InterPodAffinityArgs.
func (in *InterPodAffinityArgs) DeepCopy() *InterPodAffinityArgs {
if in == nil {
return nil
}
out := new(InterPodAffinityArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *InterPodAffinityArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeSchedulerConfiguration) DeepCopyInto(out *KubeSchedulerConfiguration) {
*out = *in
out.TypeMeta = in.TypeMeta
out.LeaderElection = in.LeaderElection
out.ClientConnection = in.ClientConnection
out.DebuggingConfiguration = in.DebuggingConfiguration
if in.PercentageOfNodesToScore != nil {
in, out := &in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore
*out = new(int32)
**out = **in
}
if in.Profiles != nil {
in, out := &in.Profiles, &out.Profiles
*out = make([]KubeSchedulerProfile, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.Extenders != nil {
in, out := &in.Extenders, &out.Extenders
*out = make([]Extender, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeSchedulerConfiguration.
func (in *KubeSchedulerConfiguration) DeepCopy() *KubeSchedulerConfiguration {
if in == nil {
return nil
}
out := new(KubeSchedulerConfiguration)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *KubeSchedulerConfiguration) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *KubeSchedulerProfile) DeepCopyInto(out *KubeSchedulerProfile) {
*out = *in
if in.PercentageOfNodesToScore != nil {
in, out := &in.PercentageOfNodesToScore, &out.PercentageOfNodesToScore
*out = new(int32)
**out = **in
}
if in.Plugins != nil {
in, out := &in.Plugins, &out.Plugins
*out = new(Plugins)
(*in).DeepCopyInto(*out)
}
if in.PluginConfig != nil {
in, out := &in.PluginConfig, &out.PluginConfig
*out = make([]PluginConfig, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubeSchedulerProfile.
func (in *KubeSchedulerProfile) DeepCopy() *KubeSchedulerProfile {
if in == nil {
return nil
}
out := new(KubeSchedulerProfile)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeAffinityArgs) DeepCopyInto(out *NodeAffinityArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.AddedAffinity != nil {
in, out := &in.AddedAffinity, &out.AddedAffinity
*out = new(v1.NodeAffinity)
(*in).DeepCopyInto(*out)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeAffinityArgs.
func (in *NodeAffinityArgs) DeepCopy() *NodeAffinityArgs {
if in == nil {
return nil
}
out := new(NodeAffinityArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *NodeAffinityArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeResourcesBalancedAllocationArgs) DeepCopyInto(out *NodeResourcesBalancedAllocationArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.Resources != nil {
in, out := &in.Resources, &out.Resources
*out = make([]ResourceSpec, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeResourcesBalancedAllocationArgs.
func (in *NodeResourcesBalancedAllocationArgs) DeepCopy() *NodeResourcesBalancedAllocationArgs {
if in == nil {
return nil
}
out := new(NodeResourcesBalancedAllocationArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *NodeResourcesBalancedAllocationArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *NodeResourcesFitArgs) DeepCopyInto(out *NodeResourcesFitArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.IgnoredResources != nil {
in, out := &in.IgnoredResources, &out.IgnoredResources
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.IgnoredResourceGroups != nil {
in, out := &in.IgnoredResourceGroups, &out.IgnoredResourceGroups
*out = make([]string, len(*in))
copy(*out, *in)
}
if in.ScoringStrategy != nil {
in, out := &in.ScoringStrategy, &out.ScoringStrategy
*out = new(ScoringStrategy)
(*in).DeepCopyInto(*out)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeResourcesFitArgs.
func (in *NodeResourcesFitArgs) DeepCopy() *NodeResourcesFitArgs {
if in == nil {
return nil
}
out := new(NodeResourcesFitArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *NodeResourcesFitArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Plugin) DeepCopyInto(out *Plugin) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Plugin.
func (in *Plugin) DeepCopy() *Plugin {
if in == nil {
return nil
}
out := new(Plugin)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PluginConfig) DeepCopyInto(out *PluginConfig) {
*out = *in
if in.Args != nil {
out.Args = in.Args.DeepCopyObject()
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PluginConfig.
func (in *PluginConfig) DeepCopy() *PluginConfig {
if in == nil {
return nil
}
out := new(PluginConfig)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PluginSet) DeepCopyInto(out *PluginSet) {
*out = *in
if in.Enabled != nil {
in, out := &in.Enabled, &out.Enabled
*out = make([]Plugin, len(*in))
copy(*out, *in)
}
if in.Disabled != nil {
in, out := &in.Disabled, &out.Disabled
*out = make([]Plugin, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PluginSet.
func (in *PluginSet) DeepCopy() *PluginSet {
if in == nil {
return nil
}
out := new(PluginSet)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *Plugins) DeepCopyInto(out *Plugins) {
*out = *in
in.PreEnqueue.DeepCopyInto(&out.PreEnqueue)
in.QueueSort.DeepCopyInto(&out.QueueSort)
in.PreFilter.DeepCopyInto(&out.PreFilter)
in.Filter.DeepCopyInto(&out.Filter)
in.PostFilter.DeepCopyInto(&out.PostFilter)
in.PreScore.DeepCopyInto(&out.PreScore)
in.Score.DeepCopyInto(&out.Score)
in.Reserve.DeepCopyInto(&out.Reserve)
in.Permit.DeepCopyInto(&out.Permit)
in.PreBind.DeepCopyInto(&out.PreBind)
in.Bind.DeepCopyInto(&out.Bind)
in.PostBind.DeepCopyInto(&out.PostBind)
in.MultiPoint.DeepCopyInto(&out.MultiPoint)
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Plugins.
func (in *Plugins) DeepCopy() *Plugins {
if in == nil {
return nil
}
out := new(Plugins)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *PodTopologySpreadArgs) DeepCopyInto(out *PodTopologySpreadArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.DefaultConstraints != nil {
in, out := &in.DefaultConstraints, &out.DefaultConstraints
*out = make([]v1.TopologySpreadConstraint, len(*in))
for i := range *in {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodTopologySpreadArgs.
func (in *PodTopologySpreadArgs) DeepCopy() *PodTopologySpreadArgs {
if in == nil {
return nil
}
out := new(PodTopologySpreadArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *PodTopologySpreadArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *RequestedToCapacityRatioParam) DeepCopyInto(out *RequestedToCapacityRatioParam) {
*out = *in
if in.Shape != nil {
in, out := &in.Shape, &out.Shape
*out = make([]UtilizationShapePoint, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RequestedToCapacityRatioParam.
func (in *RequestedToCapacityRatioParam) DeepCopy() *RequestedToCapacityRatioParam {
if in == nil {
return nil
}
out := new(RequestedToCapacityRatioParam)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ResourceSpec) DeepCopyInto(out *ResourceSpec) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ResourceSpec.
func (in *ResourceSpec) DeepCopy() *ResourceSpec {
if in == nil {
return nil
}
out := new(ResourceSpec)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *ScoringStrategy) DeepCopyInto(out *ScoringStrategy) {
*out = *in
if in.Resources != nil {
in, out := &in.Resources, &out.Resources
*out = make([]ResourceSpec, len(*in))
copy(*out, *in)
}
if in.RequestedToCapacityRatio != nil {
in, out := &in.RequestedToCapacityRatio, &out.RequestedToCapacityRatio
*out = new(RequestedToCapacityRatioParam)
(*in).DeepCopyInto(*out)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScoringStrategy.
func (in *ScoringStrategy) DeepCopy() *ScoringStrategy {
if in == nil {
return nil
}
out := new(ScoringStrategy)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *UtilizationShapePoint) DeepCopyInto(out *UtilizationShapePoint) {
*out = *in
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UtilizationShapePoint.
func (in *UtilizationShapePoint) DeepCopy() *UtilizationShapePoint {
if in == nil {
return nil
}
out := new(UtilizationShapePoint)
in.DeepCopyInto(out)
return out
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
func (in *VolumeBindingArgs) DeepCopyInto(out *VolumeBindingArgs) {
*out = *in
out.TypeMeta = in.TypeMeta
if in.Shape != nil {
in, out := &in.Shape, &out.Shape
*out = make([]UtilizationShapePoint, len(*in))
copy(*out, *in)
}
return
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VolumeBindingArgs.
func (in *VolumeBindingArgs) DeepCopy() *VolumeBindingArgs {
if in == nil {
return nil
}
out := new(VolumeBindingArgs)
in.DeepCopyInto(out)
return out
}
// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object.
func (in *VolumeBindingArgs) DeepCopyObject() runtime.Object {
if c := in.DeepCopy(); c != nil {
return c
}
return nil
}
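
Editor's note (illustration, not part of the vendored file): a sketch of why these generated deep copies matter — mutating the copy leaves the original untouched because reference fields such as slices are duplicated rather than aliased. The resource names are made up.

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/apis/config"
)

func main() {
	in := &config.NodeResourcesFitArgs{IgnoredResources: []string{"example.com/gpu"}}
	out := in.DeepCopy()
	out.IgnoredResources[0] = "example.com/fpga"
	fmt.Println(in.IgnoredResources[0]) // still "example.com/gpu": the slice was copied, not shared
}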

View File

@ -0,0 +1,760 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
var (
cleanAssumedPeriod = 1 * time.Second
)
// New returns a Cache implementation.
// It automatically starts a goroutine that manages expiration of assumed pods.
// "ttl" is how long an assumed pod stays in the cache after binding finishes before it expires.
// "ctx" is the context whose cancellation stops the background goroutine.
func New(ctx context.Context, ttl time.Duration) Cache {
logger := klog.FromContext(ctx)
cache := newCache(ctx, ttl, cleanAssumedPeriod)
cache.run(logger)
return cache
}
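
Editor's note (illustration, not part of the vendored file): a minimal sketch of constructing the cache. The import alias follows the one used elsewhere in this commit; the 30-second ttl is an arbitrary assumption (kube-scheduler wires its own value).

package main

import (
	"context"
	"time"

	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
)

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel() // stops the expiration goroutine started by New
	c := internalcache.New(ctx, 30*time.Second)
	_ = c
}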
// nodeInfoListItem holds a NodeInfo pointer and acts as an item in a doubly
// linked list. When a NodeInfo is updated, it goes to the head of the list.
// The items closer to the head are the most recently updated items.
type nodeInfoListItem struct {
info *framework.NodeInfo
next *nodeInfoListItem
prev *nodeInfoListItem
}
type cacheImpl struct {
stop <-chan struct{}
ttl time.Duration
period time.Duration
// This mutex guards all fields within this cache struct.
mu sync.RWMutex
// a set of assumed pod keys.
// The key could further be used to get an entry in podStates.
assumedPods sets.Set[string]
// a map from pod key to podState.
podStates map[string]*podState
nodes map[string]*nodeInfoListItem
// headNode points to the most recently updated NodeInfo in "nodes". It is the
// head of the linked list.
headNode *nodeInfoListItem
nodeTree *nodeTree
// A map from image name to its ImageStateSummary.
imageStates map[string]*framework.ImageStateSummary
}
type podState struct {
pod *v1.Pod
// Used by assumedPod to determine expiration.
// If deadline is nil, assumedPod will never expire.
deadline *time.Time
// Used to block cache from expiring assumedPod if binding still runs
bindingFinished bool
}
func newCache(ctx context.Context, ttl, period time.Duration) *cacheImpl {
logger := klog.FromContext(ctx)
return &cacheImpl{
ttl: ttl,
period: period,
stop: ctx.Done(),
nodes: make(map[string]*nodeInfoListItem),
nodeTree: newNodeTree(logger, nil),
assumedPods: sets.New[string](),
podStates: make(map[string]*podState),
imageStates: make(map[string]*framework.ImageStateSummary),
}
}
// newNodeInfoListItem initializes a new nodeInfoListItem.
func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
return &nodeInfoListItem{
info: ni,
}
}
// moveNodeInfoToHead moves a NodeInfo to the head of "cache.nodes" doubly
// linked list. The head is the most recently updated NodeInfo.
// We assume cache lock is already acquired.
func (cache *cacheImpl) moveNodeInfoToHead(logger klog.Logger, name string) {
ni, ok := cache.nodes[name]
if !ok {
logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
// if the node info list item is already at the head, we are done.
if ni == cache.headNode {
return
}
if ni.prev != nil {
ni.prev.next = ni.next
}
if ni.next != nil {
ni.next.prev = ni.prev
}
if cache.headNode != nil {
cache.headNode.prev = ni
}
ni.next = cache.headNode
ni.prev = nil
cache.headNode = ni
}
// removeNodeInfoFromList removes a NodeInfo from the "cache.nodes" doubly
// linked list.
// We assume cache lock is already acquired.
func (cache *cacheImpl) removeNodeInfoFromList(logger klog.Logger, name string) {
ni, ok := cache.nodes[name]
if !ok {
logger.Error(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
if ni.prev != nil {
ni.prev.next = ni.next
}
if ni.next != nil {
ni.next.prev = ni.prev
}
// if the removed item was at the head, we must update the head.
if ni == cache.headNode {
cache.headNode = ni.next
}
delete(cache.nodes, name)
}
// Dump produces a dump of the current scheduler cache. This is used for
// debugging purposes only and shouldn't be confused with UpdateSnapshot
// function.
// This method is expensive, and should be only used in non-critical path.
func (cache *cacheImpl) Dump() *Dump {
cache.mu.RLock()
defer cache.mu.RUnlock()
nodes := make(map[string]*framework.NodeInfo, len(cache.nodes))
for k, v := range cache.nodes {
nodes[k] = v.info.Snapshot()
}
return &Dump{
Nodes: nodes,
AssumedPods: cache.assumedPods.Union(nil),
}
}
// UpdateSnapshot takes a snapshot of cached NodeInfo map. This is called at
// beginning of every scheduling cycle.
// The snapshot only includes Nodes that are not deleted at the time this function is called.
// nodeInfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
// This function tracks generation number of NodeInfo and updates only the
// entries of an existing snapshot that have changed after the snapshot was taken.
func (cache *cacheImpl) UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error {
cache.mu.Lock()
defer cache.mu.Unlock()
// Get the last generation of the snapshot.
snapshotGeneration := nodeSnapshot.generation
// NodeInfoList and HavePodsWithAffinityNodeInfoList must be re-created if a node was added
// or removed from the cache.
updateAllLists := false
// HavePodsWithAffinityNodeInfoList must be re-created if a node changed its
// status from having pods with affinity to NOT having pods with affinity or the other
// way around.
updateNodesHavePodsWithAffinity := false
// HavePodsWithRequiredAntiAffinityNodeInfoList must be re-created if a node changed its
// status from having pods with required anti-affinity to NOT having pods with required
// anti-affinity or the other way around.
updateNodesHavePodsWithRequiredAntiAffinity := false
// usedPVCSet must be re-created whenever the head node generation is greater than
// last snapshot generation.
updateUsedPVCSet := false
// Start from the head of the NodeInfo doubly linked list and update snapshot
// of NodeInfos updated after the last snapshot.
for node := cache.headNode; node != nil; node = node.next {
if node.info.Generation <= snapshotGeneration {
// all the nodes are updated before the existing snapshot. We are done.
break
}
if np := node.info.Node(); np != nil {
existing, ok := nodeSnapshot.nodeInfoMap[np.Name]
if !ok {
updateAllLists = true
existing = &framework.NodeInfo{}
nodeSnapshot.nodeInfoMap[np.Name] = existing
}
clone := node.info.Snapshot()
// We track nodes that have pods with affinity, here we check if this node changed its
// status from having pods with affinity to NOT having pods with affinity or the other
// way around.
if (len(existing.PodsWithAffinity) > 0) != (len(clone.PodsWithAffinity) > 0) {
updateNodesHavePodsWithAffinity = true
}
if (len(existing.PodsWithRequiredAntiAffinity) > 0) != (len(clone.PodsWithRequiredAntiAffinity) > 0) {
updateNodesHavePodsWithRequiredAntiAffinity = true
}
if !updateUsedPVCSet {
if len(existing.PVCRefCounts) != len(clone.PVCRefCounts) {
updateUsedPVCSet = true
} else {
for pvcKey := range clone.PVCRefCounts {
if _, found := existing.PVCRefCounts[pvcKey]; !found {
updateUsedPVCSet = true
break
}
}
}
}
// We need to preserve the original pointer of the NodeInfo struct since it
// is used in the NodeInfoList, which we may not update.
*existing = *clone
}
}
// Update the snapshot generation with the latest NodeInfo generation.
if cache.headNode != nil {
nodeSnapshot.generation = cache.headNode.info.Generation
}
// Compare against the node count in nodeTree.
// Deleted nodes get removed from the tree, but they might remain in the nodes map
// if they still have non-deleted Pods.
if len(nodeSnapshot.nodeInfoMap) > cache.nodeTree.numNodes {
cache.removeDeletedNodesFromSnapshot(nodeSnapshot)
updateAllLists = true
}
if updateAllLists || updateNodesHavePodsWithAffinity || updateNodesHavePodsWithRequiredAntiAffinity || updateUsedPVCSet {
cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, updateAllLists)
}
if len(nodeSnapshot.nodeInfoList) != cache.nodeTree.numNodes {
errMsg := fmt.Sprintf("snapshot state is not consistent, length of NodeInfoList=%v not equal to length of nodes in tree=%v "+
", length of NodeInfoMap=%v, length of nodes in cache=%v"+
", trying to recover",
len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
logger.Error(nil, errMsg)
// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
// error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
cache.updateNodeInfoSnapshotList(logger, nodeSnapshot, true)
return errors.New(errMsg)
}
return nil
}
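
Editor's note (illustration, not part of the vendored file): a sketch of the per-cycle pattern implied above — reuse one Snapshot and let UpdateSnapshot copy only the NodeInfos whose generation advanced since the previous call. The helper name is hypothetical, and it assumes UpdateSnapshot is exposed on the package's Cache interface.

package main

import (
	"k8s.io/klog/v2"
	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
)

// refreshSnapshot is a hypothetical helper called at the start of a scheduling
// cycle; repeated calls with the same Snapshot only copy changed NodeInfos.
func refreshSnapshot(logger klog.Logger, c internalcache.Cache, snap *internalcache.Snapshot) error {
	return c.UpdateSnapshot(logger, snap)
}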
func (cache *cacheImpl) updateNodeInfoSnapshotList(logger klog.Logger, snapshot *Snapshot, updateAll bool) {
snapshot.havePodsWithAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
snapshot.usedPVCSet = sets.New[string]()
if updateAll {
// Take a snapshot of the nodes order in the tree
snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
nodesList, err := cache.nodeTree.list()
if err != nil {
logger.Error(err, "Error occurred while retrieving the list of names of the nodes from node tree")
}
for _, nodeName := range nodesList {
if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
snapshot.nodeInfoList = append(snapshot.nodeInfoList, nodeInfo)
if len(nodeInfo.PodsWithAffinity) > 0 {
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
}
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
}
for key := range nodeInfo.PVCRefCounts {
snapshot.usedPVCSet.Insert(key)
}
} else {
logger.Error(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
}
}
} else {
for _, nodeInfo := range snapshot.nodeInfoList {
if len(nodeInfo.PodsWithAffinity) > 0 {
snapshot.havePodsWithAffinityNodeInfoList = append(snapshot.havePodsWithAffinityNodeInfoList, nodeInfo)
}
if len(nodeInfo.PodsWithRequiredAntiAffinity) > 0 {
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
}
for key := range nodeInfo.PVCRefCounts {
snapshot.usedPVCSet.Insert(key)
}
}
}
}
// If certain nodes were deleted after the last snapshot was taken, we should remove them from the snapshot.
func (cache *cacheImpl) removeDeletedNodesFromSnapshot(snapshot *Snapshot) {
toDelete := len(snapshot.nodeInfoMap) - cache.nodeTree.numNodes
for name := range snapshot.nodeInfoMap {
if toDelete <= 0 {
break
}
if n, ok := cache.nodes[name]; !ok || n.info.Node() == nil {
delete(snapshot.nodeInfoMap, name)
toDelete--
}
}
}
// NodeCount returns the number of nodes in the cache.
// DO NOT use outside of tests.
func (cache *cacheImpl) NodeCount() int {
cache.mu.RLock()
defer cache.mu.RUnlock()
return len(cache.nodes)
}
// PodCount returns the number of pods in the cache (including those from deleted nodes).
// DO NOT use outside of tests.
func (cache *cacheImpl) PodCount() (int, error) {
cache.mu.RLock()
defer cache.mu.RUnlock()
// Count every pod tracked on the cached nodes, including pods on nodes
// that were already deleted but still hold non-deleted pods.
count := 0
for _, n := range cache.nodes {
count += len(n.info.Pods)
}
return count, nil
}
func (cache *cacheImpl) AssumePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
if _, ok := cache.podStates[key]; ok {
return fmt.Errorf("pod %v(%v) is in the cache, so can't be assumed", key, klog.KObj(pod))
}
return cache.addPod(logger, pod, true)
}
func (cache *cacheImpl) FinishBinding(logger klog.Logger, pod *v1.Pod) error {
return cache.finishBinding(logger, pod, time.Now())
}
// finishBinding exists to make tests deterministic by injecting now as an argument
func (cache *cacheImpl) finishBinding(logger klog.Logger, pod *v1.Pod, now time.Time) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
logger.V(5).Info("Finished binding for pod, can be expired", "podKey", key, "pod", klog.KObj(pod))
currState, ok := cache.podStates[key]
if ok && cache.assumedPods.Has(key) {
if cache.ttl == time.Duration(0) {
currState.deadline = nil
} else {
dl := now.Add(cache.ttl)
currState.deadline = &dl
}
currState.bindingFinished = true
}
return nil
}
func (cache *cacheImpl) ForgetPod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if ok && currState.pod.Spec.NodeName != pod.Spec.NodeName {
return fmt.Errorf("pod %v(%v) was assumed on %v but assigned to %v", key, klog.KObj(pod), pod.Spec.NodeName, currState.pod.Spec.NodeName)
}
// Only assumed pod can be forgotten.
if ok && cache.assumedPods.Has(key) {
return cache.removePod(logger, pod)
}
return fmt.Errorf("pod %v(%v) wasn't assumed so cannot be forgotten", key, klog.KObj(pod))
}
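
Editor's note (illustration, not part of the vendored file): a sketch of the assume lifecycle these three methods implement — AssumePod charges the pod to its node optimistically, FinishBinding starts the ttl clock, and ForgetPod rolls the assumption back when binding fails. The function, pod fields, and the Cache interface methods used here are assumptions for illustration.

package main

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/klog/v2"
	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
)

// assumeThenBind is a hypothetical walk-through of the assume/bind/forget flow.
func assumeThenBind(logger klog.Logger, c internalcache.Cache, bind func(*v1.Pod) error) error {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "p", Namespace: "default", UID: "uid-1"},
		Spec:       v1.PodSpec{NodeName: "node-1"},
	}
	if err := c.AssumePod(logger, pod); err != nil {
		return err
	}
	if bindErr := bind(pod); bindErr != nil {
		_ = c.ForgetPod(logger, pod) // roll back the optimistic assumption
		return bindErr
	}
	return c.FinishBinding(logger, pod) // the assumed entry now expires after ttl
}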
// Assumes that lock is already acquired.
func (cache *cacheImpl) addPod(logger klog.Logger, pod *v1.Pod, assumePod bool) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
n, ok := cache.nodes[pod.Spec.NodeName]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[pod.Spec.NodeName] = n
}
n.info.AddPod(pod)
cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
ps := &podState{
pod: pod,
}
cache.podStates[key] = ps
if assumePod {
cache.assumedPods.Insert(key)
}
return nil
}
// Assumes that lock is already acquired.
func (cache *cacheImpl) updatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
if err := cache.removePod(logger, oldPod); err != nil {
return err
}
return cache.addPod(logger, newPod, false)
}
// Assumes that lock is already acquired.
// Removes a pod from the cached node info. If the node information was already
// removed and there are no more pods left in the node, cleans up the node from
// the cache.
func (cache *cacheImpl) removePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
n, ok := cache.nodes[pod.Spec.NodeName]
if !ok {
logger.Error(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "podKey", key, "pod", klog.KObj(pod))
} else {
if err := n.info.RemovePod(logger, pod); err != nil {
return err
}
if len(n.info.Pods) == 0 && n.info.Node() == nil {
cache.removeNodeInfoFromList(logger, pod.Spec.NodeName)
} else {
cache.moveNodeInfoToHead(logger, pod.Spec.NodeName)
}
}
delete(cache.podStates, key)
delete(cache.assumedPods, key)
return nil
}
func (cache *cacheImpl) AddPod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
switch {
case ok && cache.assumedPods.Has(key):
// When assuming, we've already added the Pod to cache,
// Just update here to make sure the Pod's status is up-to-date.
if err = cache.updatePod(logger, currState.pod, pod); err != nil {
logger.Error(err, "Error occurred while updating pod")
}
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
// The pod was added to a different node than it was assumed to.
logger.Info("Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
return nil
}
case !ok:
// Pod was expired. We should add it back.
if err = cache.addPod(logger, pod, false); err != nil {
logger.Error(err, "Error occurred while adding pod")
}
default:
return fmt.Errorf("pod %v(%v) was already in added state", key, klog.KObj(pod))
}
return nil
}
func (cache *cacheImpl) UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error {
key, err := framework.GetPodKey(oldPod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if !ok {
return fmt.Errorf("pod %v(%v) is not added to scheduler cache, so cannot be updated", key, klog.KObj(oldPod))
}
// An assumed pod won't have Update/Remove event. It needs to have Add event
// before Update event, in which case the state would change from Assumed to Added.
if cache.assumedPods.Has(key) {
return fmt.Errorf("assumed pod %v(%v) should not be updated", key, klog.KObj(oldPod))
}
if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
logger.Error(nil, "Pod updated on a different node than previously added to", "podKey", key, "pod", klog.KObj(oldPod))
logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
return cache.updatePod(logger, oldPod, newPod)
}
func (cache *cacheImpl) RemovePod(logger klog.Logger, pod *v1.Pod) error {
key, err := framework.GetPodKey(pod)
if err != nil {
return err
}
cache.mu.Lock()
defer cache.mu.Unlock()
currState, ok := cache.podStates[key]
if !ok {
return fmt.Errorf("pod %v(%v) is not found in scheduler cache, so cannot be removed from it", key, klog.KObj(pod))
}
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
logger.Error(nil, "Pod was added to a different node than it was assumed", "podKey", key, "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
if pod.Spec.NodeName != "" {
// An empty NodeName is possible when the scheduler misses a Delete
// event and it gets the last known state from the informer cache.
logger.Error(nil, "scheduler cache is corrupted and can badly affect scheduling decisions")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
}
return cache.removePod(logger, currState.pod)
}
func (cache *cacheImpl) IsAssumedPod(pod *v1.Pod) (bool, error) {
key, err := framework.GetPodKey(pod)
if err != nil {
return false, err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
return cache.assumedPods.Has(key), nil
}
// GetPod might return a pod for which its node has already been deleted from
// the main cache. This is useful to properly process pod update events.
func (cache *cacheImpl) GetPod(pod *v1.Pod) (*v1.Pod, error) {
key, err := framework.GetPodKey(pod)
if err != nil {
return nil, err
}
cache.mu.RLock()
defer cache.mu.RUnlock()
podState, ok := cache.podStates[key]
if !ok {
return nil, fmt.Errorf("pod %v(%v) does not exist in scheduler cache", key, klog.KObj(pod))
}
return podState.pod, nil
}
func (cache *cacheImpl) AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[node.Name]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[node.Name] = n
} else {
cache.removeNodeImageStates(n.info.Node())
}
cache.moveNodeInfoToHead(logger, node.Name)
cache.nodeTree.addNode(logger, node)
cache.addNodeImageStates(node, n.info)
n.info.SetNode(node)
return n.info.Snapshot()
}
func (cache *cacheImpl) UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[newNode.Name]
if !ok {
n = newNodeInfoListItem(framework.NewNodeInfo())
cache.nodes[newNode.Name] = n
cache.nodeTree.addNode(logger, newNode)
} else {
cache.removeNodeImageStates(n.info.Node())
}
cache.moveNodeInfoToHead(logger, newNode.Name)
cache.nodeTree.updateNode(logger, oldNode, newNode)
cache.addNodeImageStates(newNode, n.info)
n.info.SetNode(newNode)
return n.info.Snapshot()
}
// RemoveNode removes a node from the cache's tree.
// The node might still have pods because their deletion events didn't arrive
// yet. Those pods are considered removed from the cache, as the node tree is
// the source of truth.
// However, we keep a ghost node with the list of pods until all pod deletion
// events have arrived. A ghost node is skipped from snapshots.
func (cache *cacheImpl) RemoveNode(logger klog.Logger, node *v1.Node) error {
cache.mu.Lock()
defer cache.mu.Unlock()
n, ok := cache.nodes[node.Name]
if !ok {
return fmt.Errorf("node %v is not found", node.Name)
}
n.info.RemoveNode()
// We remove NodeInfo for this node only if there aren't any pods on this node.
// We can't do it unconditionally, because notifications about pods are delivered
// in a different watch, and thus can potentially be observed later, even though
// they happened before node removal.
if len(n.info.Pods) == 0 {
cache.removeNodeInfoFromList(logger, node.Name)
} else {
cache.moveNodeInfoToHead(logger, node.Name)
}
if err := cache.nodeTree.removeNode(logger, node); err != nil {
return err
}
cache.removeNodeImageStates(node)
return nil
}
// addNodeImageStates adds the states of the images on the given node to the given nodeInfo and updates the
// imageStates in the scheduler cache. This function assumes the lock on the scheduler cache has been acquired.
func (cache *cacheImpl) addNodeImageStates(node *v1.Node, nodeInfo *framework.NodeInfo) {
newSum := make(map[string]*framework.ImageStateSummary)
for _, image := range node.Status.Images {
for _, name := range image.Names {
// update the entry in imageStates
state, ok := cache.imageStates[name]
if !ok {
state = &framework.ImageStateSummary{
Size: image.SizeBytes,
Nodes: sets.New(node.Name),
}
cache.imageStates[name] = state
} else {
state.Nodes.Insert(node.Name)
}
// create the ImageStateSummary for this image
if _, ok := newSum[name]; !ok {
newSum[name] = state
}
}
}
nodeInfo.ImageStates = newSum
}
// removeNodeImageStates removes the given node from the image entries in the imageStates cache
// that reference it. After the removal, if any image becomes free, i.e., the image
// is no longer available on any node, the image entry will be removed from imageStates.
func (cache *cacheImpl) removeNodeImageStates(node *v1.Node) {
if node == nil {
return
}
for _, image := range node.Status.Images {
for _, name := range image.Names {
state, ok := cache.imageStates[name]
if ok {
state.Nodes.Delete(node.Name)
if state.Nodes.Len() == 0 {
// Remove the unused image to make sure the length of
// imageStates represents the total number of different
// images on all nodes
delete(cache.imageStates, name)
}
}
}
}
}
func (cache *cacheImpl) run(logger klog.Logger) {
go wait.Until(func() {
cache.cleanupAssumedPods(logger, time.Now())
}, cache.period, cache.stop)
}
// cleanupAssumedPods makes tests deterministic by taking the current time as an input argument.
// It also reports metrics on the cache size for nodes, pods, and assumed pods.
func (cache *cacheImpl) cleanupAssumedPods(logger klog.Logger, now time.Time) {
cache.mu.Lock()
defer cache.mu.Unlock()
defer cache.updateMetrics()
// The size of assumedPods should be small
for key := range cache.assumedPods {
ps, ok := cache.podStates[key]
if !ok {
logger.Error(nil, "Key found in assumed set but not in podStates, potentially a logical error")
klog.FlushAndExit(klog.ExitFlushTimeout, 1)
}
if !ps.bindingFinished {
logger.V(5).Info("Could not expire cache for pod as binding is still in progress", "podKey", key, "pod", klog.KObj(ps.pod))
continue
}
if cache.ttl != 0 && now.After(*ps.deadline) {
logger.Info("Pod expired", "podKey", key, "pod", klog.KObj(ps.pod))
if err := cache.removePod(logger, ps.pod); err != nil {
logger.Error(err, "ExpirePod failed", "podKey", key, "pod", klog.KObj(ps.pod))
}
}
}
}
// updateMetrics updates cache size metric values for pods, assumed pods, and nodes
func (cache *cacheImpl) updateMetrics() {
metrics.CacheSize.WithLabelValues("assumed_pods").Set(float64(len(cache.assumedPods)))
metrics.CacheSize.WithLabelValues("pods").Set(float64(len(cache.podStates)))
metrics.CacheSize.WithLabelValues("nodes").Set(float64(len(cache.nodes)))
}

View File

@ -0,0 +1,135 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"sort"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// CacheComparer is an implementation of the Scheduler's cache comparer.
type CacheComparer struct {
NodeLister corelisters.NodeLister
PodLister corelisters.PodLister
Cache internalcache.Cache
PodQueue internalqueue.SchedulingQueue
}
// Compare compares the nodes and pods of NodeLister with Cache.Snapshot.
func (c *CacheComparer) Compare(logger klog.Logger) error {
logger.V(3).Info("Cache comparer started")
defer logger.V(3).Info("Cache comparer finished")
nodes, err := c.NodeLister.List(labels.Everything())
if err != nil {
return err
}
pods, err := c.PodLister.List(labels.Everything())
if err != nil {
return err
}
dump := c.Cache.Dump()
pendingPods, _ := c.PodQueue.PendingPods()
if missed, redundant := c.CompareNodes(nodes, dump.Nodes); len(missed)+len(redundant) != 0 {
logger.Info("Cache mismatch", "missedNodes", missed, "redundantNodes", redundant)
}
if missed, redundant := c.ComparePods(pods, pendingPods, dump.Nodes); len(missed)+len(redundant) != 0 {
logger.Info("Cache mismatch", "missedPods", missed, "redundantPods", redundant)
}
return nil
}
// CompareNodes compares actual nodes with cached nodes.
func (c *CacheComparer) CompareNodes(nodes []*v1.Node, nodeinfos map[string]*framework.NodeInfo) (missed, redundant []string) {
actual := []string{}
for _, node := range nodes {
actual = append(actual, node.Name)
}
cached := []string{}
for nodeName := range nodeinfos {
cached = append(cached, nodeName)
}
return compareStrings(actual, cached)
}
// ComparePods compares actual pods with cached pods.
func (c *CacheComparer) ComparePods(pods, waitingPods []*v1.Pod, nodeinfos map[string]*framework.NodeInfo) (missed, redundant []string) {
actual := []string{}
for _, pod := range pods {
actual = append(actual, string(pod.UID))
}
cached := []string{}
for _, nodeinfo := range nodeinfos {
for _, p := range nodeinfo.Pods {
cached = append(cached, string(p.Pod.UID))
}
}
for _, pod := range waitingPods {
cached = append(cached, string(pod.UID))
}
return compareStrings(actual, cached)
}
func compareStrings(actual, cached []string) (missed, redundant []string) {
missed, redundant = []string{}, []string{}
sort.Strings(actual)
sort.Strings(cached)
compare := func(i, j int) int {
if i == len(actual) {
return 1
} else if j == len(cached) {
return -1
}
return strings.Compare(actual[i], cached[j])
}
for i, j := 0, 0; i < len(actual) || j < len(cached); {
switch compare(i, j) {
case 0:
i++
j++
case -1:
missed = append(missed, actual[i])
i++
case 1:
redundant = append(redundant, cached[j])
j++
}
}
return
}
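
Editor's note (illustration, not part of the vendored file): compareStrings is unexported, so here is a standalone restatement of the same merge idea — sort both sides, then walk them with two cursors so each element is visited once. The function name and sample data are made up.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// diffSorted mirrors the two-pointer diff used by compareStrings above.
func diffSorted(actual, cached []string) (missed, redundant []string) {
	sort.Strings(actual)
	sort.Strings(cached)
	i, j := 0, 0
	for i < len(actual) && j < len(cached) {
		switch strings.Compare(actual[i], cached[j]) {
		case 0:
			i, j = i+1, j+1
		case -1:
			missed = append(missed, actual[i]) // known to the API server, absent from the cache
			i++
		default:
			redundant = append(redundant, cached[j]) // still cached, gone from the API server
			j++
		}
	}
	missed = append(missed, actual[i:]...)
	redundant = append(redundant, cached[j:]...)
	return missed, redundant
}

func main() {
	fmt.Println(diffSorted([]string{"a", "b"}, []string{"b", "c"})) // [a] [c]
}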

View File

@ -0,0 +1,76 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"context"
"os"
"os/signal"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
internalqueue "k8s.io/kubernetes/pkg/scheduler/backend/queue"
)
// CacheDebugger provides ways to check and write cache information for debugging.
type CacheDebugger struct {
Comparer CacheComparer
Dumper CacheDumper
}
// New creates a CacheDebugger.
func New(
nodeLister corelisters.NodeLister,
podLister corelisters.PodLister,
cache internalcache.Cache,
podQueue internalqueue.SchedulingQueue,
) *CacheDebugger {
return &CacheDebugger{
Comparer: CacheComparer{
NodeLister: nodeLister,
PodLister: podLister,
Cache: cache,
PodQueue: podQueue,
},
Dumper: CacheDumper{
cache: cache,
podQueue: podQueue,
},
}
}
// ListenForSignal starts a goroutine that will trigger the CacheDebugger's
// behavior when the process receives SIGINT (Windows) or SIGUSR2 (non-Windows).
func (d *CacheDebugger) ListenForSignal(ctx context.Context) {
logger := klog.FromContext(ctx)
stopCh := ctx.Done()
ch := make(chan os.Signal, 1)
signal.Notify(ch, compareSignal)
go func() {
for {
select {
case <-stopCh:
return
case <-ch:
d.Comparer.Compare(logger)
d.Dumper.DumpAll(logger)
}
}
}()
}

View File

@ -0,0 +1,88 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import (
"fmt"
"strings"
"k8s.io/klog/v2"
v1 "k8s.io/api/core/v1"
internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// CacheDumper writes some information from the scheduler cache and the scheduling queue to the
// scheduler logs for debugging purposes.
type CacheDumper struct {
cache internalcache.Cache
podQueue queue.SchedulingQueue
}
// DumpAll writes cached nodes and scheduling queue information to the scheduler logs.
func (d *CacheDumper) DumpAll(logger klog.Logger) {
d.dumpNodes(logger)
d.dumpSchedulingQueue(logger)
}
// dumpNodes writes NodeInfo to the scheduler logs.
func (d *CacheDumper) dumpNodes(logger klog.Logger) {
dump := d.cache.Dump()
nodeInfos := make([]string, 0, len(dump.Nodes))
for name, nodeInfo := range dump.Nodes {
nodeInfos = append(nodeInfos, d.printNodeInfo(name, nodeInfo))
}
// Extra blank line added between node entries for readability.
logger.Info("Dump of cached NodeInfo", "nodes", strings.Join(nodeInfos, "\n\n"))
}
// dumpSchedulingQueue writes pods in the scheduling queue to the scheduler logs.
func (d *CacheDumper) dumpSchedulingQueue(logger klog.Logger) {
pendingPods, s := d.podQueue.PendingPods()
var podData strings.Builder
for _, p := range pendingPods {
podData.WriteString(printPod(p))
}
logger.Info("Dump of scheduling queue", "summary", s, "pods", podData.String())
}
// printNodeInfo writes parts of NodeInfo to a string.
func (d *CacheDumper) printNodeInfo(name string, n *framework.NodeInfo) string {
var nodeData strings.Builder
nodeData.WriteString(fmt.Sprintf("Node name: %s\nDeleted: %t\nRequested Resources: %+v\nAllocatable Resources:%+v\nScheduled Pods(number: %v):\n",
name, n.Node() == nil, n.Requested, n.Allocatable, len(n.Pods)))
// Dumping Pod Info
for _, p := range n.Pods {
nodeData.WriteString(printPod(p.Pod))
}
// Dumping nominated pods info on the node
nominatedPodInfos := d.podQueue.NominatedPodsForNode(name)
if len(nominatedPodInfos) != 0 {
nodeData.WriteString(fmt.Sprintf("Nominated Pods(number: %v):\n", len(nominatedPodInfos)))
for _, pi := range nominatedPodInfos {
nodeData.WriteString(printPod(pi.Pod))
}
}
return nodeData.String()
}
// printPod writes parts of a Pod object to a string.
func printPod(p *v1.Pod) string {
return fmt.Sprintf("name: %v, namespace: %v, uid: %v, phase: %v, nominated node: %v\n", p.Name, p.Namespace, p.UID, p.Status.Phase, p.Status.NominatedNodeName)
}
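
To preview the exact per-pod line that DumpAll emits, a small standalone sketch can reuse the same format string. The pod values below are invented, and the k8s.io/api and k8s.io/apimachinery modules are assumed to be on the import path.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// printPodLine mirrors the printPod format above so the dump output can be
// previewed without a running scheduler.
func printPodLine(p *v1.Pod) string {
	return fmt.Sprintf("name: %v, namespace: %v, uid: %v, phase: %v, nominated node: %v\n",
		p.Name, p.Namespace, p.UID, p.Status.Phase, p.Status.NominatedNodeName)
}

func main() {
	pod := &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "web-0", Namespace: "default", UID: "uid-1234"},
		Status:     v1.PodStatus{Phase: v1.PodPending, NominatedNodeName: "node-a"},
	}
	fmt.Print(printPodLine(pod))
}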

View File

@ -0,0 +1,26 @@
//go:build !windows
// +build !windows
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import "syscall"
// compareSignal is the signal to trigger cache compare. For non-windows
// environments it's SIGUSR2.
var compareSignal = syscall.SIGUSR2

View File

@ -0,0 +1,23 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package debugger
import "os"
// compareSignal is the signal to trigger cache compare. For windows,
// it's SIGINT.
var compareSignal = os.Interrupt

View File

@ -0,0 +1,123 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// Cache collects pods' information and provides node-level aggregated information.
// It's intended for generic scheduler to do efficient lookup.
// Cache's operations are pod centric. It does incremental updates based on pod events.
// Pod events are sent via network. We don't have guaranteed delivery of all events:
// We use Reflector to list and watch from remote.
// Reflector might be slow and do a relist, which would lead to missing events.
//
// State Machine of a pod's events in scheduler's cache:
//
//         +-------------------------------------+  +-----+
//         |                  Add                |  |     |
//         |                                     |  |     | Update
//         +   Assume              Add           v  v     |
//
// Initial +--------> Assumed +------------+---> Added <--+
//
//         ^            +   +                    |   +
//         |            |   |                    |   |
//         |            |   |                Add |   | Remove
//         |            |   |                    |   |
//         |            |   |                    +   |
//         +------------+   +------------> Expired   +----> Deleted
//               Forget        Expire
//
// Note that an assumed pod can expire, because if we haven't received an Add event notifying us
// for a while, there might be some problems and we shouldn't keep the pod in cache anymore.
//
// Note that "Initial", "Expired", and "Deleted" pods do not actually exist in cache.
// Based on existing use cases, we are making the following assumptions:
// - No pod would be assumed twice
// - A pod could be added without going through scheduler. In this case, we will see Add but not Assume event.
// - If a pod wasn't added, it wouldn't be removed or updated.
// - Both "Expired" and "Deleted" are valid end states. In case of some problems, e.g. network issue,
// a pod might have changed its state (e.g. added and deleted) without delivering notification to the cache.
type Cache interface {
// NodeCount returns the number of nodes in the cache.
// DO NOT use outside of tests.
NodeCount() int
// PodCount returns the number of pods in the cache (including those from deleted nodes).
// DO NOT use outside of tests.
PodCount() (int, error)
// AssumePod assumes a pod scheduled and aggregates the pod's information into its node.
// The implementation also decides the policy to expire pod before being confirmed (receiving Add event).
// After expiration, its information would be subtracted.
AssumePod(logger klog.Logger, pod *v1.Pod) error
// FinishBinding signals that cache for assumed pod can be expired
FinishBinding(logger klog.Logger, pod *v1.Pod) error
// ForgetPod removes an assumed pod from cache.
ForgetPod(logger klog.Logger, pod *v1.Pod) error
// AddPod either confirms a pod if it's assumed, or adds it back if it's expired.
// If added back, the pod's information would be added again.
AddPod(logger klog.Logger, pod *v1.Pod) error
// UpdatePod removes oldPod's information and adds newPod's information.
UpdatePod(logger klog.Logger, oldPod, newPod *v1.Pod) error
// RemovePod removes a pod. The pod's information would be subtracted from assigned node.
RemovePod(logger klog.Logger, pod *v1.Pod) error
// GetPod returns the pod from the cache with the same namespace and the
// same name of the specified pod.
GetPod(pod *v1.Pod) (*v1.Pod, error)
// IsAssumedPod returns true if the pod is assumed and not expired.
IsAssumedPod(pod *v1.Pod) (bool, error)
// AddNode adds overall information about node.
// It returns a clone of added NodeInfo object.
AddNode(logger klog.Logger, node *v1.Node) *framework.NodeInfo
// UpdateNode updates overall information about node.
// It returns a clone of updated NodeInfo object.
UpdateNode(logger klog.Logger, oldNode, newNode *v1.Node) *framework.NodeInfo
// RemoveNode removes overall information about node.
RemoveNode(logger klog.Logger, node *v1.Node) error
// UpdateSnapshot updates the passed infoSnapshot to the current contents of Cache.
// The node info contains aggregated information of pods scheduled (including assumed to be)
// on this node.
// The snapshot only includes Nodes that are not deleted at the time this function is called.
// nodeinfo.Node() is guaranteed to be not nil for all the nodes in the snapshot.
UpdateSnapshot(logger klog.Logger, nodeSnapshot *Snapshot) error
// Dump produces a dump of the current cache.
Dump() *Dump
}
// Dump is a dump of the cache state.
type Dump struct {
AssumedPods sets.Set[string]
Nodes map[string]*framework.NodeInfo
}
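
A hedged sketch of how a caller might walk one pod through the happy path of the state machine above. It simplifies the real scheduler's control flow; bindFn is an invented stand-in for the binding cycle, and the confirming AddPod call (driven by the informer's Add event) is not shown.

package sketch

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/klog/v2"

	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
)

// assumeAndBind walks one pod through Initial -> Assumed (AssumePod) -> bound
// (FinishBinding), with ForgetPod as the rollback when binding fails.
func assumeAndBind(logger klog.Logger, c internalcache.Cache, pod *v1.Pod, bindFn func(*v1.Pod) error) error {
	// pod.Spec.NodeName is expected to already be set to the chosen node.
	if err := c.AssumePod(logger, pod); err != nil {
		return err
	}
	if err := bindFn(pod); err != nil {
		// Binding failed: drop the optimistic assumption so the node's
		// aggregated resources are restored.
		if forgetErr := c.ForgetPod(logger, pod); forgetErr != nil {
			return forgetErr
		}
		return err
	}
	// Binding succeeded: allow the assumed pod to expire if the confirming
	// Add event never arrives.
	return c.FinishBinding(logger, pod)
}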

View File

@ -0,0 +1,143 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"errors"
"fmt"
v1 "k8s.io/api/core/v1"
utilnode "k8s.io/component-helpers/node/topology"
"k8s.io/klog/v2"
)
// nodeTree is a tree-like data structure that holds node names in each zone. Zone names are
// keys to "NodeTree.tree" and values of "NodeTree.tree" are arrays of node names.
// NodeTree is NOT thread-safe, any concurrent updates/reads from it must be synchronized by the caller.
// It is used only by schedulerCache, and should stay as such.
type nodeTree struct {
tree map[string][]string // a map from zone (region-zone) to an array of nodes in the zone.
zones []string // a list of all the zones in the tree (keys)
numNodes int
}
// newNodeTree creates a NodeTree from nodes.
func newNodeTree(logger klog.Logger, nodes []*v1.Node) *nodeTree {
nt := &nodeTree{
tree: make(map[string][]string, len(nodes)),
}
for _, n := range nodes {
nt.addNode(logger, n)
}
return nt
}
// addNode adds a node and its corresponding zone to the tree. If the zone already exists, the node
// is added to the array of nodes in that zone.
func (nt *nodeTree) addNode(logger klog.Logger, n *v1.Node) {
zone := utilnode.GetZoneKey(n)
if na, ok := nt.tree[zone]; ok {
for _, nodeName := range na {
if nodeName == n.Name {
logger.Info("Did not add to the NodeTree because it already exists", "node", klog.KObj(n))
return
}
}
nt.tree[zone] = append(na, n.Name)
} else {
nt.zones = append(nt.zones, zone)
nt.tree[zone] = []string{n.Name}
}
logger.V(2).Info("Added node to NodeTree", "node", klog.KObj(n), "zone", zone)
nt.numNodes++
}
// removeNode removes a node from the NodeTree.
func (nt *nodeTree) removeNode(logger klog.Logger, n *v1.Node) error {
zone := utilnode.GetZoneKey(n)
if na, ok := nt.tree[zone]; ok {
for i, nodeName := range na {
if nodeName == n.Name {
nt.tree[zone] = append(na[:i], na[i+1:]...)
if len(nt.tree[zone]) == 0 {
nt.removeZone(zone)
}
logger.V(2).Info("Removed node from NodeTree", "node", klog.KObj(n), "zone", zone)
nt.numNodes--
return nil
}
}
}
logger.Error(nil, "Did not remove Node in NodeTree because it was not found", "node", klog.KObj(n), "zone", zone)
return fmt.Errorf("node %q in group %q was not found", n.Name, zone)
}
// removeZone removes a zone from tree.
// This function must be called while writer locks are held.
func (nt *nodeTree) removeZone(zone string) {
delete(nt.tree, zone)
for i, z := range nt.zones {
if z == zone {
nt.zones = append(nt.zones[:i], nt.zones[i+1:]...)
return
}
}
}
// updateNode updates a node in the NodeTree.
func (nt *nodeTree) updateNode(logger klog.Logger, old, new *v1.Node) {
var oldZone string
if old != nil {
oldZone = utilnode.GetZoneKey(old)
}
newZone := utilnode.GetZoneKey(new)
// If the zone ID of the node has not changed, we don't need to do anything. Name of the node
// cannot be changed in an update.
if oldZone == newZone {
return
}
nt.removeNode(logger, old) // No error checking. We ignore whether the old node exists or not.
nt.addNode(logger, new)
}
// list returns the list of node names. NodeTree iterates over zones and in each zone iterates
// over nodes in a round robin fashion.
func (nt *nodeTree) list() ([]string, error) {
if len(nt.zones) == 0 {
return nil, nil
}
nodesList := make([]string, 0, nt.numNodes)
numExhaustedZones := 0
nodeIndex := 0
for len(nodesList) < nt.numNodes {
if numExhaustedZones >= len(nt.zones) { // all zones are exhausted.
return nodesList, errors.New("all zones exhausted before reaching count of nodes expected")
}
for zoneIndex := 0; zoneIndex < len(nt.zones); zoneIndex++ {
na := nt.tree[nt.zones[zoneIndex]]
if nodeIndex >= len(na) { // If the zone is exhausted, continue
if nodeIndex == len(na) { // If it is the first time the zone is exhausted
numExhaustedZones++
}
continue
}
nodesList = append(nodesList, na[nodeIndex])
}
nodeIndex++
}
return nodesList, nil
}
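
The zone-interleaving performed by list() can be demonstrated with a self-contained sketch. Zone and node names are invented, and the helper below reimplements only the round-robin flattening, not the zone-exhaustion bookkeeping of the real method.

package main

import "fmt"

// roundRobinByZone flattens zone->nodes the same way nodeTree.list does:
// one node from each zone per pass, so zones are spread evenly at the front.
func roundRobinByZone(zones []string, tree map[string][]string) []string {
	var out []string
	for idx := 0; ; idx++ {
		progressed := false
		for _, z := range zones {
			if idx < len(tree[z]) {
				out = append(out, tree[z][idx])
				progressed = true
			}
		}
		if !progressed {
			return out
		}
	}
}

func main() {
	zones := []string{"zone-a", "zone-b"}
	tree := map[string][]string{
		"zone-a": {"node-a1", "node-a2", "node-a3"},
		"zone-b": {"node-b1"},
	}
	fmt.Println(roundRobinByZone(zones, tree)) // [node-a1 node-b1 node-a2 node-a3]
}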

View File

@ -0,0 +1,198 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package cache
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// Snapshot is a snapshot of cache NodeInfo and NodeTree order. The scheduler takes a
// snapshot at the beginning of each scheduling cycle and uses it for its operations in that cycle.
type Snapshot struct {
// nodeInfoMap a map of node name to a snapshot of its NodeInfo.
nodeInfoMap map[string]*framework.NodeInfo
// nodeInfoList is the list of nodes as ordered in the cache's nodeTree.
nodeInfoList []*framework.NodeInfo
// havePodsWithAffinityNodeInfoList is the list of nodes with at least one pod declaring affinity terms.
havePodsWithAffinityNodeInfoList []*framework.NodeInfo
// havePodsWithRequiredAntiAffinityNodeInfoList is the list of nodes with at least one pod declaring
// required anti-affinity terms.
havePodsWithRequiredAntiAffinityNodeInfoList []*framework.NodeInfo
// usedPVCSet contains a set of PVC names that have one or more scheduled pods using them,
// keyed in the format "namespace/name".
usedPVCSet sets.Set[string]
generation int64
}
var _ framework.SharedLister = &Snapshot{}
// NewEmptySnapshot initializes a Snapshot struct and returns it.
func NewEmptySnapshot() *Snapshot {
return &Snapshot{
nodeInfoMap: make(map[string]*framework.NodeInfo),
usedPVCSet: sets.New[string](),
}
}
// NewSnapshot initializes a Snapshot struct and returns it.
func NewSnapshot(pods []*v1.Pod, nodes []*v1.Node) *Snapshot {
nodeInfoMap := createNodeInfoMap(pods, nodes)
nodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
havePodsWithAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
havePodsWithRequiredAntiAffinityNodeInfoList := make([]*framework.NodeInfo, 0, len(nodeInfoMap))
for _, v := range nodeInfoMap {
nodeInfoList = append(nodeInfoList, v)
if len(v.PodsWithAffinity) > 0 {
havePodsWithAffinityNodeInfoList = append(havePodsWithAffinityNodeInfoList, v)
}
if len(v.PodsWithRequiredAntiAffinity) > 0 {
havePodsWithRequiredAntiAffinityNodeInfoList = append(havePodsWithRequiredAntiAffinityNodeInfoList, v)
}
}
s := NewEmptySnapshot()
s.nodeInfoMap = nodeInfoMap
s.nodeInfoList = nodeInfoList
s.havePodsWithAffinityNodeInfoList = havePodsWithAffinityNodeInfoList
s.havePodsWithRequiredAntiAffinityNodeInfoList = havePodsWithRequiredAntiAffinityNodeInfoList
s.usedPVCSet = createUsedPVCSet(pods)
return s
}
// createNodeInfoMap obtains a list of pods and pivots that list into a map
// where the keys are node names and the values are the aggregated information
// for that node.
func createNodeInfoMap(pods []*v1.Pod, nodes []*v1.Node) map[string]*framework.NodeInfo {
nodeNameToInfo := make(map[string]*framework.NodeInfo)
for _, pod := range pods {
nodeName := pod.Spec.NodeName
if _, ok := nodeNameToInfo[nodeName]; !ok {
nodeNameToInfo[nodeName] = framework.NewNodeInfo()
}
nodeNameToInfo[nodeName].AddPod(pod)
}
imageExistenceMap := createImageExistenceMap(nodes)
for _, node := range nodes {
if _, ok := nodeNameToInfo[node.Name]; !ok {
nodeNameToInfo[node.Name] = framework.NewNodeInfo()
}
nodeInfo := nodeNameToInfo[node.Name]
nodeInfo.SetNode(node)
nodeInfo.ImageStates = getNodeImageStates(node, imageExistenceMap)
}
return nodeNameToInfo
}
func createUsedPVCSet(pods []*v1.Pod) sets.Set[string] {
usedPVCSet := sets.New[string]()
for _, pod := range pods {
if pod.Spec.NodeName == "" {
continue
}
for _, v := range pod.Spec.Volumes {
if v.PersistentVolumeClaim == nil {
continue
}
key := framework.GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
usedPVCSet.Insert(key)
}
}
return usedPVCSet
}
// getNodeImageStates returns the given node's image states based on the given imageExistence map.
func getNodeImageStates(node *v1.Node, imageExistenceMap map[string]sets.Set[string]) map[string]*framework.ImageStateSummary {
imageStates := make(map[string]*framework.ImageStateSummary)
for _, image := range node.Status.Images {
for _, name := range image.Names {
imageStates[name] = &framework.ImageStateSummary{
Size: image.SizeBytes,
NumNodes: imageExistenceMap[name].Len(),
}
}
}
return imageStates
}
// createImageExistenceMap returns a map recording on which nodes the images exist, keyed by the images' names.
func createImageExistenceMap(nodes []*v1.Node) map[string]sets.Set[string] {
imageExistenceMap := make(map[string]sets.Set[string])
for _, node := range nodes {
for _, image := range node.Status.Images {
for _, name := range image.Names {
if _, ok := imageExistenceMap[name]; !ok {
imageExistenceMap[name] = sets.New(node.Name)
} else {
imageExistenceMap[name].Insert(node.Name)
}
}
}
}
return imageExistenceMap
}
// NodeInfos returns a NodeInfoLister.
func (s *Snapshot) NodeInfos() framework.NodeInfoLister {
return s
}
// StorageInfos returns a StorageInfoLister.
func (s *Snapshot) StorageInfos() framework.StorageInfoLister {
return s
}
// NumNodes returns the number of nodes in the snapshot.
func (s *Snapshot) NumNodes() int {
return len(s.nodeInfoList)
}
// List returns the list of nodes in the snapshot.
func (s *Snapshot) List() ([]*framework.NodeInfo, error) {
return s.nodeInfoList, nil
}
// HavePodsWithAffinityList returns the list of nodes with at least one pod with inter-pod affinity
func (s *Snapshot) HavePodsWithAffinityList() ([]*framework.NodeInfo, error) {
return s.havePodsWithAffinityNodeInfoList, nil
}
// HavePodsWithRequiredAntiAffinityList returns the list of nodes with at least one pod with
// required inter-pod anti-affinity
func (s *Snapshot) HavePodsWithRequiredAntiAffinityList() ([]*framework.NodeInfo, error) {
return s.havePodsWithRequiredAntiAffinityNodeInfoList, nil
}
// Get returns the NodeInfo of the given node name.
func (s *Snapshot) Get(nodeName string) (*framework.NodeInfo, error) {
if v, ok := s.nodeInfoMap[nodeName]; ok && v.Node() != nil {
return v, nil
}
return nil, fmt.Errorf("nodeinfo not found for node name %q", nodeName)
}
func (s *Snapshot) IsPVCUsedByPods(key string) bool {
return s.usedPVCSet.Has(key)
}
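
A minimal sketch of feeding NewSnapshot plain API objects (object names are invented, and the vendored module is assumed to be importable); it builds a two-node snapshot with one scheduled pod.

package sketch

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	internalcache "k8s.io/kubernetes/pkg/scheduler/backend/cache"
)

// snapshotForTest builds a Snapshot from two nodes and one scheduled pod.
func snapshotForTest() *internalcache.Snapshot {
	nodes := []*v1.Node{
		{ObjectMeta: metav1.ObjectMeta{Name: "node-a"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "node-b"}},
	}
	pods := []*v1.Pod{
		{
			ObjectMeta: metav1.ObjectMeta{Name: "web-0", Namespace: "default"},
			Spec:       v1.PodSpec{NodeName: "node-a"},
		},
	}
	snap := internalcache.NewSnapshot(pods, nodes)
	_ = snap.NumNodes() // 2: both nodes are present in the snapshot
	return snap
}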

View File

@ -0,0 +1,244 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Below is the implementation of a heap. The logic is pretty much the same
// as cache.heap; however, this heap does not perform synchronization. It leaves
// synchronization to the SchedulingQueue.
package heap
import (
"container/heap"
"fmt"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
// KeyFunc is a function type to get the key from an object.
type KeyFunc[T any] func(obj T) string
type heapItem[T any] struct {
obj T // The object which is stored in the heap.
index int // The index of the object's key in the Heap.queue.
}
type itemKeyValue[T any] struct {
key string
obj T
}
// data is an internal struct that implements the standard heap interface
// and keeps the data stored in the heap.
type data[T any] struct {
// items is a map from key of the objects to the objects and their index.
// We depend on the property that items in the map are in the queue and vice versa.
items map[string]*heapItem[T]
// queue implements a heap data structure and keeps the order of elements
// according to the heap invariant. The queue keeps the keys of objects stored
// in "items".
queue []string
// keyFunc is used to make the key used for queued item insertion and retrieval, and
// should be deterministic.
keyFunc KeyFunc[T]
// lessFunc is used to compare two objects in the heap.
lessFunc LessFunc[T]
}
var (
_ = heap.Interface(&data[any]{}) // heapData is a standard heap
)
// Less compares two objects and returns true if the first one should go
// in front of the second one in the heap.
func (h *data[T]) Less(i, j int) bool {
if i > len(h.queue) || j > len(h.queue) {
return false
}
itemi, ok := h.items[h.queue[i]]
if !ok {
return false
}
itemj, ok := h.items[h.queue[j]]
if !ok {
return false
}
return h.lessFunc(itemi.obj, itemj.obj)
}
// Len returns the number of items in the Heap.
func (h *data[T]) Len() int { return len(h.queue) }
// Swap implements swapping of two elements in the heap. This is a part of standard
// heap interface and should never be called directly.
func (h *data[T]) Swap(i, j int) {
if i < 0 || j < 0 {
return
}
h.queue[i], h.queue[j] = h.queue[j], h.queue[i]
item := h.items[h.queue[i]]
item.index = i
item = h.items[h.queue[j]]
item.index = j
}
// Push is supposed to be called by container/heap.Push only.
func (h *data[T]) Push(kv interface{}) {
keyValue := kv.(*itemKeyValue[T])
n := len(h.queue)
h.items[keyValue.key] = &heapItem[T]{keyValue.obj, n}
h.queue = append(h.queue, keyValue.key)
}
// Pop is supposed to be called by container/heap.Pop only.
func (h *data[T]) Pop() interface{} {
if len(h.queue) == 0 {
return nil
}
key := h.queue[len(h.queue)-1]
h.queue = h.queue[0 : len(h.queue)-1]
item, ok := h.items[key]
if !ok {
// This is an error
return nil
}
delete(h.items, key)
return item.obj
}
// Peek returns the head of the heap without removing it.
func (h *data[T]) Peek() (T, bool) {
if len(h.queue) > 0 {
return h.items[h.queue[0]].obj, true
}
var zero T
return zero, false
}
// Heap is a producer/consumer queue that implements a heap data structure.
// It can be used to implement priority queues and similar data structures.
type Heap[T any] struct {
// data stores objects and has a queue that keeps their ordering according
// to the heap invariant.
data *data[T]
// metricRecorder updates the counter when elements of a heap get added or
// removed, and it does nothing if it's nil
metricRecorder metrics.MetricRecorder
}
// AddOrUpdate inserts an item, and puts it in the queue. The item is updated if it
// already exists.
func (h *Heap[T]) AddOrUpdate(obj T) {
key := h.data.keyFunc(obj)
if _, exists := h.data.items[key]; exists {
h.data.items[key].obj = obj
heap.Fix(h.data, h.data.items[key].index)
} else {
heap.Push(h.data, &itemKeyValue[T]{key, obj})
if h.metricRecorder != nil {
h.metricRecorder.Inc()
}
}
}
// Delete removes an item.
func (h *Heap[T]) Delete(obj T) error {
key := h.data.keyFunc(obj)
if item, ok := h.data.items[key]; ok {
heap.Remove(h.data, item.index)
if h.metricRecorder != nil {
h.metricRecorder.Dec()
}
return nil
}
return fmt.Errorf("object not found")
}
// Peek returns the head of the heap without removing it.
func (h *Heap[T]) Peek() (T, bool) {
return h.data.Peek()
}
// Pop returns the head of the heap and removes it.
func (h *Heap[T]) Pop() (T, error) {
obj := heap.Pop(h.data)
if obj != nil {
if h.metricRecorder != nil {
h.metricRecorder.Dec()
}
return obj.(T), nil
}
var zero T
return zero, fmt.Errorf("heap is empty")
}
// Get returns the requested item, or sets exists=false.
func (h *Heap[T]) Get(obj T) (T, bool) {
key := h.data.keyFunc(obj)
return h.GetByKey(key)
}
// GetByKey returns the requested item, or sets exists=false.
func (h *Heap[T]) GetByKey(key string) (T, bool) {
item, exists := h.data.items[key]
if !exists {
var zero T
return zero, false
}
return item.obj, true
}
func (h *Heap[T]) Has(obj T) bool {
key := h.data.keyFunc(obj)
_, ok := h.GetByKey(key)
return ok
}
// List returns a list of all the items.
func (h *Heap[T]) List() []T {
list := make([]T, 0, len(h.data.items))
for _, item := range h.data.items {
list = append(list, item.obj)
}
return list
}
// Len returns the number of items in the heap.
func (h *Heap[T]) Len() int {
return len(h.data.queue)
}
// New returns a Heap which can be used to queue up items to process.
func New[T any](keyFn KeyFunc[T], lessFn LessFunc[T]) *Heap[T] {
return NewWithRecorder(keyFn, lessFn, nil)
}
// NewWithRecorder wraps an optional metricRecorder to compose a Heap object.
func NewWithRecorder[T any](keyFn KeyFunc[T], lessFn LessFunc[T], metricRecorder metrics.MetricRecorder) *Heap[T] {
return &Heap[T]{
data: &data[T]{
items: map[string]*heapItem[T]{},
queue: []string{},
keyFunc: keyFn,
lessFunc: lessFn,
},
metricRecorder: metricRecorder,
}
}
// LessFunc is a function that receives two items and returns true if the first
// item should be placed before the second one when the list is sorted.
type LessFunc[T any] func(item1, item2 T) bool
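
A short usage sketch of this generic Heap with an invented item type; items are keyed by name and the highest priority pops first.

package sketch

import "k8s.io/kubernetes/pkg/scheduler/backend/heap"

type task struct {
	name     string
	priority int
}

// newTaskHeap keys tasks by name and orders them so higher priority comes first.
func newTaskHeap() *heap.Heap[*task] {
	keyFn := func(t *task) string { return t.name }
	lessFn := func(a, b *task) bool { return a.priority > b.priority }
	return heap.New(keyFn, lessFn)
}

func example() {
	h := newTaskHeap()
	h.AddOrUpdate(&task{name: "low", priority: 1})
	h.AddOrUpdate(&task{name: "high", priority: 10})
	h.AddOrUpdate(&task{name: "low", priority: 5}) // updates the existing "low" entry in place
	if top, err := h.Pop(); err == nil {
		_ = top // top.name == "high"
	}
}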

View File

@ -0,0 +1,415 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"container/list"
"fmt"
"sync"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/backend/heap"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
// activeQueuer is a wrapper for activeQ related operations.
// Its methods, except "unlocked" ones, take the lock inside.
// Note: be careful when using unlocked() methods.
// getLock() methods should be used only for unlocked() methods
// and it is forbidden to call any other activeQueuer's method under this lock.
type activeQueuer interface {
underLock(func(unlockedActiveQ unlockedActiveQueuer))
underRLock(func(unlockedActiveQ unlockedActiveQueueReader))
update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo
delete(pInfo *framework.QueuedPodInfo) error
pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
list() []*v1.Pod
len() int
has(pInfo *framework.QueuedPodInfo) bool
listInFlightEvents() []interface{}
listInFlightPods() []*v1.Pod
clusterEventsForPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) ([]*clusterEvent, error)
addEventsIfPodInFlight(oldPod, newPod *v1.Pod, events []framework.ClusterEvent) bool
addEventIfAnyInFlight(oldObj, newObj interface{}, event framework.ClusterEvent) bool
schedulingCycle() int64
done(pod types.UID)
close()
broadcast()
}
// unlockedActiveQueuer defines activeQ methods that are not protected by the lock itself.
// underLock() method should be used to protect these methods.
type unlockedActiveQueuer interface {
unlockedActiveQueueReader
AddOrUpdate(pInfo *framework.QueuedPodInfo)
}
// unlockedActiveQueueReader defines activeQ read-only methods that are not protected by the lock itself.
// underLock() or underRLock() method should be used to protect these methods.
type unlockedActiveQueueReader interface {
Get(pInfo *framework.QueuedPodInfo) (*framework.QueuedPodInfo, bool)
Has(pInfo *framework.QueuedPodInfo) bool
}
// activeQueue implements activeQueuer. All of the fields have to be protected using the lock.
type activeQueue struct {
// lock synchronizes all operations related to activeQ.
// It protects activeQ, inFlightPods, inFlightEvents, schedulingCycle and closed fields.
// Caution: DO NOT take "SchedulingQueue.lock" after taking "lock".
// You should always take "SchedulingQueue.lock" first, otherwise the queue could end up in deadlock.
// "lock" should not be taken after taking "nLock".
// Correct locking order is: SchedulingQueue.lock > lock > nominator.nLock.
lock sync.RWMutex
// activeQ is heap structure that scheduler actively looks at to find pods to
// schedule. Head of heap is the highest priority pod.
queue *heap.Heap[*framework.QueuedPodInfo]
// cond is a condition that is notified when the pod is added to activeQ.
// It is used with lock.
cond sync.Cond
// inFlightPods holds the UID of all pods which have been popped out for which Done
// hasn't been called yet - in other words, all pods that are currently being
// processed (being scheduled, in permit, or in the binding cycle).
//
// The values in the map are the entry of each pod in the inFlightEvents list.
// The value of that entry is the *v1.Pod at the time that scheduling of that
// pod started, which can be useful for logging or debugging.
inFlightPods map[types.UID]*list.Element
// inFlightEvents holds the events received by the scheduling queue
// (entry value is clusterEvent) together with in-flight pods (entry
// value is *v1.Pod). Entries get added at the end while the mutex is
// locked, so they get serialized.
//
// The pod entries are added in Pop and used to track which events
// occurred after the pod scheduling attempt for that pod started.
// They get removed when the scheduling attempt is done, at which
// point all events that occurred in the meantime are processed.
//
// After removal of a pod, events at the start of the list are no
// longer needed because all of the other in-flight pods started
// later. Those events can be removed.
inFlightEvents *list.List
// schedCycle represents sequence number of scheduling cycle and is incremented
// when a pod is popped.
schedCycle int64
// closed indicates that the queue is closed.
// It is mainly used to let Pop() exit its control loop while waiting for an item.
closed bool
// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
isSchedulingQueueHintEnabled bool
metricsRecorder metrics.MetricAsyncRecorder
}
func newActiveQueue(queue *heap.Heap[*framework.QueuedPodInfo], isSchedulingQueueHintEnabled bool, metricRecorder metrics.MetricAsyncRecorder) *activeQueue {
aq := &activeQueue{
queue: queue,
inFlightPods: make(map[types.UID]*list.Element),
inFlightEvents: list.New(),
isSchedulingQueueHintEnabled: isSchedulingQueueHintEnabled,
metricsRecorder: metricRecorder,
}
aq.cond.L = &aq.lock
return aq
}
// underLock runs the fn function under the lock.Lock.
// fn can run unlockedActiveQueuer methods but should NOT run any other activeQueue method,
// as it would end up in deadlock.
func (aq *activeQueue) underLock(fn func(unlockedActiveQ unlockedActiveQueuer)) {
aq.lock.Lock()
defer aq.lock.Unlock()
fn(aq.queue)
}
// underRLock runs the fn function under the lock.RLock.
// fn can run unlockedActiveQueueReader methods but should NOT run any other activeQueue method,
// as it would end up in deadlock.
func (aq *activeQueue) underRLock(fn func(unlockedActiveQ unlockedActiveQueueReader)) {
aq.lock.RLock()
defer aq.lock.RUnlock()
fn(aq.queue)
}
// update updates the pod in activeQ if oldPodInfo is already in the queue.
// It returns new pod info if updated, nil otherwise.
func (aq *activeQueue) update(newPod *v1.Pod, oldPodInfo *framework.QueuedPodInfo) *framework.QueuedPodInfo {
aq.lock.Lock()
defer aq.lock.Unlock()
if pInfo, exists := aq.queue.Get(oldPodInfo); exists {
_ = pInfo.Update(newPod)
aq.queue.AddOrUpdate(pInfo)
return pInfo
}
return nil
}
// delete deletes the pod info from activeQ.
func (aq *activeQueue) delete(pInfo *framework.QueuedPodInfo) error {
aq.lock.Lock()
defer aq.lock.Unlock()
return aq.queue.Delete(pInfo)
}
// pop removes the head of the queue and returns it.
// It blocks if the queue is empty and waits until a new item is added to the queue.
// It increments scheduling cycle when a pod is popped.
func (aq *activeQueue) pop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
aq.lock.Lock()
defer aq.lock.Unlock()
return aq.unlockedPop(logger)
}
func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
for aq.queue.Len() == 0 {
// When the queue is empty, invocation of Pop() is blocked until a new item is enqueued.
// When Close() is called, p.closed is set and the condition is broadcast,
// which causes this loop to continue and return from the Pop().
if aq.closed {
logger.V(2).Info("Scheduling queue is closed")
return nil, nil
}
aq.cond.Wait()
}
pInfo, err := aq.queue.Pop()
if err != nil {
return nil, err
}
pInfo.Attempts++
// In flight, no concurrent events yet.
if aq.isSchedulingQueueHintEnabled {
// If the pod is already in the map, we shouldn't overwrite the inFlightPods otherwise it'd lead to a memory leak.
// https://github.com/kubernetes/kubernetes/pull/127016
if _, ok := aq.inFlightPods[pInfo.Pod.UID]; ok {
// Just report it as an error, but no need to stop the scheduler
// because it likely doesn't cause any visible issues from the scheduling perspective.
logger.Error(nil, "the same pod is tracked in multiple places in the scheduler, and just discard it", "pod", klog.KObj(pInfo.Pod))
// Just ignore/discard this duplicated pod and try to pop the next one.
return aq.unlockedPop(logger)
}
aq.metricsRecorder.ObserveInFlightEventsAsync(metrics.PodPoppedInFlightEvent, 1, false)
aq.inFlightPods[pInfo.Pod.UID] = aq.inFlightEvents.PushBack(pInfo.Pod)
}
aq.schedCycle++
// Update metrics and reset the set of unschedulable plugins for the next attempt.
for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) {
metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec()
}
pInfo.UnschedulablePlugins.Clear()
pInfo.PendingPlugins.Clear()
return pInfo, nil
}
// list returns all pods that are in the queue.
func (aq *activeQueue) list() []*v1.Pod {
aq.lock.RLock()
defer aq.lock.RUnlock()
var result []*v1.Pod
for _, pInfo := range aq.queue.List() {
result = append(result, pInfo.Pod)
}
return result
}
// len returns length of the queue.
func (aq *activeQueue) len() int {
return aq.queue.Len()
}
// has reports whether pInfo exists in the queue.
func (aq *activeQueue) has(pInfo *framework.QueuedPodInfo) bool {
aq.lock.RLock()
defer aq.lock.RUnlock()
return aq.queue.Has(pInfo)
}
// listInFlightEvents returns all inFlightEvents.
func (aq *activeQueue) listInFlightEvents() []interface{} {
aq.lock.RLock()
defer aq.lock.RUnlock()
var values []interface{}
for event := aq.inFlightEvents.Front(); event != nil; event = event.Next() {
values = append(values, event.Value)
}
return values
}
// listInFlightPods returns all inFlightPods.
func (aq *activeQueue) listInFlightPods() []*v1.Pod {
aq.lock.RLock()
defer aq.lock.RUnlock()
var pods []*v1.Pod
for _, obj := range aq.inFlightPods {
pods = append(pods, obj.Value.(*v1.Pod))
}
return pods
}
// clusterEventsForPod gets all cluster events that have happened since the pod for pInfo started being scheduled.
func (aq *activeQueue) clusterEventsForPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) ([]*clusterEvent, error) {
aq.lock.RLock()
defer aq.lock.RUnlock()
logger.V(5).Info("Checking events for in-flight pod", "pod", klog.KObj(pInfo.Pod), "unschedulablePlugins", pInfo.UnschedulablePlugins, "inFlightEventsSize", aq.inFlightEvents.Len(), "inFlightPodsSize", len(aq.inFlightPods))
// AddUnschedulableIfNotPresent is called with the Pod at the end of scheduling or binding.
// So, given pInfo should have been Pop()ed before,
// we can assume pInfo must be recorded in inFlightPods and thus inFlightEvents.
inFlightPod, ok := aq.inFlightPods[pInfo.Pod.UID]
if !ok {
return nil, fmt.Errorf("in flight Pod isn't found in the scheduling queue. If you see this error log, it's likely a bug in the scheduler")
}
var events []*clusterEvent
for event := inFlightPod.Next(); event != nil; event = event.Next() {
e, ok := event.Value.(*clusterEvent)
if !ok {
// Must be another in-flight Pod (*v1.Pod). Can be ignored.
continue
}
events = append(events, e)
}
return events, nil
}
// addEventsIfPodInFlight adds clusterEvent to inFlightEvents if the newPod is in inFlightPods.
// It returns true if it pushed the event to the inFlightEvents.
func (aq *activeQueue) addEventsIfPodInFlight(oldPod, newPod *v1.Pod, events []framework.ClusterEvent) bool {
aq.lock.Lock()
defer aq.lock.Unlock()
_, ok := aq.inFlightPods[newPod.UID]
if ok {
for _, event := range events {
aq.metricsRecorder.ObserveInFlightEventsAsync(event.Label(), 1, false)
aq.inFlightEvents.PushBack(&clusterEvent{
event: event,
oldObj: oldPod,
newObj: newPod,
})
}
}
return ok
}
// addEventIfAnyInFlight adds clusterEvent to inFlightEvents if any pod is in inFlightPods.
// It returns true if it pushed the event to the inFlightEvents.
func (aq *activeQueue) addEventIfAnyInFlight(oldObj, newObj interface{}, event framework.ClusterEvent) bool {
aq.lock.Lock()
defer aq.lock.Unlock()
if len(aq.inFlightPods) != 0 {
aq.metricsRecorder.ObserveInFlightEventsAsync(event.Label(), 1, false)
aq.inFlightEvents.PushBack(&clusterEvent{
event: event,
oldObj: oldObj,
newObj: newObj,
})
return true
}
return false
}
func (aq *activeQueue) schedulingCycle() int64 {
aq.lock.RLock()
defer aq.lock.RUnlock()
return aq.schedCycle
}
// done must be called for pod returned by Pop. This allows the queue to
// keep track of which pods are currently being processed.
func (aq *activeQueue) done(pod types.UID) {
aq.lock.Lock()
defer aq.lock.Unlock()
inFlightPod, ok := aq.inFlightPods[pod]
if !ok {
// This Pod is already done()ed.
return
}
delete(aq.inFlightPods, pod)
// Remove the pod from the list.
aq.inFlightEvents.Remove(inFlightPod)
aggrMetricsCounter := map[string]int{}
// Remove events which are only referred to by this Pod
// so that the inFlightEvents list doesn't grow infinitely.
// If the pod was at the head of the list, then all
// events between it and the next pod are no longer needed
// and can be removed.
for {
e := aq.inFlightEvents.Front()
if e == nil {
// Empty list.
break
}
ev, ok := e.Value.(*clusterEvent)
if !ok {
// A pod, must stop pruning.
break
}
aq.inFlightEvents.Remove(e)
aggrMetricsCounter[ev.event.Label()]--
}
for evLabel, count := range aggrMetricsCounter {
aq.metricsRecorder.ObserveInFlightEventsAsync(evLabel, float64(count), false)
}
aq.metricsRecorder.ObserveInFlightEventsAsync(metrics.PodPoppedInFlightEvent, -1,
// If it's the last Pod in inFlightPods, we should force-flush the metrics.
// Otherwise, especially in small clusters, which don't get a new Pod frequently,
// the metrics might not be flushed for a long time.
len(aq.inFlightPods) == 0)
}
// close closes the activeQueue.
func (aq *activeQueue) close() {
// We should call done() for all in-flight pods to clean up the inFlightEvents metrics.
// It's safe even if the binding cycle running asynchronously calls done() afterwards;
// done() will just be a no-op.
for pod := range aq.inFlightPods {
aq.done(pod)
}
aq.lock.Lock()
aq.closed = true
aq.lock.Unlock()
}
// broadcast notifies the pop() operation that new pod(s) were added to the activeQueue.
func (aq *activeQueue) broadcast() {
aq.cond.Broadcast()
}
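
The pruning performed at the end of done() can be illustrated with a self-contained container/list sketch; the struct and string values below stand in for the package's unexported clusterEvent and for *v1.Pod respectively.

package main

import (
	"container/list"
	"fmt"
)

// event is a stand-in for the unexported clusterEvent type.
type event struct{ label string }

// pruneLeadingEvents mimics the tail of activeQueue.done: once a finished pod's
// element has been removed, events at the front of the list precede every
// remaining in-flight pod and can be dropped until the next pod entry is hit.
func pruneLeadingEvents(l *list.List) {
	for {
		front := l.Front()
		if front == nil {
			return // list is empty
		}
		if _, isEvent := front.Value.(event); !isEvent {
			return // reached an in-flight pod; keep the rest
		}
		l.Remove(front)
	}
}

func main() {
	l := list.New()
	l.PushBack(event{"Node/Add"})           // no remaining in-flight pod started before this
	l.PushBack("pod-b")                     // still in flight
	l.PushBack(event{"AssignedPod/Delete"}) // pod-b may still need to observe this
	pruneLeadingEvents(l)
	fmt.Println(l.Len()) // 2
}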

View File

@ -0,0 +1,195 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"slices"
"sync"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
listersv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// nominator is a structure that stores pods nominated to run on nodes.
// It exists because nominatedNodeName of pod objects stored in the structure
// may be different than what scheduler has here. We should be able to find pods
// by their UID and update/delete them.
type nominator struct {
// nLock synchronizes all operations related to nominator.
// It should not be used anywhere else.
// Caution: DO NOT take ("SchedulingQueue.lock" or "activeQueue.lock") after taking "nLock".
// You should always take "SchedulingQueue.lock" and "activeQueue.lock" first,
// otherwise the nominator could end up in deadlock.
// Correct locking order is: SchedulingQueue.lock > activeQueue.lock > nLock.
nLock sync.RWMutex
// podLister is used to verify if the given pod is alive.
podLister listersv1.PodLister
// nominatedPods is a map keyed by a node name and the value is a list of
// pods which are nominated to run on the node. These are pods which can be in
// the activeQ or unschedulablePods.
nominatedPods map[string][]podRef
// nominatedPodToNode is map keyed by a Pod UID to the node name where it is
// nominated.
nominatedPodToNode map[types.UID]string
}
func newPodNominator(podLister listersv1.PodLister) *nominator {
return &nominator{
podLister: podLister,
nominatedPods: make(map[string][]podRef),
nominatedPodToNode: make(map[types.UID]string),
}
}
// AddNominatedPod adds a pod to the nominated pods of the given node.
// This is called during the preemption process after a node is nominated to run
// the pod. We update the structure before sending a request to update the pod
// object to avoid races with the following scheduling cycles.
func (npm *nominator) AddNominatedPod(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
npm.nLock.Lock()
npm.addNominatedPodUnlocked(logger, pi, nominatingInfo)
npm.nLock.Unlock()
}
func (npm *nominator) addNominatedPodUnlocked(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
// Always delete the pod if it already exists, to ensure we never store more than
// one instance of the pod.
npm.deleteUnlocked(pi.Pod)
var nodeName string
if nominatingInfo.Mode() == framework.ModeOverride {
nodeName = nominatingInfo.NominatedNodeName
} else if nominatingInfo.Mode() == framework.ModeNoop {
if pi.Pod.Status.NominatedNodeName == "" {
return
}
nodeName = pi.Pod.Status.NominatedNodeName
}
if npm.podLister != nil {
// If the pod was removed or if it was already scheduled, don't nominate it.
updatedPod, err := npm.podLister.Pods(pi.Pod.Namespace).Get(pi.Pod.Name)
if err != nil {
logger.V(4).Info("Pod doesn't exist in podLister, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod))
return
}
if updatedPod.Spec.NodeName != "" {
logger.V(4).Info("Pod is already scheduled to a node, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod), "node", updatedPod.Spec.NodeName)
return
}
}
npm.nominatedPodToNode[pi.Pod.UID] = nodeName
for _, np := range npm.nominatedPods[nodeName] {
if np.uid == pi.Pod.UID {
logger.V(4).Info("Pod already exists in the nominator", "pod", np.uid)
return
}
}
npm.nominatedPods[nodeName] = append(npm.nominatedPods[nodeName], podToRef(pi.Pod))
}
// UpdateNominatedPod updates the <oldPod> with <newPod>.
func (npm *nominator) UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
npm.nLock.Lock()
defer npm.nLock.Unlock()
// In some cases, an Update event with no "NominatedNode" present is received right
// after a node("NominatedNode") is reserved for this pod in memory.
// In this case, we need to keep reserving the NominatedNode when updating the pod pointer.
var nominatingInfo *framework.NominatingInfo
// We won't fall into below `if` block if the Update event represents:
// (1) NominatedNode info is added
// (2) NominatedNode info is updated
// (3) NominatedNode info is removed
if nominatedNodeName(oldPod) == "" && nominatedNodeName(newPodInfo.Pod) == "" {
if nnn, ok := npm.nominatedPodToNode[oldPod.UID]; ok {
// This is the only case we should continue reserving the NominatedNode
nominatingInfo = &framework.NominatingInfo{
NominatingMode: framework.ModeOverride,
NominatedNodeName: nnn,
}
}
}
// We update irrespective of whether the nominatedNodeName changed or not, to ensure
// that pod pointer is updated.
npm.deleteUnlocked(oldPod)
npm.addNominatedPodUnlocked(logger, newPodInfo, nominatingInfo)
}
// DeleteNominatedPodIfExists deletes <pod> from nominatedPods.
func (npm *nominator) DeleteNominatedPodIfExists(pod *v1.Pod) {
npm.nLock.Lock()
npm.deleteUnlocked(pod)
npm.nLock.Unlock()
}
func (npm *nominator) deleteUnlocked(p *v1.Pod) {
nnn, ok := npm.nominatedPodToNode[p.UID]
if !ok {
return
}
for i, np := range npm.nominatedPods[nnn] {
if np.uid == p.UID {
npm.nominatedPods[nnn] = append(npm.nominatedPods[nnn][:i], npm.nominatedPods[nnn][i+1:]...)
if len(npm.nominatedPods[nnn]) == 0 {
delete(npm.nominatedPods, nnn)
}
break
}
}
delete(npm.nominatedPodToNode, p.UID)
}
func (npm *nominator) nominatedPodsForNode(nodeName string) []podRef {
npm.nLock.RLock()
defer npm.nLock.RUnlock()
return slices.Clone(npm.nominatedPods[nodeName])
}
// nominatedNodeName returns nominated node name of a Pod.
func nominatedNodeName(pod *v1.Pod) string {
return pod.Status.NominatedNodeName
}
type podRef struct {
name string
namespace string
uid types.UID
}
func podToRef(pod *v1.Pod) podRef {
return podRef{
name: pod.Name,
namespace: pod.Namespace,
uid: pod.UID,
}
}
func (np podRef) toPod() *v1.Pod {
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: np.name,
Namespace: np.namespace,
UID: np.uid,
},
}
}

File diff suppressed because it is too large

View File

@ -0,0 +1,63 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queue
import (
"context"
"time"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
"k8s.io/client-go/kubernetes/fake"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
// NewTestQueue creates a priority queue with an empty informer factory.
func NewTestQueue(ctx context.Context, lessFn framework.LessFunc, opts ...Option) *PriorityQueue {
return NewTestQueueWithObjects(ctx, lessFn, nil, opts...)
}
// NewTestQueueWithObjects creates a priority queue with an informer factory
// populated with the provided objects.
func NewTestQueueWithObjects(
ctx context.Context,
lessFn framework.LessFunc,
objs []runtime.Object,
opts ...Option,
) *PriorityQueue {
informerFactory := informers.NewSharedInformerFactory(fake.NewClientset(objs...), 0)
// Because some major functions (e.g., Pop) require the metric recorder to be set,
// we always set a metric recorder here.
recorder := metrics.NewMetricsAsyncRecorder(10, 20*time.Microsecond, ctx.Done())
// We set it before the options that users provide, so that users can override it.
opts = append([]Option{WithMetricsRecorder(*recorder)}, opts...)
return NewTestQueueWithInformerFactory(ctx, lessFn, informerFactory, opts...)
}
func NewTestQueueWithInformerFactory(
ctx context.Context,
lessFn framework.LessFunc,
informerFactory informers.SharedInformerFactory,
opts ...Option,
) *PriorityQueue {
pq := NewPriorityQueue(lessFn, informerFactory, opts...)
informerFactory.Start(ctx.Done())
informerFactory.WaitForCacheSync(ctx.Done())
return pq
}

View File

@ -0,0 +1,665 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"context"
"fmt"
"strings"
"time"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/client-go/dynamic/dynamicinformer"
"k8s.io/client-go/informers"
"k8s.io/client-go/tools/cache"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
corev1nodeaffinity "k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/backend/queue"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/profile"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
func (sched *Scheduler) addNodeToCache(obj interface{}) {
evt := framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add}
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
node, ok := obj.(*v1.Node)
if !ok {
logger.Error(nil, "Cannot convert to *v1.Node", "obj", obj)
return
}
logger.V(3).Info("Add event for node", "node", klog.KObj(node))
nodeInfo := sched.Cache.AddNode(logger, node)
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, nil, node, preCheckForNode(nodeInfo))
}
func (sched *Scheduler) updateNodeInCache(oldObj, newObj interface{}) {
start := time.Now()
logger := sched.logger
oldNode, ok := oldObj.(*v1.Node)
if !ok {
logger.Error(nil, "Cannot convert oldObj to *v1.Node", "oldObj", oldObj)
return
}
newNode, ok := newObj.(*v1.Node)
if !ok {
logger.Error(nil, "Cannot convert newObj to *v1.Node", "newObj", newObj)
return
}
logger.V(4).Info("Update event for node", "node", klog.KObj(newNode))
nodeInfo := sched.Cache.UpdateNode(logger, oldNode, newNode)
events := framework.NodeSchedulingPropertiesChange(newNode, oldNode)
// Save the time it takes to update the node in the cache.
updatingDuration := metrics.SinceInSeconds(start)
// Only requeue unschedulable pods if the node became more schedulable.
for _, evt := range events {
startMoving := time.Now()
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, oldNode, newNode, preCheckForNode(nodeInfo))
movingDuration := metrics.SinceInSeconds(startMoving)
metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(updatingDuration + movingDuration)
}
}
func (sched *Scheduler) deleteNodeFromCache(obj interface{}) {
evt := framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Delete}
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
var node *v1.Node
switch t := obj.(type) {
case *v1.Node:
node = t
case cache.DeletedFinalStateUnknown:
var ok bool
node, ok = t.Obj.(*v1.Node)
if !ok {
logger.Error(nil, "Cannot convert to *v1.Node", "obj", t.Obj)
return
}
default:
logger.Error(nil, "Cannot convert to *v1.Node", "obj", t)
return
}
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, node, nil, nil)
logger.V(3).Info("Delete event for node", "node", klog.KObj(node))
if err := sched.Cache.RemoveNode(logger, node); err != nil {
logger.Error(err, "Scheduler cache RemoveNode failed")
}
}
func (sched *Scheduler) addPodToSchedulingQueue(obj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventUnscheduledPodAdd.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
pod := obj.(*v1.Pod)
logger.V(3).Info("Add event for unscheduled pod", "pod", klog.KObj(pod))
sched.SchedulingQueue.Add(logger, pod)
}
func (sched *Scheduler) updatePodInSchedulingQueue(oldObj, newObj interface{}) {
start := time.Now()
logger := sched.logger
oldPod, newPod := oldObj.(*v1.Pod), newObj.(*v1.Pod)
// Bypass update event that carries identical objects; otherwise, a duplicated
// Pod may go through scheduling and cause unexpected behavior (see #96071).
if oldPod.ResourceVersion == newPod.ResourceVersion {
return
}
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventUnscheduledPodUpdate.Label()).Observe(metrics.SinceInSeconds(start))
for _, evt := range framework.PodSchedulingPropertiesChange(newPod, oldPod) {
if evt.Label() != framework.EventUnscheduledPodUpdate.Label() {
defer metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
}
}
isAssumed, err := sched.Cache.IsAssumedPod(newPod)
if err != nil {
utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", newPod.Namespace, newPod.Name, err))
}
if isAssumed {
return
}
logger.V(4).Info("Update event for unscheduled pod", "pod", klog.KObj(newPod))
sched.SchedulingQueue.Update(logger, oldPod, newPod)
}
func (sched *Scheduler) deletePodFromSchedulingQueue(obj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventUnscheduledPodDelete.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
var pod *v1.Pod
switch t := obj.(type) {
case *v1.Pod:
pod = obj.(*v1.Pod)
case cache.DeletedFinalStateUnknown:
var ok bool
pod, ok = t.Obj.(*v1.Pod)
if !ok {
utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched))
return
}
default:
utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj))
return
}
logger.V(3).Info("Delete event for unscheduled pod", "pod", klog.KObj(pod))
sched.SchedulingQueue.Delete(pod)
fwk, err := sched.frameworkForPod(pod)
if err != nil {
// This shouldn't happen, because we only accept for scheduling the pods
// which specify a scheduler name that matches one of the profiles.
logger.Error(err, "Unable to get profile", "pod", klog.KObj(pod))
return
}
// If a waiting pod is rejected, it indicates it was previously assumed and we're
// removing it from the scheduler cache. In this case, signal an AssignedPodDelete
// event to immediately retry some unscheduled Pods.
if fwk.RejectWaitingPod(pod.UID) {
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, nil)
}
}
func (sched *Scheduler) addPodToCache(obj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventAssignedPodAdd.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
pod, ok := obj.(*v1.Pod)
if !ok {
logger.Error(nil, "Cannot convert to *v1.Pod", "obj", obj)
return
}
logger.V(3).Info("Add event for scheduled pod", "pod", klog.KObj(pod))
if err := sched.Cache.AddPod(logger, pod); err != nil {
logger.Error(err, "Scheduler cache AddPod failed", "pod", klog.KObj(pod))
}
// SchedulingQueue.AssignedPodAdded has a problem:
// It internally pre-filters Pods to move to activeQ,
// while taking only in-tree plugins into consideration.
// Consequently, if custom plugins that subscribe to Pod/Add events reject Pods,
// those Pods will never be requeued to activeQ by assigned-Pod-related events,
// and they may be stuck in unschedulableQ.
//
// Here we use MoveAllToActiveOrBackoffQueue only when QueueingHint is enabled.
// (We cannot switch to MoveAllToActiveOrBackoffQueue right away because of throughput concern.)
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodAdd, nil, pod, nil)
} else {
sched.SchedulingQueue.AssignedPodAdded(logger, pod)
}
}
func (sched *Scheduler) updatePodInCache(oldObj, newObj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventAssignedPodUpdate.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
oldPod, ok := oldObj.(*v1.Pod)
if !ok {
logger.Error(nil, "Cannot convert oldObj to *v1.Pod", "oldObj", oldObj)
return
}
newPod, ok := newObj.(*v1.Pod)
if !ok {
logger.Error(nil, "Cannot convert newObj to *v1.Pod", "newObj", newObj)
return
}
logger.V(4).Info("Update event for scheduled pod", "pod", klog.KObj(oldPod))
if err := sched.Cache.UpdatePod(logger, oldPod, newPod); err != nil {
logger.Error(err, "Scheduler cache UpdatePod failed", "pod", klog.KObj(oldPod))
}
events := framework.PodSchedulingPropertiesChange(newPod, oldPod)
// Save the time it takes to update the pod in the cache.
updatingDuration := metrics.SinceInSeconds(start)
for _, evt := range events {
startMoving := time.Now()
// SchedulingQueue.AssignedPodUpdated has a problem:
// It internally pre-filters Pods to move to activeQ,
// while taking only in-tree plugins into consideration.
// Consequently, if custom plugins that subscribe to Pod/Update events reject Pods,
// those Pods will never be requeued to activeQ by assigned-Pod-related events,
// and they may be stuck in unschedulableQ.
//
// Here we use MoveAllToActiveOrBackoffQueue only when QueueingHint is enabled.
// (We cannot switch to MoveAllToActiveOrBackoffQueue right away because of throughput concern.)
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, oldPod, newPod, nil)
} else {
sched.SchedulingQueue.AssignedPodUpdated(logger, oldPod, newPod, evt)
}
movingDuration := metrics.SinceInSeconds(startMoving)
metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(updatingDuration + movingDuration)
}
}
func (sched *Scheduler) deletePodFromCache(obj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(framework.EventAssignedPodDelete.Label()).Observe(metrics.SinceInSeconds(start))
logger := sched.logger
var pod *v1.Pod
switch t := obj.(type) {
case *v1.Pod:
pod = t
case cache.DeletedFinalStateUnknown:
var ok bool
pod, ok = t.Obj.(*v1.Pod)
if !ok {
logger.Error(nil, "Cannot convert to *v1.Pod", "obj", t.Obj)
return
}
default:
logger.Error(nil, "Cannot convert to *v1.Pod", "obj", t)
return
}
logger.V(3).Info("Delete event for scheduled pod", "pod", klog.KObj(pod))
if err := sched.Cache.RemovePod(logger, pod); err != nil {
logger.Error(err, "Scheduler cache RemovePod failed", "pod", klog.KObj(pod))
}
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, framework.EventAssignedPodDelete, pod, nil, nil)
}
// assignedPod selects pods that are assigned (scheduled and running).
func assignedPod(pod *v1.Pod) bool {
return len(pod.Spec.NodeName) != 0
}
// responsibleForPod returns true if the pod has asked to be scheduled by the given scheduler.
func responsibleForPod(pod *v1.Pod, profiles profile.Map) bool {
return profiles.HandlesSchedulerName(pod.Spec.SchedulerName)
}
const (
// syncedPollPeriod controls how often to poll the registered event handlers for their sync status
syncedPollPeriod = 100 * time.Millisecond
)
// WaitForHandlersSync waits for EventHandlers to sync.
// It returns an error if the context is cancelled before all registered handlers have synced.
func (sched *Scheduler) WaitForHandlersSync(ctx context.Context) error {
return wait.PollUntilContextCancel(ctx, syncedPollPeriod, true, func(ctx context.Context) (done bool, err error) {
for _, handler := range sched.registeredHandlers {
if !handler.HasSynced() {
return false, nil
}
}
return true, nil
})
}
// addAllEventHandlers is a helper function used in tests and in Scheduler
// to add event handlers for various informers.
func addAllEventHandlers(
sched *Scheduler,
informerFactory informers.SharedInformerFactory,
dynInformerFactory dynamicinformer.DynamicSharedInformerFactory,
resourceClaimCache *assumecache.AssumeCache,
gvkMap map[framework.EventResource]framework.ActionType,
) error {
var (
handlerRegistration cache.ResourceEventHandlerRegistration
err error
handlers []cache.ResourceEventHandlerRegistration
)
// scheduled pod cache
if handlerRegistration, err = informerFactory.Core().V1().Pods().Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
return assignedPod(t)
case cache.DeletedFinalStateUnknown:
if _, ok := t.Obj.(*v1.Pod); ok {
// The carried object may be stale, so we don't use it to check if
// it's assigned or not. Attempt the cleanup anyway.
return true
}
utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched))
return false
default:
utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj))
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: sched.addPodToCache,
UpdateFunc: sched.updatePodInCache,
DeleteFunc: sched.deletePodFromCache,
},
},
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
// unscheduled pod queue
if handlerRegistration, err = informerFactory.Core().V1().Pods().Informer().AddEventHandler(
cache.FilteringResourceEventHandler{
FilterFunc: func(obj interface{}) bool {
switch t := obj.(type) {
case *v1.Pod:
return !assignedPod(t) && responsibleForPod(t, sched.Profiles)
case cache.DeletedFinalStateUnknown:
if pod, ok := t.Obj.(*v1.Pod); ok {
// The carried object may be stale, so we don't use it to check if
// it's assigned or not.
return responsibleForPod(pod, sched.Profiles)
}
utilruntime.HandleError(fmt.Errorf("unable to convert object %T to *v1.Pod in %T", obj, sched))
return false
default:
utilruntime.HandleError(fmt.Errorf("unable to handle object in %T: %T", sched, obj))
return false
}
},
Handler: cache.ResourceEventHandlerFuncs{
AddFunc: sched.addPodToSchedulingQueue,
UpdateFunc: sched.updatePodInSchedulingQueue,
DeleteFunc: sched.deletePodFromSchedulingQueue,
},
},
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
if handlerRegistration, err = informerFactory.Core().V1().Nodes().Informer().AddEventHandler(
cache.ResourceEventHandlerFuncs{
AddFunc: sched.addNodeToCache,
UpdateFunc: sched.updateNodeInCache,
DeleteFunc: sched.deleteNodeFromCache,
},
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
logger := sched.logger
buildEvtResHandler := func(at framework.ActionType, resource framework.EventResource) cache.ResourceEventHandlerFuncs {
funcs := cache.ResourceEventHandlerFuncs{}
if at&framework.Add != 0 {
evt := framework.ClusterEvent{Resource: resource, ActionType: framework.Add}
funcs.AddFunc = func(obj interface{}) {
start := time.Now()
defer metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
if resource == framework.StorageClass && !utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
sc, ok := obj.(*storagev1.StorageClass)
if !ok {
logger.Error(nil, "Cannot convert to *storagev1.StorageClass", "obj", obj)
return
}
// CheckVolumeBindingPred fails if the pod has unbound immediate PVCs. If these
// PVCs have a StorageClass name specified, creating StorageClass objects
// with late binding will cause the predicate to pass, so we need to move pods
// to the active queue.
// We don't need to invalidate cached results because results will not be
// cached for a pod that has unbound immediate PVCs.
if sc.VolumeBindingMode == nil || *sc.VolumeBindingMode != storagev1.VolumeBindingWaitForFirstConsumer {
return
}
}
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, nil, obj, nil)
}
}
if at&framework.Update != 0 {
evt := framework.ClusterEvent{Resource: resource, ActionType: framework.Update}
funcs.UpdateFunc = func(old, obj interface{}) {
start := time.Now()
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, old, obj, nil)
metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
}
}
if at&framework.Delete != 0 {
evt := framework.ClusterEvent{Resource: resource, ActionType: framework.Delete}
funcs.DeleteFunc = func(obj interface{}) {
start := time.Now()
sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, evt, obj, nil, nil)
metrics.EventHandlingLatency.WithLabelValues(evt.Label()).Observe(metrics.SinceInSeconds(start))
}
}
return funcs
}
for gvk, at := range gvkMap {
switch gvk {
case framework.Node, framework.Pod:
// Do nothing.
case framework.CSINode:
if handlerRegistration, err = informerFactory.Storage().V1().CSINodes().Informer().AddEventHandler(
buildEvtResHandler(at, framework.CSINode),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.CSIDriver:
if handlerRegistration, err = informerFactory.Storage().V1().CSIDrivers().Informer().AddEventHandler(
buildEvtResHandler(at, framework.CSIDriver),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.CSIStorageCapacity:
if handlerRegistration, err = informerFactory.Storage().V1().CSIStorageCapacities().Informer().AddEventHandler(
buildEvtResHandler(at, framework.CSIStorageCapacity),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.PersistentVolume:
// MaxPDVolumeCountPredicate: since it relies on the counts of PV.
//
// PvAdd: Pods created when there are no PVs available will be stuck in the
// unschedulable queue. But unbound PVs created for static provisioning and
// delay-binding storage classes are skipped in the PV controller's dynamic
// provisioning and binding process, and will not trigger events to schedule the
// pod again. So we need to move pods to the active queue on PV add for this
// scenario.
//
// PvUpdate: Scheduler.bindVolumesWorker may fail to update assumed pod volume
// bindings due to conflicts if PVs are updated by the PV controller or other
// parties; the scheduler will then add the pod back to the unschedulable queue.
// We need to move pods to the active queue on PV update for this scenario.
if handlerRegistration, err = informerFactory.Core().V1().PersistentVolumes().Informer().AddEventHandler(
buildEvtResHandler(at, framework.PersistentVolume),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.PersistentVolumeClaim:
// MaxPDVolumeCountPredicate: add/update PVC will affect counts of PV when it is bound.
if handlerRegistration, err = informerFactory.Core().V1().PersistentVolumeClaims().Informer().AddEventHandler(
buildEvtResHandler(at, framework.PersistentVolumeClaim),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.ResourceClaim:
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
handlerRegistration = resourceClaimCache.AddEventHandler(
buildEvtResHandler(at, framework.ResourceClaim),
)
handlers = append(handlers, handlerRegistration)
}
case framework.ResourceSlice:
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
if handlerRegistration, err = informerFactory.Resource().V1beta1().ResourceSlices().Informer().AddEventHandler(
buildEvtResHandler(at, framework.ResourceSlice),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
}
case framework.DeviceClass:
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
if handlerRegistration, err = informerFactory.Resource().V1beta1().DeviceClasses().Informer().AddEventHandler(
buildEvtResHandler(at, framework.DeviceClass),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
}
case framework.StorageClass:
if handlerRegistration, err = informerFactory.Storage().V1().StorageClasses().Informer().AddEventHandler(
buildEvtResHandler(at, framework.StorageClass),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
case framework.VolumeAttachment:
if handlerRegistration, err = informerFactory.Storage().V1().VolumeAttachments().Informer().AddEventHandler(
buildEvtResHandler(at, framework.VolumeAttachment),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
default:
// Tests may not instantiate dynInformerFactory.
if dynInformerFactory == nil {
continue
}
// GVK is expected to be at least 3-folded, separated by dots.
// <kind in plural>.<version>.<group>
// Valid examples:
// - foos.v1.example.com
// - bars.v1beta1.a.b.c
// Invalid examples:
// - foos.v1 (2 sections)
// - foo.v1.example.com (the first section should be plural)
if strings.Count(string(gvk), ".") < 2 {
logger.Error(nil, "incorrect event registration", "gvk", gvk)
continue
}
// Fall back to try dynamic informers.
gvr, _ := schema.ParseResourceArg(string(gvk))
dynInformer := dynInformerFactory.ForResource(*gvr).Informer()
if handlerRegistration, err = dynInformer.AddEventHandler(
buildEvtResHandler(at, gvk),
); err != nil {
return err
}
handlers = append(handlers, handlerRegistration)
}
}
sched.registeredHandlers = handlers
return nil
}
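// Illustrative sketch only, not part of the upstream file: the shape of the
// gvkMap argument that addAllEventHandlers expects. In the real scheduler this
// map is normally derived from the enabled plugins' event registrations; the
// entries and helper name below are made-up examples.
func exampleGVKMap() map[framework.EventResource]framework.ActionType {
	return map[framework.EventResource]framework.ActionType{
		// Pod and Node informers are always registered above; other resources
		// are only watched for the actions some plugin cares about.
		framework.Node:                  framework.Add | framework.Update,
		framework.PersistentVolumeClaim: framework.Add | framework.Update,
		framework.StorageClass:          framework.Add,
	}
}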
func preCheckForNode(nodeInfo *framework.NodeInfo) queue.PreEnqueueCheck {
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
// QHint was initially created with the motivation of replacing this preCheck.
// The preCheck assumes that the scheduler only has in-tree plugins, which is problematic for extensibility.
// Here, we skip the preCheck if QHint is enabled, and we will eventually remove it once QHint graduates.
return nil
}
// Note: the following checks don't take preemption into consideration; in very rare
// cases (e.g., node resizing), "pod" may still fail a check but preemption helps. We deliberately
// chose to ignore those cases as unschedulable pods will be re-queued eventually.
return func(pod *v1.Pod) bool {
admissionResults := AdmissionCheck(pod, nodeInfo, false)
if len(admissionResults) != 0 {
return false
}
_, isUntolerated := corev1helpers.FindMatchingUntoleratedTaint(nodeInfo.Node().Spec.Taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
return t.Effect == v1.TaintEffectNoSchedule
})
return !isUntolerated
}
}
// AdmissionCheck calls the filtering logic of noderesources/nodeport/nodeAffinity/nodename
// and returns the failure reasons. It's used in kubelet(pkg/kubelet/lifecycle/predicate.go) and scheduler.
// It returns the first failure if `includeAllFailures` is set to false; otherwise
// returns all failures.
func AdmissionCheck(pod *v1.Pod, nodeInfo *framework.NodeInfo, includeAllFailures bool) []AdmissionResult {
var admissionResults []AdmissionResult
insufficientResources := noderesources.Fits(pod, nodeInfo, noderesources.ResourceRequestsOptions{
EnablePodLevelResources: utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
})
if len(insufficientResources) != 0 {
for i := range insufficientResources {
admissionResults = append(admissionResults, AdmissionResult{InsufficientResource: &insufficientResources[i]})
}
if !includeAllFailures {
return admissionResults
}
}
if matches, _ := corev1nodeaffinity.GetRequiredNodeAffinity(pod).Match(nodeInfo.Node()); !matches {
admissionResults = append(admissionResults, AdmissionResult{Name: nodeaffinity.Name, Reason: nodeaffinity.ErrReasonPod})
if !includeAllFailures {
return admissionResults
}
}
if !nodename.Fits(pod, nodeInfo) {
admissionResults = append(admissionResults, AdmissionResult{Name: nodename.Name, Reason: nodename.ErrReason})
if !includeAllFailures {
return admissionResults
}
}
if !nodeports.Fits(pod, nodeInfo) {
admissionResults = append(admissionResults, AdmissionResult{Name: nodeports.Name, Reason: nodeports.ErrReason})
if !includeAllFailures {
return admissionResults
}
}
return admissionResults
}
// AdmissionResult describes the reason why Scheduler can't admit the pod.
// If the reason is a resource fit one, then AdmissionResult.InsufficientResource includes the details.
type AdmissionResult struct {
Name string
Reason string
InsufficientResource *noderesources.InsufficientResource
}
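// Illustrative sketch only, not part of the upstream file: how a caller (for
// example an admission-style check) might consume AdmissionCheck. The helper
// name is made up; pod and nodeInfo are assumed to be supplied by the caller.
func exampleAdmissionCheck(pod *v1.Pod, nodeInfo *framework.NodeInfo) error {
	// Ask for every failure so all reasons can be reported at once.
	results := AdmissionCheck(pod, nodeInfo, true)
	if len(results) == 0 {
		return nil // the pod fits on this node
	}
	reasons := make([]string, 0, len(results))
	for _, r := range results {
		if r.InsufficientResource != nil {
			// A resource-fit failure carries its details in InsufficientResource.
			reasons = append(reasons, r.InsufficientResource.Reason)
			continue
		}
		reasons = append(reasons, fmt.Sprintf("%s: %s", r.Name, r.Reason))
	}
	return fmt.Errorf("pod %s/%s cannot be admitted: %s", pod.Namespace, pod.Name, strings.Join(reasons, "; "))
}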

456
e2e/vendor/k8s.io/kubernetes/pkg/scheduler/extender.go generated vendored Normal file
View File

@ -0,0 +1,456 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package scheduler
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"net/http"
"strings"
"time"
v1 "k8s.io/api/core/v1"
utilnet "k8s.io/apimachinery/pkg/util/net"
"k8s.io/apimachinery/pkg/util/sets"
restclient "k8s.io/client-go/rest"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const (
// DefaultExtenderTimeout defines the default extender timeout in seconds.
DefaultExtenderTimeout = 5 * time.Second
)
// HTTPExtender implements the Extender interface.
type HTTPExtender struct {
extenderURL string
preemptVerb string
filterVerb string
prioritizeVerb string
bindVerb string
weight int64
client *http.Client
nodeCacheCapable bool
managedResources sets.Set[string]
ignorable bool
}
func makeTransport(config *schedulerapi.Extender) (http.RoundTripper, error) {
var cfg restclient.Config
if config.TLSConfig != nil {
cfg.TLSClientConfig.Insecure = config.TLSConfig.Insecure
cfg.TLSClientConfig.ServerName = config.TLSConfig.ServerName
cfg.TLSClientConfig.CertFile = config.TLSConfig.CertFile
cfg.TLSClientConfig.KeyFile = config.TLSConfig.KeyFile
cfg.TLSClientConfig.CAFile = config.TLSConfig.CAFile
cfg.TLSClientConfig.CertData = config.TLSConfig.CertData
cfg.TLSClientConfig.KeyData = config.TLSConfig.KeyData
cfg.TLSClientConfig.CAData = config.TLSConfig.CAData
}
if config.EnableHTTPS {
hasCA := len(cfg.CAFile) > 0 || len(cfg.CAData) > 0
if !hasCA {
cfg.Insecure = true
}
}
tlsConfig, err := restclient.TLSConfigFor(&cfg)
if err != nil {
return nil, err
}
if tlsConfig != nil {
return utilnet.SetTransportDefaults(&http.Transport{
TLSClientConfig: tlsConfig,
}), nil
}
return utilnet.SetTransportDefaults(&http.Transport{}), nil
}
// NewHTTPExtender creates an HTTPExtender object.
func NewHTTPExtender(config *schedulerapi.Extender) (framework.Extender, error) {
if config.HTTPTimeout.Duration.Nanoseconds() == 0 {
config.HTTPTimeout.Duration = DefaultExtenderTimeout
}
transport, err := makeTransport(config)
if err != nil {
return nil, err
}
client := &http.Client{
Transport: transport,
Timeout: config.HTTPTimeout.Duration,
}
managedResources := sets.New[string]()
for _, r := range config.ManagedResources {
managedResources.Insert(string(r.Name))
}
return &HTTPExtender{
extenderURL: config.URLPrefix,
preemptVerb: config.PreemptVerb,
filterVerb: config.FilterVerb,
prioritizeVerb: config.PrioritizeVerb,
bindVerb: config.BindVerb,
weight: config.Weight,
client: client,
nodeCacheCapable: config.NodeCacheCapable,
managedResources: managedResources,
ignorable: config.Ignorable,
}, nil
}
// Name returns extenderURL to identify the extender.
func (h *HTTPExtender) Name() string {
return h.extenderURL
}
// IsIgnorable returns true to indicate that scheduling should not fail when this
// extender is unavailable.
func (h *HTTPExtender) IsIgnorable() bool {
return h.ignorable
}
// SupportsPreemption returns true if an extender supports preemption.
// An extender should have the preempt verb defined and enable its own node cache.
func (h *HTTPExtender) SupportsPreemption() bool {
return len(h.preemptVerb) > 0
}
// ProcessPreemption returns filtered candidate nodes and victims after running preemption logic in extender.
func (h *HTTPExtender) ProcessPreemption(
pod *v1.Pod,
nodeNameToVictims map[string]*extenderv1.Victims,
nodeInfos framework.NodeInfoLister,
) (map[string]*extenderv1.Victims, error) {
var (
result extenderv1.ExtenderPreemptionResult
args *extenderv1.ExtenderPreemptionArgs
)
if !h.SupportsPreemption() {
return nil, fmt.Errorf("preempt verb is not defined for extender %v but run into ProcessPreemption", h.extenderURL)
}
if h.nodeCacheCapable {
// If extender has cached node info, pass NodeNameToMetaVictims in args.
nodeNameToMetaVictims := convertToMetaVictims(nodeNameToVictims)
args = &extenderv1.ExtenderPreemptionArgs{
Pod: pod,
NodeNameToMetaVictims: nodeNameToMetaVictims,
}
} else {
args = &extenderv1.ExtenderPreemptionArgs{
Pod: pod,
NodeNameToVictims: nodeNameToVictims,
}
}
if err := h.send(h.preemptVerb, args, &result); err != nil {
return nil, err
}
// Extender will always return NodeNameToMetaVictims.
// So let's convert it to NodeNameToVictims by using <nodeInfos>.
newNodeNameToVictims, err := h.convertToVictims(result.NodeNameToMetaVictims, nodeInfos)
if err != nil {
return nil, err
}
// Do not override <nodeNameToVictims>.
return newNodeNameToVictims, nil
}
// convertToVictims converts "nodeNameToMetaVictims" from object identifiers,
// such as UIDs and names, to object pointers.
func (h *HTTPExtender) convertToVictims(
nodeNameToMetaVictims map[string]*extenderv1.MetaVictims,
nodeInfos framework.NodeInfoLister,
) (map[string]*extenderv1.Victims, error) {
nodeNameToVictims := map[string]*extenderv1.Victims{}
for nodeName, metaVictims := range nodeNameToMetaVictims {
nodeInfo, err := nodeInfos.Get(nodeName)
if err != nil {
return nil, err
}
victims := &extenderv1.Victims{
Pods: []*v1.Pod{},
NumPDBViolations: metaVictims.NumPDBViolations,
}
for _, metaPod := range metaVictims.Pods {
pod, err := h.convertPodUIDToPod(metaPod, nodeInfo)
if err != nil {
return nil, err
}
victims.Pods = append(victims.Pods, pod)
}
nodeNameToVictims[nodeName] = victims
}
return nodeNameToVictims, nil
}
// convertPodUIDToPod returns v1.Pod object for given MetaPod and node info.
// The v1.Pod object is recovered from nodeInfo.Pods.
// It returns an error if there's cache inconsistency between default scheduler
// and extender, i.e. when the pod is not found in nodeInfo.Pods.
func (h *HTTPExtender) convertPodUIDToPod(
metaPod *extenderv1.MetaPod,
nodeInfo *framework.NodeInfo) (*v1.Pod, error) {
for _, p := range nodeInfo.Pods {
if string(p.Pod.UID) == metaPod.UID {
return p.Pod, nil
}
}
return nil, fmt.Errorf("extender: %v claims to preempt pod (UID: %v) on node: %v, but the pod is not found on that node",
h.extenderURL, metaPod.UID, nodeInfo.Node().Name)
}
// convertToMetaVictims converts from struct type to meta types.
func convertToMetaVictims(
nodeNameToVictims map[string]*extenderv1.Victims,
) map[string]*extenderv1.MetaVictims {
nodeNameToMetaVictims := map[string]*extenderv1.MetaVictims{}
for node, victims := range nodeNameToVictims {
metaVictims := &extenderv1.MetaVictims{
Pods: []*extenderv1.MetaPod{},
NumPDBViolations: victims.NumPDBViolations,
}
for _, pod := range victims.Pods {
metaPod := &extenderv1.MetaPod{
UID: string(pod.UID),
}
metaVictims.Pods = append(metaVictims.Pods, metaPod)
}
nodeNameToMetaVictims[node] = metaVictims
}
return nodeNameToMetaVictims
}
// Filter based on extender implemented predicate functions. The filtered list is
// expected to be a subset of the supplied list; otherwise the function returns an error.
// The failedNodes and failedAndUnresolvableNodes optionally contain the list
// of failed nodes and failure reasons, except nodes in the latter are
// unresolvable.
func (h *HTTPExtender) Filter(
pod *v1.Pod,
nodes []*framework.NodeInfo,
) (filteredList []*framework.NodeInfo, failedNodes, failedAndUnresolvableNodes extenderv1.FailedNodesMap, err error) {
var (
result extenderv1.ExtenderFilterResult
nodeList *v1.NodeList
nodeNames *[]string
nodeResult []*framework.NodeInfo
args *extenderv1.ExtenderArgs
)
fromNodeName := make(map[string]*framework.NodeInfo)
for _, n := range nodes {
fromNodeName[n.Node().Name] = n
}
if h.filterVerb == "" {
return nodes, extenderv1.FailedNodesMap{}, extenderv1.FailedNodesMap{}, nil
}
if h.nodeCacheCapable {
nodeNameSlice := make([]string, 0, len(nodes))
for _, node := range nodes {
nodeNameSlice = append(nodeNameSlice, node.Node().Name)
}
nodeNames = &nodeNameSlice
} else {
nodeList = &v1.NodeList{}
for _, node := range nodes {
nodeList.Items = append(nodeList.Items, *node.Node())
}
}
args = &extenderv1.ExtenderArgs{
Pod: pod,
Nodes: nodeList,
NodeNames: nodeNames,
}
if err := h.send(h.filterVerb, args, &result); err != nil {
return nil, nil, nil, err
}
if result.Error != "" {
return nil, nil, nil, errors.New(result.Error)
}
if h.nodeCacheCapable && result.NodeNames != nil {
nodeResult = make([]*framework.NodeInfo, len(*result.NodeNames))
for i, nodeName := range *result.NodeNames {
if n, ok := fromNodeName[nodeName]; ok {
nodeResult[i] = n
} else {
return nil, nil, nil, fmt.Errorf(
"extender %q claims a filtered node %q which is not found in the input node list",
h.extenderURL, nodeName)
}
}
} else if result.Nodes != nil {
nodeResult = make([]*framework.NodeInfo, len(result.Nodes.Items))
for i := range result.Nodes.Items {
nodeResult[i] = framework.NewNodeInfo()
nodeResult[i].SetNode(&result.Nodes.Items[i])
}
}
return nodeResult, result.FailedNodes, result.FailedAndUnresolvableNodes, nil
}
// Prioritize based on extender implemented priority functions. Weight*priority is added
// up for each such priority function. The returned score is added to the score computed
// by Kubernetes scheduler. The total score is used to do the host selection.
func (h *HTTPExtender) Prioritize(pod *v1.Pod, nodes []*framework.NodeInfo) (*extenderv1.HostPriorityList, int64, error) {
var (
result extenderv1.HostPriorityList
nodeList *v1.NodeList
nodeNames *[]string
args *extenderv1.ExtenderArgs
)
if h.prioritizeVerb == "" {
result := extenderv1.HostPriorityList{}
for _, node := range nodes {
result = append(result, extenderv1.HostPriority{Host: node.Node().Name, Score: 0})
}
return &result, 0, nil
}
if h.nodeCacheCapable {
nodeNameSlice := make([]string, 0, len(nodes))
for _, node := range nodes {
nodeNameSlice = append(nodeNameSlice, node.Node().Name)
}
nodeNames = &nodeNameSlice
} else {
nodeList = &v1.NodeList{}
for _, node := range nodes {
nodeList.Items = append(nodeList.Items, *node.Node())
}
}
args = &extenderv1.ExtenderArgs{
Pod: pod,
Nodes: nodeList,
NodeNames: nodeNames,
}
if err := h.send(h.prioritizeVerb, args, &result); err != nil {
return nil, 0, err
}
return &result, h.weight, nil
}
// Bind delegates the action of binding a pod to a node to the extender.
func (h *HTTPExtender) Bind(binding *v1.Binding) error {
var result extenderv1.ExtenderBindingResult
if !h.IsBinder() {
// This shouldn't happen as this extender wouldn't have become a Binder.
return fmt.Errorf("unexpected empty bindVerb in extender")
}
req := &extenderv1.ExtenderBindingArgs{
PodName: binding.Name,
PodNamespace: binding.Namespace,
PodUID: binding.UID,
Node: binding.Target.Name,
}
if err := h.send(h.bindVerb, req, &result); err != nil {
return err
}
if result.Error != "" {
return errors.New(result.Error)
}
return nil
}
// IsBinder returns whether this extender is configured for the Bind method.
func (h *HTTPExtender) IsBinder() bool {
return h.bindVerb != ""
}
// IsPrioritizer returns whether this extender is configured for the Prioritize method.
func (h *HTTPExtender) IsPrioritizer() bool {
return h.prioritizeVerb != ""
}
// IsFilter returns whether this extender is configured for the Filter method.
func (h *HTTPExtender) IsFilter() bool {
return h.filterVerb != ""
}
// Helper function to send messages to the extender
func (h *HTTPExtender) send(action string, args interface{}, result interface{}) error {
out, err := json.Marshal(args)
if err != nil {
return err
}
url := strings.TrimRight(h.extenderURL, "/") + "/" + action
req, err := http.NewRequest("POST", url, bytes.NewReader(out))
if err != nil {
return err
}
req.Header.Set("Content-Type", "application/json")
resp, err := h.client.Do(req)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("failed %v with extender at URL %v, code %v", action, url, resp.StatusCode)
}
return json.NewDecoder(resp.Body).Decode(result)
}
// IsInterested returns true if at least one extended resource requested by
// this pod is managed by this extender.
func (h *HTTPExtender) IsInterested(pod *v1.Pod) bool {
if h.managedResources.Len() == 0 {
return true
}
if h.hasManagedResources(pod.Spec.Containers) {
return true
}
if h.hasManagedResources(pod.Spec.InitContainers) {
return true
}
return false
}
func (h *HTTPExtender) hasManagedResources(containers []v1.Container) bool {
for i := range containers {
container := &containers[i]
for resourceName := range container.Resources.Requests {
if h.managedResources.Has(string(resourceName)) {
return true
}
}
for resourceName := range container.Resources.Limits {
if h.managedResources.Has(string(resourceName)) {
return true
}
}
}
return false
}
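// Illustrative sketch only, not part of the upstream file: constructing an
// HTTPExtender from a typical configuration entry. The URL and verbs are
// placeholder values, not a real endpoint.
func exampleNewExtender() (framework.Extender, error) {
	cfg := &schedulerapi.Extender{
		URLPrefix:      "http://127.0.0.1:8888/scheduler", // hypothetical extender endpoint
		FilterVerb:     "filter",
		PrioritizeVerb: "prioritize",
		Weight:         1,
		// HTTPTimeout is left at zero, so NewHTTPExtender falls back to
		// DefaultExtenderTimeout.
	}
	return NewHTTPExtender(cfg)
}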

View File

@ -0,0 +1,123 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
"errors"
"sync"
"k8s.io/apimachinery/pkg/util/sets"
)
var (
// ErrNotFound is the not found error message.
ErrNotFound = errors.New("not found")
)
// StateData is a generic type for arbitrary data stored in CycleState.
type StateData interface {
// Clone is an interface to make a copy of StateData. For performance reasons,
// clone should make shallow copies for members (e.g., slices or maps) that are not
// impacted by PreFilter's optional AddPod/RemovePod methods.
Clone() StateData
}
// StateKey is the type of keys stored in CycleState.
type StateKey string
// CycleState provides a mechanism for plugins to store and retrieve arbitrary data.
// StateData stored by one plugin can be read, altered, or deleted by another plugin.
// CycleState does not provide any data protection, as all plugins are assumed to be
// trusted.
// Note: CycleState uses a sync.Map to back the storage, because it is thread safe and optimized for the "write once, read many times" scenario.
// This is the recommended pattern in all in-tree plugins: plugin-specific state is written once in PreFilter/PreScore and afterwards read many times in Filter/Score.
type CycleState struct {
// storage is keyed with StateKey, and valued with StateData.
storage sync.Map
// if recordPluginMetrics is true, metrics.PluginExecutionDuration will be recorded for this cycle.
recordPluginMetrics bool
// SkipFilterPlugins are plugins that will be skipped in the Filter extension point.
SkipFilterPlugins sets.Set[string]
// SkipScorePlugins are plugins that will be skipped in the Score extension point.
SkipScorePlugins sets.Set[string]
}
// NewCycleState initializes a new CycleState and returns its pointer.
func NewCycleState() *CycleState {
return &CycleState{}
}
// ShouldRecordPluginMetrics returns whether metrics.PluginExecutionDuration metrics should be recorded.
func (c *CycleState) ShouldRecordPluginMetrics() bool {
if c == nil {
return false
}
return c.recordPluginMetrics
}
// SetRecordPluginMetrics sets recordPluginMetrics to the given value.
func (c *CycleState) SetRecordPluginMetrics(flag bool) {
if c == nil {
return
}
c.recordPluginMetrics = flag
}
// Clone creates a copy of CycleState and returns its pointer. Clone returns
// nil if the context being cloned is nil.
func (c *CycleState) Clone() *CycleState {
if c == nil {
return nil
}
copy := NewCycleState()
// Safe copy storage in case of overwriting.
c.storage.Range(func(k, v interface{}) bool {
copy.storage.Store(k, v.(StateData).Clone())
return true
})
// The below are not mutated, so we don't have to safe copy.
copy.recordPluginMetrics = c.recordPluginMetrics
copy.SkipFilterPlugins = c.SkipFilterPlugins
copy.SkipScorePlugins = c.SkipScorePlugins
return copy
}
// Read retrieves data with the given "key" from CycleState. If the key is not
// present, ErrNotFound is returned.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Read(key StateKey) (StateData, error) {
if v, ok := c.storage.Load(key); ok {
return v.(StateData), nil
}
return nil, ErrNotFound
}
// Write stores the given "val" in CycleState with the given "key".
//
// See CycleState for notes on concurrency.
func (c *CycleState) Write(key StateKey, val StateData) {
c.storage.Store(key, val)
}
// Delete deletes data with the given key from CycleState.
//
// See CycleState for notes on concurrency.
func (c *CycleState) Delete(key StateKey) {
c.storage.Delete(key)
}
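// Illustrative sketch only, not part of the upstream file: the typical
// write-once/read-many use of CycleState by a plugin. The type and key names
// are made up for the example.
type exampleStateData struct {
	matchingNodes int
}

// Clone returns a copy; a shallow copy is fine because the struct holds no
// shared slices or maps.
func (d *exampleStateData) Clone() StateData {
	c := *d
	return &c
}

const exampleStateKey StateKey = "example.com/pre-filter-state"

func exampleCycleStateUsage() (int, error) {
	cs := NewCycleState()
	// Typically written once in PreFilter/PreScore ...
	cs.Write(exampleStateKey, &exampleStateData{matchingNodes: 3})
	// ... and read many times in Filter/Score.
	v, err := cs.Read(exampleStateKey)
	if err != nil {
		return 0, err // ErrNotFound when the key was never written
	}
	return v.(*exampleStateData).matchingNodes, nil
}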

View File

@ -0,0 +1,229 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-helpers/resource"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/kubernetes/pkg/features"
)
// Special event labels.
const (
// ScheduleAttemptFailure is the event when a schedule attempt fails.
ScheduleAttemptFailure = "ScheduleAttemptFailure"
// BackoffComplete is the event when a pod finishes backoff.
BackoffComplete = "BackoffComplete"
// ForceActivate is the event when a pod is moved from unschedulablePods/backoffQ
// to activeQ. Usually it's triggered by plugin implementations.
ForceActivate = "ForceActivate"
// UnschedulableTimeout is the event when a pod is moved from unschedulablePods
// due to the timeout specified at pod-max-in-unschedulable-pods-duration.
UnschedulableTimeout = "UnschedulableTimeout"
)
var (
// EventAssignedPodAdd is the event when an assigned pod is added.
EventAssignedPodAdd = ClusterEvent{Resource: assignedPod, ActionType: Add}
// EventAssignedPodUpdate is the event when an assigned pod is updated.
EventAssignedPodUpdate = ClusterEvent{Resource: assignedPod, ActionType: Update}
// EventAssignedPodDelete is the event when an assigned pod is deleted.
EventAssignedPodDelete = ClusterEvent{Resource: assignedPod, ActionType: Delete}
// EventUnscheduledPodAdd is the event when an unscheduled pod is added.
EventUnscheduledPodAdd = ClusterEvent{Resource: unschedulablePod, ActionType: Add}
// EventUnscheduledPodUpdate is the event when an unscheduled pod is updated.
EventUnscheduledPodUpdate = ClusterEvent{Resource: unschedulablePod, ActionType: Update}
// EventUnscheduledPodDelete is the event when an unscheduled pod is deleted.
EventUnscheduledPodDelete = ClusterEvent{Resource: unschedulablePod, ActionType: Delete}
// EventUnschedulableTimeout is the event when a pod stays in unschedulable for longer than timeout.
EventUnschedulableTimeout = ClusterEvent{Resource: WildCard, ActionType: All, label: UnschedulableTimeout}
// EventForceActivate is the event when a pod is moved from unschedulablePods/backoffQ to activeQ.
EventForceActivate = ClusterEvent{Resource: WildCard, ActionType: All, label: ForceActivate}
)
// PodSchedulingPropertiesChange interprets the update of a pod and returns corresponding UpdatePodXYZ event(s).
// Once we have other pod update events, we should update here as well.
func PodSchedulingPropertiesChange(newPod *v1.Pod, oldPod *v1.Pod) (events []ClusterEvent) {
r := assignedPod
if newPod.Spec.NodeName == "" {
r = unschedulablePod
}
podChangeExtracters := []podChangeExtractor{
extractPodLabelsChange,
extractPodScaleDown,
extractPodSchedulingGateEliminatedChange,
extractPodTolerationChange,
}
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
podChangeExtracters = append(podChangeExtracters, extractPodGeneratedResourceClaimChange)
}
for _, fn := range podChangeExtracters {
if event := fn(newPod, oldPod); event != none {
events = append(events, ClusterEvent{Resource: r, ActionType: event})
}
}
if len(events) == 0 {
// When no specific event is found, we use AssignedPodOtherUpdate,
// which should only trigger plugins registering a general Pod/Update event.
events = append(events, ClusterEvent{Resource: r, ActionType: updatePodOther})
}
return
}
type podChangeExtractor func(newPod *v1.Pod, oldPod *v1.Pod) ActionType
// extractPodScaleDown interprets the update of a pod and returns PodRequestScaledDown event if any pod's resource request(s) is scaled down.
func extractPodScaleDown(newPod, oldPod *v1.Pod) ActionType {
opt := resource.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
}
newPodRequests := resource.PodRequests(newPod, opt)
oldPodRequests := resource.PodRequests(oldPod, opt)
for rName, oldReq := range oldPodRequests {
newReq, ok := newPodRequests[rName]
if !ok {
// The resource request of rName is removed.
return UpdatePodScaleDown
}
if oldReq.MilliValue() > newReq.MilliValue() {
// The resource request of rName is scaled down.
return UpdatePodScaleDown
}
}
return none
}
func extractPodLabelsChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if isLabelChanged(newPod.GetLabels(), oldPod.GetLabels()) {
return UpdatePodLabel
}
return none
}
func extractPodTolerationChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if len(newPod.Spec.Tolerations) != len(oldPod.Spec.Tolerations) {
// A Pod got a new toleration.
// Due to API validation, the user can add, but cannot modify or remove tolerations.
// So, it's enough to just check the length of tolerations to notice the update.
// And, any updates in tolerations could make Pod schedulable.
return UpdatePodTolerations
}
return none
}
func extractPodSchedulingGateEliminatedChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if len(newPod.Spec.SchedulingGates) == 0 && len(oldPod.Spec.SchedulingGates) != 0 {
// A scheduling gate on the pod is completely removed.
return UpdatePodSchedulingGatesEliminated
}
return none
}
func extractPodGeneratedResourceClaimChange(newPod *v1.Pod, oldPod *v1.Pod) ActionType {
if !resourceclaim.PodStatusEqual(newPod.Status.ResourceClaimStatuses, oldPod.Status.ResourceClaimStatuses) {
return UpdatePodGeneratedResourceClaim
}
return none
}
// NodeSchedulingPropertiesChange interprets the update of a node and returns corresponding UpdateNodeXYZ event(s).
func NodeSchedulingPropertiesChange(newNode *v1.Node, oldNode *v1.Node) (events []ClusterEvent) {
nodeChangeExtracters := []nodeChangeExtractor{
extractNodeSpecUnschedulableChange,
extractNodeAllocatableChange,
extractNodeLabelsChange,
extractNodeTaintsChange,
extractNodeConditionsChange,
extractNodeAnnotationsChange,
}
for _, fn := range nodeChangeExtracters {
if event := fn(newNode, oldNode); event != none {
events = append(events, ClusterEvent{Resource: Node, ActionType: event})
}
}
return
}
type nodeChangeExtractor func(newNode *v1.Node, oldNode *v1.Node) ActionType
func extractNodeAllocatableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(oldNode.Status.Allocatable, newNode.Status.Allocatable) {
return UpdateNodeAllocatable
}
return none
}
func extractNodeLabelsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if isLabelChanged(newNode.GetLabels(), oldNode.GetLabels()) {
return UpdateNodeLabel
}
return none
}
func isLabelChanged(newLabels map[string]string, oldLabels map[string]string) bool {
return !equality.Semantic.DeepEqual(newLabels, oldLabels)
}
func extractNodeTaintsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(newNode.Spec.Taints, oldNode.Spec.Taints) {
return UpdateNodeTaint
}
return none
}
func extractNodeConditionsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
strip := func(conditions []v1.NodeCondition) map[v1.NodeConditionType]v1.ConditionStatus {
conditionStatuses := make(map[v1.NodeConditionType]v1.ConditionStatus, len(conditions))
for i := range conditions {
conditionStatuses[conditions[i].Type] = conditions[i].Status
}
return conditionStatuses
}
if !equality.Semantic.DeepEqual(strip(oldNode.Status.Conditions), strip(newNode.Status.Conditions)) {
return UpdateNodeCondition
}
return none
}
func extractNodeSpecUnschedulableChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if newNode.Spec.Unschedulable != oldNode.Spec.Unschedulable && !newNode.Spec.Unschedulable {
// TODO: create UpdateNodeSpecUnschedulable ActionType
return UpdateNodeTaint
}
return none
}
func extractNodeAnnotationsChange(newNode *v1.Node, oldNode *v1.Node) ActionType {
if !equality.Semantic.DeepEqual(oldNode.GetAnnotations(), newNode.GetAnnotations()) {
return UpdateNodeAnnotation
}
return none
}
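// Illustrative sketch only, not part of the upstream file: how a node update is
// turned into cluster events, mirroring what the scheduler's node event handler
// does with the result. The label key and value below are hypothetical.
func exampleNodeUpdateEvents(oldNode *v1.Node) []ClusterEvent {
	newNode := oldNode.DeepCopy()
	if newNode.Labels == nil {
		newNode.Labels = map[string]string{}
	}
	newNode.Labels["topology.kubernetes.io/zone"] = "zone-a"
	// Only the labels differ, so this yields a single Node/UpdateNodeLabel event.
	return NodeSchedulingPropertiesChange(newNode, oldNode)
}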

View File

@ -0,0 +1,79 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
v1 "k8s.io/api/core/v1"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
)
// Extender is an interface for external processes to influence scheduling
// decisions made by Kubernetes. This is typically needed for resources not directly
// managed by Kubernetes.
type Extender interface {
// Name returns a unique name that identifies the extender.
Name() string
// Filter based on extender-implemented predicate functions. The filtered list is
// expected to be a subset of the supplied list.
// The failedNodes and failedAndUnresolvableNodes optionally contain the list
// of failed nodes and failure reasons, except nodes in the latter are
// unresolvable.
Filter(pod *v1.Pod, nodes []*NodeInfo) (filteredNodes []*NodeInfo, failedNodesMap extenderv1.FailedNodesMap, failedAndUnresolvable extenderv1.FailedNodesMap, err error)
// Prioritize based on extender-implemented priority functions. The returned scores & weight
// are used to compute the weighted score for an extender. The weighted scores are added to
// the scores computed by Kubernetes scheduler. The total scores are used to do the host selection.
Prioritize(pod *v1.Pod, nodes []*NodeInfo) (hostPriorities *extenderv1.HostPriorityList, weight int64, err error)
// Bind delegates the action of binding a pod to a node to the extender.
Bind(binding *v1.Binding) error
// IsBinder returns whether this extender is configured for the Bind method.
IsBinder() bool
// IsInterested returns true if at least one extended resource requested by
// this pod is managed by this extender.
IsInterested(pod *v1.Pod) bool
// IsPrioritizer returns whether this extender is configured for the Prioritize method.
IsPrioritizer() bool
// IsFilter returns whether this extender is configured for the Filter method.
IsFilter() bool
// ProcessPreemption returns nodes with their victim pods processed by extender based on
// given:
// 1. Pod to schedule
// 2. Candidate nodes and victim pods (nodeNameToVictims) generated by previous scheduling process.
// The possible changes made by extender may include:
// 1. Subset of given candidate nodes after preemption phase of extender.
// 2. A different set of victim pods for every given candidate node after the preemption phase of the extender.
ProcessPreemption(
pod *v1.Pod,
nodeNameToVictims map[string]*extenderv1.Victims,
nodeInfos NodeInfoLister,
) (map[string]*extenderv1.Victims, error)
// SupportsPreemption returns whether the scheduler extender supports preemption.
SupportsPreemption() bool
// IsIgnorable returns true to indicate that scheduling should not fail when this extender
// is unavailable. This gives the scheduler the ability to fail fast and tolerate non-critical extenders.
// Both Filter and Bind actions are supported.
IsIgnorable() bool
}

View File

@ -0,0 +1,954 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// This file defines the scheduling framework plugin interfaces.
package framework
import (
"context"
"errors"
"math"
"strings"
"sync"
"time"
"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
clientset "k8s.io/client-go/kubernetes"
restclient "k8s.io/client-go/rest"
"k8s.io/client-go/tools/events"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)
// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore
// NodeScore is a struct with node name and score.
type NodeScore struct {
Name string
Score int64
}
// NodeToStatusReader is a read-only interface of NodeToStatus passed to each PostFilter plugin.
type NodeToStatusReader interface {
// Get returns the status for given nodeName.
// If the node is not in the map, the AbsentNodesStatus is returned.
Get(nodeName string) *Status
// NodesForStatusCode returns a list of NodeInfos for the nodes that have a given status code.
// It returns the NodeInfos for all matching nodes denoted by AbsentNodesStatus as well.
NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error)
}
// NodeToStatusMap is an alias for NodeToStatusReader to keep partial backwards compatibility.
// NodeToStatusReader should be used if possible.
type NodeToStatusMap = NodeToStatusReader
// NodeToStatus contains the statuses of the Nodes where the incoming Pod was not schedulable.
type NodeToStatus struct {
// nodeToStatus contains specific statuses of the nodes.
nodeToStatus map[string]*Status
// absentNodesStatus defines a status for all nodes that are absent in nodeToStatus map.
// By default, all absent nodes are UnschedulableAndUnresolvable.
absentNodesStatus *Status
}
// NewDefaultNodeToStatus creates NodeToStatus without any node in the map.
// The absentNodesStatus is set by default to UnschedulableAndUnresolvable.
func NewDefaultNodeToStatus() *NodeToStatus {
return NewNodeToStatus(make(map[string]*Status), NewStatus(UnschedulableAndUnresolvable))
}
// NewNodeToStatus creates NodeToStatus initialized with given nodeToStatus and absentNodesStatus.
func NewNodeToStatus(nodeToStatus map[string]*Status, absentNodesStatus *Status) *NodeToStatus {
return &NodeToStatus{
nodeToStatus: nodeToStatus,
absentNodesStatus: absentNodesStatus,
}
}
// Get returns the status for given nodeName. If the node is not in the map, the absentNodesStatus is returned.
func (m *NodeToStatus) Get(nodeName string) *Status {
if status, ok := m.nodeToStatus[nodeName]; ok {
return status
}
return m.absentNodesStatus
}
// Set sets status for given nodeName.
func (m *NodeToStatus) Set(nodeName string, status *Status) {
m.nodeToStatus[nodeName] = status
}
// Len returns length of nodeToStatus map. It is not aware of number of absent nodes.
func (m *NodeToStatus) Len() int {
return len(m.nodeToStatus)
}
// AbsentNodesStatus returns absentNodesStatus value.
func (m *NodeToStatus) AbsentNodesStatus() *Status {
return m.absentNodesStatus
}
// SetAbsentNodesStatus sets absentNodesStatus value.
func (m *NodeToStatus) SetAbsentNodesStatus(status *Status) {
m.absentNodesStatus = status
}
// ForEachExplicitNode runs fn for each node which status is explicitly set.
// Important note: it runs fn only for nodes with a status explicitly registered,
// and hence may not run fn for all existing nodes.
// For example, if PreFilter rejects all Nodes, the scheduler would NOT set a failure status to every Node,
// but set a failure status as AbsentNodesStatus.
// You're supposed to get a status from AbsentNodesStatus(), and consider all other nodes to be rejected by it.
func (m *NodeToStatus) ForEachExplicitNode(fn func(nodeName string, status *Status)) {
for nodeName, status := range m.nodeToStatus {
fn(nodeName, status)
}
}
// NodesForStatusCode returns a list of NodeInfos for the nodes that matches a given status code.
// If the absentNodesStatus matches the code, all existing nodes are fetched using nodeLister
// and filtered using NodeToStatus.Get.
// If the absentNodesStatus doesn't match the code, nodeToStatus map is used to create a list of nodes
// and nodeLister.Get is used to obtain NodeInfo for each.
func (m *NodeToStatus) NodesForStatusCode(nodeLister NodeInfoLister, code Code) ([]*NodeInfo, error) {
var resultNodes []*NodeInfo
if m.AbsentNodesStatus().Code() == code {
allNodes, err := nodeLister.List()
if err != nil {
return nil, err
}
if m.Len() == 0 {
// All nodes are absent and the status code matches, so we can return all nodes.
return allNodes, nil
}
// Walk allNodes and keep every node whose status (explicit or absent) matches the code.
for _, node := range allNodes {
nodeName := node.Node().Name
if status := m.Get(nodeName); status.Code() == code {
resultNodes = append(resultNodes, node)
}
}
return resultNodes, nil
}
m.ForEachExplicitNode(func(nodeName string, status *Status) {
if status.Code() == code {
if nodeInfo, err := nodeLister.Get(nodeName); err == nil {
resultNodes = append(resultNodes, nodeInfo)
}
}
})
return resultNodes, nil
}
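// Illustrative sketch only, not part of the upstream file: filling NodeToStatus
// during Filter and later asking which nodes a PostFilter (e.g. preemption)
// plugin may still act on. The node names and reasons are placeholders.
func exampleNodeToStatus(nodeLister NodeInfoLister) ([]*NodeInfo, error) {
	nts := NewDefaultNodeToStatus() // absent nodes default to UnschedulableAndUnresolvable
	nts.Set("node-a", NewStatus(Unschedulable, "too many pods"))
	nts.Set("node-b", NewStatus(UnschedulableAndUnresolvable, "didn't match node affinity"))

	// Preemption only considers nodes whose failure is plain Unschedulable,
	// i.e. potentially resolvable by evicting victims; here only node-a matches.
	return nts.NodesForStatusCode(nodeLister, Unschedulable)
}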
// NodePluginScores is a struct with node name and scores for that node.
type NodePluginScores struct {
// Name is node name.
Name string
// Scores is scores from plugins and extenders.
Scores []PluginScore
// TotalScore is the total score in Scores.
TotalScore int64
}
// PluginScore is a struct with plugin/extender name and score.
type PluginScore struct {
// Name is the name of plugin or extender.
Name string
Score int64
}
// Code is the Status code/type which is returned from plugins.
type Code int
// These are predefined codes used in a Status.
// Note: when you add a new status, you have to add it in `codes` slice below.
const (
// Success means that plugin ran correctly and found pod schedulable.
// NOTE: A nil status is also considered as "Success".
Success Code = iota
// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
// Plugin shouldn't return this code for expected failures, like Unschedulable.
// Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
Error
// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
// If it's returned from PreFilter or Filter, the scheduler might attempt to
// run other postFilter plugins like preemption to get this pod scheduled.
// Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins.
// The accompanying status message should explain why the pod is unschedulable.
//
// We regard the backoff as a penalty of wasting the scheduling cycle.
// When the scheduling queue requeues a Pod that was rejected with Unschedulable in the last scheduling attempt,
// the Pod goes through backoff.
Unschedulable
// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
// other postFilter plugins like preemption would not change anything.
// See the comment on PostFilter interface for more details about how PostFilter should handle this status.
// Plugins should return Unschedulable if it is possible that the pod can get scheduled
// after running other postFilter plugins.
// The accompanying status message should explain why the pod is unschedulable.
//
// We regard the backoff as a penalty of wasting the scheduling cycle.
// When the scheduling queue requeues a Pod that was rejected with UnschedulableAndUnresolvable in the last scheduling attempt,
// the Pod goes through backoff.
UnschedulableAndUnresolvable
// Wait is used when a Permit plugin finds a pod scheduling should wait.
Wait
// Skip is used in the following scenarios:
// - when a Bind plugin chooses to skip binding.
// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
Skip
// Pending means that the scheduling process is finished successfully,
// but the plugin wants to stop the scheduling cycle/binding cycle here.
//
// For example, the DRA plugin sometimes needs to wait for the external device driver
// to provision the resource for the Pod.
// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
// because in this case, the scheduler decides where the Pod can go successfully,
// but we need to wait for the external component to do something based on that scheduling result.
//
// We regard the backoff as a penalty of wasting the scheduling cycle.
// In the case of returning Pending, we cannot say the scheduling cycle is wasted
// because the scheduling result is used to move the Pod's scheduling forward,
// even though that particular scheduling cycle failed.
// So, Pods rejected for such reasons don't need to suffer a penalty (backoff).
// When the scheduling queue requeues a Pod that was rejected with Pending in the last scheduling attempt,
// the Pod goes to activeQ directly, skipping backoff.
Pending
)
// This list should be exactly the same as the codes iota defined above in the same order.
var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"}
func (c Code) String() string {
return codes[c]
}
const (
// MaxNodeScore is the maximum score a Score plugin is expected to return.
MaxNodeScore int64 = 100
// MinNodeScore is the minimum score a Score plugin is expected to return.
MinNodeScore int64 = 0
// MaxTotalScore is the maximum total score.
MaxTotalScore int64 = math.MaxInt64
)
// PodsToActivateKey is a reserved state key for stashing pods.
// If the stashed pods are present in unschedulablePods or backoffQ, they will be
// activated (i.e., moved to activeQ) in two phases:
// - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated)
// - end of a binding cycle if it succeeds
var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"
// PodsToActivate stores pods to be activated.
type PodsToActivate struct {
sync.Mutex
// Map is keyed with namespaced pod name, and valued with the pod.
Map map[string]*v1.Pod
}
// Clone just returns the same state.
func (s *PodsToActivate) Clone() StateData {
return s
}
// NewPodsToActivate instantiates a PodsToActivate object.
func NewPodsToActivate() *PodsToActivate {
return &PodsToActivate{Map: make(map[string]*v1.Pod)}
}
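// Illustrative sketch only, not part of the upstream file: a plugin stashing a
// pod in the shared PodsToActivate state so the framework moves it to activeQ
// at the end of a successful scheduling or binding cycle. The helper name is
// made up, and the caller is assumed to pass the cycle's CycleState.
func exampleActivatePod(cs *CycleState, pod *v1.Pod) {
	v, err := cs.Read(PodsToActivateKey)
	if err != nil {
		// Nothing stashed under the key (e.g. a bare CycleState in tests).
		return
	}
	podsToActivate, ok := v.(*PodsToActivate)
	if !ok {
		return
	}
	podsToActivate.Lock()
	podsToActivate.Map[pod.Namespace+"/"+pod.Name] = pod
	podsToActivate.Unlock()
}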
// Status indicates the result of running a plugin. It consists of a code, a
// message, (optionally) an error, and the name of the plugin that caused the failure.
// When the status code is not Success, the reasons should explain why.
// And, when code is Success, all the other fields should be empty.
// NOTE: A nil Status is also considered as Success.
type Status struct {
code Code
reasons []string
err error
// plugin is an optional field that records the name of the plugin that caused this status.
// It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending.
plugin string
}
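// WithError records the given error in the Status and returns the Status itself.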
func (s *Status) WithError(err error) *Status {
s.err = err
return s
}
// Code returns code of the Status.
func (s *Status) Code() Code {
if s == nil {
return Success
}
return s.code
}
// Message returns a concatenated message on reasons of the Status.
func (s *Status) Message() string {
if s == nil {
return ""
}
return strings.Join(s.Reasons(), ", ")
}
// SetPlugin sets the given plugin name to s.plugin.
func (s *Status) SetPlugin(plugin string) {
s.plugin = plugin
}
// WithPlugin sets the given plugin name to s.plugin,
// and returns the given status object.
func (s *Status) WithPlugin(plugin string) *Status {
s.SetPlugin(plugin)
return s
}
// Plugin returns the plugin name which caused this status.
func (s *Status) Plugin() string {
return s.plugin
}
// Reasons returns reasons of the Status.
func (s *Status) Reasons() []string {
if s.err != nil {
return append([]string{s.err.Error()}, s.reasons...)
}
return s.reasons
}
// AppendReason appends given reason to the Status.
func (s *Status) AppendReason(reason string) {
s.reasons = append(s.reasons, reason)
}
// IsSuccess returns true if and only if "Status" is nil or Code is "Success".
func (s *Status) IsSuccess() bool {
return s.Code() == Success
}
// IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".
func (s *Status) IsWait() bool {
return s.Code() == Wait
}
// IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".
func (s *Status) IsSkip() bool {
return s.Code() == Skip
}
// IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending).
func (s *Status) IsRejected() bool {
code := s.Code()
return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending
}
// AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object
// with a concatenated message on reasons of the Status.
func (s *Status) AsError() error {
if s.IsSuccess() || s.IsWait() || s.IsSkip() {
return nil
}
if s.err != nil {
return s.err
}
return errors.New(s.Message())
}
// Equal checks equality of two statuses. This is useful for testing with
// cmp.Equal.
func (s *Status) Equal(x *Status) bool {
if s == nil || x == nil {
return s.IsSuccess() && x.IsSuccess()
}
if s.code != x.code {
return false
}
if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) {
return false
}
if !cmp.Equal(s.reasons, x.reasons) {
return false
}
return cmp.Equal(s.plugin, x.plugin)
}
func (s *Status) String() string {
return s.Message()
}
// NewStatus makes a Status out of the given arguments and returns its pointer.
func NewStatus(code Code, reasons ...string) *Status {
s := &Status{
code: code,
reasons: reasons,
}
return s
}
// AsStatus wraps an error in a Status.
func AsStatus(err error) *Status {
if err == nil {
return nil
}
return &Status{
code: Error,
err: err,
}
}
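// The following is an editor's sketch, not part of the upstream file: it illustrates
// the usual pattern for building Status values in a plugin. The notReady condition is
// a placeholder for whatever check a real plugin would perform.
func exampleStatusUsage(notReady bool, err error) *Status {
	if err != nil {
		// Unexpected failures are wrapped so the scheduler treats them as internal errors.
		return AsStatus(err)
	}
	if notReady {
		// Unschedulable keeps the Pod eligible for preemption and requeueing.
		return NewStatus(Unschedulable, "node is not ready")
	}
	// A nil Status is interpreted as Success.
	return nil
}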
// WaitingPod represents a pod currently waiting in the permit phase.
type WaitingPod interface {
// GetPod returns a reference to the waiting pod.
GetPod() *v1.Pod
// GetPendingPlugins returns a list of pending Permit plugins' names.
GetPendingPlugins() []string
// Allow declares that the waiting pod is allowed to be scheduled by the plugin named "pluginName".
// If this is the last remaining plugin to allow, then a success signal is delivered
// to unblock the pod.
Allow(pluginName string)
// Reject declares the waiting pod unschedulable.
Reject(pluginName, msg string)
}
// Plugin is the parent type for all the scheduling framework plugins.
type Plugin interface {
Name() string
}
// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
// involve expensive calls like accessing external endpoints; otherwise it'd block other
// Pods' enqueuing in event handlers.
type PreEnqueuePlugin interface {
Plugin
// PreEnqueue is called prior to adding Pods to activeQ.
PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}
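// The following is an editor's sketch, not part of the upstream file: a minimal
// PreEnqueue plugin that keeps pods out of activeQ until a hypothetical
// "example.com/ready" label is set to "true". PreEnqueue must stay cheap, so it only
// inspects the Pod object itself.
type exampleReadyGate struct{}

var _ PreEnqueuePlugin = exampleReadyGate{}

func (g exampleReadyGate) Name() string { return "ExampleReadyGate" }

func (g exampleReadyGate) PreEnqueue(ctx context.Context, p *v1.Pod) *Status {
	if p.Labels["example.com/ready"] != "true" {
		return NewStatus(UnschedulableAndUnresolvable, "waiting for label example.com/ready=true")
	}
	return nil
}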
// LessFunc is the function to sort pod info
type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool
// QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins.
// These plugins are used to sort pods in the scheduling queue. Only one queue sort
// plugin may be enabled at a time.
type QueueSortPlugin interface {
Plugin
// Less is used to sort pods in the scheduling queue.
Less(*QueuedPodInfo, *QueuedPodInfo) bool
}
// EnqueueExtensions is an optional interface that plugins can implement to efficiently
// move unschedulable Pods in internal scheduling queues.
// In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins,
// and Pods rejected by these plugins are requeued based on this extension point.
// Failures from other extension points are regarded as transient errors (e.g., network failure),
// and the scheduler requeues such Pods without consulting this extension point - it always requeues them to activeQ after backoff.
// This is because such transient errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that can make a Pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface;
// otherwise the default implementation is used, which is less efficient in requeueing Pods rejected by the plugin.
// If plugins at other extension points implement this interface, it is simply ignored.
type EnqueueExtensions interface {
Plugin
// EventsToRegister returns a series of possible events that may make a Pod
// failed by this plugin schedulable. Each event has a callback function that
// filters out events to reduce useless retries of the Pod's scheduling.
// The events will be registered when instantiating the internal scheduling queue,
// and leveraged to build event handlers dynamically.
// When it returns an error, the scheduler fails to start.
// Note: the returned list needs to be determined at startup,
// and the scheduler only evaluates it once during startup.
// Do not change the result during runtime, for example, based on the cluster's state etc.
//
// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
EventsToRegister(context.Context) ([]ClusterEventWithHint, error)
}
// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to their pre-calculated
// state.
type PreFilterExtensions interface {
// AddPod is called by the framework while trying to evaluate the impact
// of adding podToAdd to the node while scheduling podToSchedule.
AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
// RemovePod is called by the framework while trying to evaluate the impact
// of removing podToRemove from the node while scheduling podToSchedule.
RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}
// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
Plugin
// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
// plugins must return success or the pod will be rejected. PreFilter could optionally
// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
// for cases where it is possible to determine the subset of nodes to process in O(1) time.
// When PreFilterResult filters out some Nodes, the framework considers the filtered-out Nodes as getting "UnschedulableAndUnresolvable",
// i.e., those Nodes are excluded from the preemption candidates.
//
// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
// modify its pre-processed info. The framework guarantees that the extensions
// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
// CycleState, and may call those functions more than once before calling
// Filter again on a specific node.
PreFilterExtensions() PreFilterExtensions
}
// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
Plugin
// Filter is called by the scheduling framework.
// All FilterPlugins should return "Success" to declare that
// the given node fits the pod. If Filter doesn't return "Success",
// it will return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
//
// "Error" aborts pod scheduling and puts the pod into the backoff queue.
//
// For the node being evaluated, Filter plugins should look at the passed
// nodeInfo reference for this particular node's information (e.g., pods
// considered to be running on the node) instead of looking it up in the
// NodeInfoSnapshot because we don't guarantee that they will be the same.
// For example, during preemption, we may pass a copy of the original
// nodeInfo object that has some pods removed from it to evaluate the
// possibility of preempting them to schedule the target pod.
Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}
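// The following is an editor's sketch, not part of the upstream file: a minimal
// Filter plugin that only admits nodes carrying a hypothetical "example.com/dedicated"
// label. It returns Unschedulable rather than UnschedulableAndUnresolvable because a
// label update could make the node feasible later.
type exampleLabelFilter struct{}

var _ FilterPlugin = exampleLabelFilter{}

func (f exampleLabelFilter) Name() string { return "ExampleLabelFilter" }

func (f exampleLabelFilter) Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status {
	node := nodeInfo.Node()
	if node == nil {
		return AsStatus(errors.New("node not found"))
	}
	if _, ok := node.Labels["example.com/dedicated"]; !ok {
		return NewStatus(Unschedulable, "node is missing the example.com/dedicated label")
	}
	return nil
}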
// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
Plugin
// PostFilter is called by the scheduling framework
// when the scheduling cycle fails at PreFilter or Filter with Unschedulable or UnschedulableAndUnresolvable.
// NodeToStatusReader has statuses that each Node got in PreFilter or Filter phase.
//
// If you're implementing a custom preemption with PostFilter, ignoring Nodes with UnschedulableAndUnresolvable is the responsibility of your plugin,
// meaning NodeToStatusReader could have Nodes with UnschedulableAndUnresolvable
// and the scheduling framework does call PostFilter plugins even when all Nodes in NodeToStatusReader are UnschedulableAndUnresolvable.
//
// A PostFilter plugin should return one of the following statuses:
// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
// - Success: the plugin gets executed successfully and the pod can be made schedulable.
// - Error: the plugin aborts due to some internal error.
//
// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
// preemptor pod's .status.nominatedNodeName field.
PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
}
// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
Plugin
// PreScore is called by the scheduling framework after a list of nodes
// passed the filtering phase. All PreScore plugins must return success or
// the pod will be rejected.
// When it returns Skip status, other fields in status are just ignored,
// and coupled Score plugin will be skipped in this scheduling cycle.
PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*NodeInfo) *Status
}
// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
// NormalizeScore is called for all node scores produced by the same plugin's "Score"
// method. A successful run of NormalizeScore will update the scores list and return
// a success status.
NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}
// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
Plugin
// Score is called on each filtered node. It must return success and an integer
// indicating the rank of the node. All scoring plugins must return success or
// the pod will be rejected.
Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)
// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if it does not.
ScoreExtensions() ScoreExtensions
}
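// The following is an editor's sketch, not part of the upstream file: a Score plugin
// that prefers nodes running fewer pods, together with a NormalizeScore extension that
// rescales the raw pod counts into [MinNodeScore, MaxNodeScore]. The plugin name is
// hypothetical and error handling is kept to the minimum the interfaces require.
type exampleSpreadScorer struct {
	handle Handle
}

var _ ScorePlugin = &exampleSpreadScorer{}
var _ ScoreExtensions = &exampleSpreadScorer{}

func (s *exampleSpreadScorer) Name() string { return "ExampleSpreadScorer" }

func (s *exampleSpreadScorer) Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status) {
	nodeInfo, err := s.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
	if err != nil {
		return 0, AsStatus(err)
	}
	// Raw score is the pod count; NormalizeScore below inverts it so emptier nodes win.
	return int64(len(nodeInfo.Pods)), nil
}

func (s *exampleSpreadScorer) ScoreExtensions() ScoreExtensions { return s }

func (s *exampleSpreadScorer) NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status {
	var highest int64
	for _, ns := range scores {
		if ns.Score > highest {
			highest = ns.Score
		}
	}
	for i := range scores {
		if highest == 0 {
			scores[i].Score = MaxNodeScore
			continue
		}
		// Invert and rescale: the node with the most pods gets MinNodeScore.
		scores[i].Score = MaxNodeScore - scores[i].Score*MaxNodeScore/highest
	}
	return nil
}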
// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
Plugin
// Reserve is called by the scheduling framework when the scheduler cache is
// updated. If this method returns a failed Status, the scheduler will call
// the Unreserve method for all enabled ReservePlugins.
Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
// Unreserve is called by the scheduling framework when a reserved pod was
// rejected, an error occurred during reservation of subsequent plugins, or
// in a later phase. The Unreserve method implementation must be idempotent
// and may be called by the scheduler even if the corresponding Reserve
// method for the same plugin was not called.
Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called before a pod is bound to a node.
type PreBindPlugin interface {
Plugin
// PreBind is called before binding a pod. All prebind plugins must return
// success or the pod will be rejected and won't be sent for binding.
PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
Plugin
// PostBind is called after a pod is successfully bound. These plugins are
// informational. A common application of this extension point is for cleaning
// up. If a plugin needs to clean-up its state after a pod is scheduled and
// bound, PostBind is the extension point that it should register.
PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
Plugin
// Permit is called before binding a pod (and before prebind plugins). Permit
// plugins are used to prevent or delay the binding of a Pod. A permit plugin
// must return success or wait with timeout duration, or the pod will be rejected.
// The pod will also be rejected if the wait times out or if the pod is rejected while
// waiting. Note that if the plugin returns "wait", the framework will wait only
// after running the remaining plugins, provided that no other plugin rejects the pod.
Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}
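// The following is an editor's sketch, not part of the upstream file: a Permit plugin
// that delays binding until an external approval is observed. The approved callback is
// hypothetical; a real plugin would also arrange for Allow/Reject to be called on the
// corresponding WaitingPod before the returned timeout elapses.
type examplePermitGate struct {
	approved func(p *v1.Pod) bool
}

var _ PermitPlugin = &examplePermitGate{}

func (g *examplePermitGate) Name() string { return "ExamplePermitGate" }

func (g *examplePermitGate) Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration) {
	if g.approved(p) {
		return nil, 0
	}
	// Wait keeps the pod in the permit phase until Allow/Reject is called or the timeout fires.
	return NewStatus(Wait, "waiting for external approval"), 30 * time.Second
}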
// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
Plugin
// Bind plugins will not be called until all pre-bind plugins have completed. Each
// bind plugin is called in the configured order. A bind plugin may choose whether
// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
// it must return Skip in its Status code. If a bind plugin returns an Error, the
// pod is rejected and will not be bound.
Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
// Framework manages the set of plugins in use by the scheduling framework.
// Configured plugins are called at specified points in a scheduling context.
type Framework interface {
Handle
// PreEnqueuePlugins returns the registered preEnqueue plugins.
PreEnqueuePlugins() []PreEnqueuePlugin
// EnqueueExtensions returns the registered Enqueue extensions.
EnqueueExtensions() []EnqueueExtensions
// QueueSortFunc returns the function to sort pods in the scheduling queue.
QueueSortFunc() LessFunc
// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
// *Status and its code is set to non-success if any of the plugins returns
// anything but Success. If a non-success status is returned, then the scheduling
// cycle is aborted.
// It also returns a PreFilterResult, which may influence what or how many nodes to
// evaluate downstream.
// The third return value contains the PreFilter plugins that rejected some or all Nodes with a PreFilterResult.
// Note that it doesn't contain a plugin that rejects this Pod with a non-success status
// rather than with a PreFilterResult.
RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status, sets.Set[string])
// RunPostFilterPlugins runs the set of configured PostFilter plugins.
// PostFilter plugins can either be informational, in which case they should be configured
// to execute first and return Unschedulable status, or ones that try to change the
// cluster state to make the pod potentially schedulable in a future scheduling cycle.
RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusReader) (*PostFilterResult, *Status)
// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
// *Status and its code is set to non-success if any of the plugins returns
// anything but Success. If the Status code is "Unschedulable", it is
// considered as a scheduling check failure, otherwise, it is considered as an
// internal error. In either case the pod is not going to be bound.
RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// RunPostBindPlugins runs the set of configured PostBind plugins.
RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
// RunReservePluginsReserve runs the Reserve method of the set of
// configured Reserve plugins. If any of these calls returns an error, it
// does not continue running the remaining ones and returns the error. In
// such a case, the pod will not be scheduled.
RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// RunReservePluginsUnreserve runs the Unreserve method of the set of
// configured Reserve plugins.
RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)
// RunPermitPlugins runs the set of configured Permit plugins. If any of these
// plugins returns a status other than "Success" or "Wait", it does not continue
// running the remaining plugins and returns an error. Otherwise, if any of the
// plugins returns "Wait", then this function will create and add waiting pod
// to a map of currently waiting pods and return status with "Wait" code.
// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status
// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
// or "Success". If none of the plugins handled binding, RunBindPlugins returns
// code=5("skip") status.
RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status
// HasFilterPlugins returns true if at least one Filter plugin is defined.
HasFilterPlugins() bool
// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
HasPostFilterPlugins() bool
// HasScorePlugins returns true if at least one Score plugin is defined.
HasScorePlugins() bool
// ListPlugins returns a map of extension point name to list of configured Plugins.
ListPlugins() *config.Plugins
// ProfileName returns the profile name associated with this framework.
ProfileName() string
// PercentageOfNodesToScore returns the percentageOfNodesToScore associated with the profile.
PercentageOfNodesToScore() *int32
// SetPodNominator sets the PodNominator
SetPodNominator(nominator PodNominator)
// SetPodActivator sets the PodActivator
SetPodActivator(activator PodActivator)
// Close calls Close method of each plugin.
Close() error
}
// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
type Handle interface {
// PodNominator abstracts operations to maintain nominated Pods.
PodNominator
// PluginsRunner abstracts operations to run some plugins.
PluginsRunner
// PodActivator abstracts operations in the scheduling queue.
PodActivator
// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
// is taken at the beginning of a scheduling cycle and remains unchanged until
// a pod finishes "Permit" point.
//
// It should be used only during the scheduling cycle:
// - There is no guarantee that the information remains unchanged in the binding phase of scheduling,
// so plugins shouldn't use it in the binding cycle (pre-bind/bind/post-bind/un-reserve plugins);
// otherwise, a concurrent read/write error might occur.
// - There is no guarantee that the information is always up-to-date,
// so plugins shouldn't use it in QueueingHint and PreEnqueue;
// otherwise, they might make a decision based on stale information.
//
// Instead, they should use the resources obtained from the Informers created by SharedInformerFactory().
SnapshotSharedLister() SharedLister
// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
IterateOverWaitingPods(callback func(WaitingPod))
// GetWaitingPod returns a waiting pod given its UID.
GetWaitingPod(uid types.UID) WaitingPod
// RejectWaitingPod rejects a waiting pod given its UID.
// The return value indicates if the pod is waiting or not.
RejectWaitingPod(uid types.UID) bool
// ClientSet returns a kubernetes clientSet.
ClientSet() clientset.Interface
// KubeConfig returns the raw kube config.
KubeConfig() *restclient.Config
// EventRecorder returns an event recorder.
EventRecorder() events.EventRecorder
SharedInformerFactory() informers.SharedInformerFactory
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
// A non-default implementation can be plugged into the framework to simulate the state of DRA objects.
SharedDRAManager() SharedDRAManager
// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for the nominated pod on the given node.
RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status
// Extenders returns registered scheduler extenders.
Extenders() []Extender
// Parallelizer returns a parallelizer holding parallelism for scheduler.
Parallelizer() parallelize.Parallelizer
}
// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
type PreFilterResult struct {
// The set of nodes that should be considered downstream; if nil then
// all nodes are eligible.
NodeNames sets.Set[string]
}
func (p *PreFilterResult) AllNodes() bool {
return p == nil || p.NodeNames == nil
}
func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
if p.AllNodes() && in.AllNodes() {
return nil
}
r := PreFilterResult{}
if p.AllNodes() {
r.NodeNames = in.NodeNames.Clone()
return &r
}
if in.AllNodes() {
r.NodeNames = p.NodeNames.Clone()
return &r
}
r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
return &r
}
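// The following is an editor's sketch, not part of the upstream file: Merge intersects
// node sets, while a nil PreFilterResult (or nil NodeNames) means "all nodes" and thus
// leaves the other side unchanged. The node names are placeholders.
func examplePreFilterResultMerge() *PreFilterResult {
	a := &PreFilterResult{NodeNames: sets.New("node-a", "node-b")}
	b := &PreFilterResult{NodeNames: sets.New("node-b", "node-c")}
	merged := a.Merge(b) // NodeNames is now {"node-b"}
	// Merging with nil (i.e. "all nodes") keeps the existing set.
	return merged.Merge(nil)
}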
type NominatingMode int
const (
ModeNoop NominatingMode = iota
ModeOverride
)
type NominatingInfo struct {
NominatedNodeName string
NominatingMode NominatingMode
}
// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
type PostFilterResult struct {
*NominatingInfo
}
func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
return &PostFilterResult{
NominatingInfo: &NominatingInfo{
NominatedNodeName: name,
NominatingMode: ModeOverride,
},
}
}
func (ni *NominatingInfo) Mode() NominatingMode {
if ni == nil {
return ModeNoop
}
return ni.NominatingMode
}
// PodActivator abstracts operations in the scheduling queue.
type PodActivator interface {
// Activate moves the given pods to activeQ.
// If a pod isn't found in unschedulablePods or backoffQ and it's in-flight,
// the wildcard event is registered so that the pod will be requeued when it comes back.
// But, if a pod isn't found in unschedulablePods or backoffQ and it's not in-flight (i.e., completely unknown pod),
// Activate would ignore the pod.
Activate(logger klog.Logger, pods map[string]*v1.Pod)
}
// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
// AddNominatedPod adds the given pod to the nominator or
// updates it if it already exists.
AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
DeleteNominatedPodIfExists(pod *v1.Pod)
// UpdateNominatedPod updates the <oldPod> with <newPod>.
UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
// NominatedPodsForNode returns nominatedPods on the given node.
NominatedPodsForNode(nodeName string) []*PodInfo
}
// PluginsRunner abstracts operations to run some plugins.
// This is used by preemption PostFilter plugins when evaluating the feasibility of
// scheduling the pod on nodes when certain running pods get evicted.
type PluginsRunner interface {
// RunPreScorePlugins runs the set of configured PreScore plugins. If any
// of these plugins returns any status other than "Success", the given pod is rejected.
RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) *Status
// RunScorePlugins runs the set of configured scoring plugins.
// It returns a list that stores scores from each plugin and total score for each Node.
// It also returns *Status, which is set to non-success if any of the plugins returns
// a non-success status.
RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*NodeInfo) ([]NodePluginScores, *Status)
// RunFilterPlugins runs the set of configured Filter plugins for pod on
// the given node. Note that for the node being evaluated, the passed nodeInfo
// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
// considered to be running on the node could be different). For example, during
// preemption, we may pass a copy of the original nodeInfo object that has some pods
// removed from it to evaluate the possibility of preempting them to
// schedule the target pod.
RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
// PreFilter plugins. It returns directly if any of the plugins return any
// status other than Success.
RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
// PreFilter plugins. It returns directly if any of the plugins return any
// status other than Success.
RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

View File

@ -0,0 +1,111 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package framework
import (
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/dynamic-resource-allocation/structured"
)
// NodeInfoLister interface represents anything that can list/get NodeInfo objects from node name.
type NodeInfoLister interface {
// List returns the list of NodeInfos.
List() ([]*NodeInfo, error)
// HavePodsWithAffinityList returns the list of NodeInfos of nodes with pods with affinity terms.
HavePodsWithAffinityList() ([]*NodeInfo, error)
// HavePodsWithRequiredAntiAffinityList returns the list of NodeInfos of nodes with pods with required anti-affinity terms.
HavePodsWithRequiredAntiAffinityList() ([]*NodeInfo, error)
// Get returns the NodeInfo of the given node name.
Get(nodeName string) (*NodeInfo, error)
}
// StorageInfoLister interface represents anything that handles storage-related operations and resources.
type StorageInfoLister interface {
// IsPVCUsedByPods returns whether the PVC, keyed in the format "namespace/name",
// is used by one or more scheduled pods.
IsPVCUsedByPods(key string) bool
}
// SharedLister groups scheduler-specific listers.
type SharedLister interface {
NodeInfos() NodeInfoLister
StorageInfos() StorageInfoLister
}
// ResourceSliceLister can be used to obtain ResourceSlices.
type ResourceSliceLister interface {
// List returns a list of all ResourceSlices.
List() ([]*resourceapi.ResourceSlice, error)
}
// DeviceClassLister can be used to obtain DeviceClasses.
type DeviceClassLister interface {
// List returns a list of all DeviceClasses.
List() ([]*resourceapi.DeviceClass, error)
// Get returns the DeviceClass with the given className.
Get(className string) (*resourceapi.DeviceClass, error)
}
// ResourceClaimTracker can be used to obtain ResourceClaims, and track changes to ResourceClaims in-memory.
//
// If the claims are meant to be allocated in the API during the binding phase (when used by scheduler), the tracker helps avoid
// race conditions between scheduling and binding phases (as well as between the binding phase and the informer cache update).
//
// If the binding phase is not run (e.g. when used by Cluster Autoscaler which only runs the scheduling phase, and simulates binding in-memory),
// the tracker allows the framework user to obtain the claim allocations produced by the DRA plugin, and persist them outside of the API (e.g. in-memory).
type ResourceClaimTracker interface {
// List lists ResourceClaims. The result is guaranteed to immediately include any changes made via AssumeClaimAfterAPICall(),
// and SignalClaimPendingAllocation().
List() ([]*resourceapi.ResourceClaim, error)
// Get works like List(), but for a single claim.
Get(namespace, claimName string) (*resourceapi.ResourceClaim, error)
// ListAllAllocatedDevices lists all allocated Devices from allocated ResourceClaims. The result is guaranteed to immediately include
// any changes made via AssumeClaimAfterAPICall(), and SignalClaimPendingAllocation().
ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error)
// SignalClaimPendingAllocation signals to the tracker that the given ResourceClaim will be allocated via an API call in the
// binding phase. This change is immediately reflected in the result of List() and the other accessors.
SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error
// ClaimHasPendingAllocation answers whether a given claim has a pending allocation during the binding phase. It can be used to avoid
// race conditions in subsequent scheduling phases.
ClaimHasPendingAllocation(claimUID types.UID) bool
// RemoveClaimPendingAllocation removes the pending allocation for the given ResourceClaim from the tracker if any was signaled via
// SignalClaimPendingAllocation(). Returns whether there was a pending allocation to remove. List() and the other accessors immediately
// stop reflecting the pending allocation in the results.
RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool)
// AssumeClaimAfterAPICall signals to the tracker that an API call modifying the given ResourceClaim was made in the binding phase, and the
// changes should be reflected in informers very soon. This change is immediately reflected in the result of List() and the other accessors.
// This mechanism can be used to avoid race conditions between the informer update and subsequent scheduling phases.
AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error
// AssumedClaimRestore signals to the tracker that something went wrong with the API call modifying the given ResourceClaim, and
// the changes won't be reflected in informers after all. List() and the other accessors immediately stop reflecting the assumed change,
// and go back to the informer version.
AssumedClaimRestore(namespace, claimName string)
}
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.
// The plugin's default implementation obtains the objects from the API. A different implementation can be
// plugged into the framework in order to simulate the state of DRA objects. For example, Cluster Autoscaler
// can use this to provide the correct DRA object state to the DRA plugin when simulating scheduling changes in-memory.
type SharedDRAManager interface {
ResourceClaims() ResourceClaimTracker
ResourceSlices() ResourceSliceLister
DeviceClasses() DeviceClassLister
}

View File

@ -0,0 +1,59 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package parallelize
import "context"
// ErrorChannel supports non-blocking send and receive operation to capture error.
// A maximum of one error is kept in the channel and the rest of the errors sent
// are ignored, unless the existing error is received and the channel becomes empty
// again.
type ErrorChannel struct {
errCh chan error
}
// SendError sends an error without blocking the sender.
func (e *ErrorChannel) SendError(err error) {
select {
case e.errCh <- err:
default:
}
}
// SendErrorWithCancel sends an error without blocking the sender and calls
// cancel function.
func (e *ErrorChannel) SendErrorWithCancel(err error, cancel context.CancelFunc) {
e.SendError(err)
cancel()
}
// ReceiveError receives an error from channel without blocking on the receiver.
func (e *ErrorChannel) ReceiveError() error {
select {
case err := <-e.errCh:
return err
default:
return nil
}
}
// NewErrorChannel returns a new ErrorChannel.
func NewErrorChannel() *ErrorChannel {
return &ErrorChannel{
errCh: make(chan error, 1),
}
}
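// The following is an editor's sketch, not part of the upstream file: typical fan-out
// usage of ErrorChannel. Each worker reports at most one error; the first error wins
// and cancels the remaining work. The work callback and the fixed worker count are
// placeholders.
func exampleErrorChannelUsage(ctx context.Context, work func(context.Context, int) error) error {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	errCh := NewErrorChannel()
	done := make(chan struct{})
	const workers = 4
	for i := 0; i < workers; i++ {
		go func(piece int) {
			defer func() { done <- struct{}{} }()
			if err := work(ctx, piece); err != nil {
				errCh.SendErrorWithCancel(err, cancel)
			}
		}(i)
	}
	for i := 0; i < workers; i++ {
		<-done
	}
	return errCh.ReceiveError()
}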

View File

@ -0,0 +1,65 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package parallelize
import (
"context"
"math"
"k8s.io/client-go/util/workqueue"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
// DefaultParallelism is the default parallelism used in scheduler.
const DefaultParallelism int = 16
// Parallelizer holds the parallelism for scheduler.
type Parallelizer struct {
parallelism int
}
// NewParallelizer returns an object holding the parallelism.
func NewParallelizer(p int) Parallelizer {
return Parallelizer{parallelism: p}
}
// chunkSizeFor returns a chunk size for the given number of items to use for
// parallel work. The size aims to produce good CPU utilization.
// It returns max(1, min(sqrt(n), n/parallelism+1)).
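// For example, with parallelism=16: chunkSizeFor(1000, 16) = min(31, 63) = 31 and
// chunkSizeFor(100, 16) = min(10, 7) = 7, while very small n is clamped to 1.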
func chunkSizeFor(n, parallelism int) int {
s := int(math.Sqrt(float64(n)))
if r := n/parallelism + 1; s > r {
s = r
} else if s < 1 {
s = 1
}
return s
}
// Until is a wrapper around workqueue.ParallelizeUntil to use in scheduling algorithms.
// A given operation will be a label that is recorded in the goroutine metric.
func (p Parallelizer) Until(ctx context.Context, pieces int, doWorkPiece workqueue.DoWorkPieceFunc, operation string) {
goroutinesMetric := metrics.Goroutines.WithLabelValues(operation)
withMetrics := func(piece int) {
goroutinesMetric.Inc()
doWorkPiece(piece)
goroutinesMetric.Dec()
}
workqueue.ParallelizeUntil(ctx, p.parallelism, pieces, withMetrics, workqueue.WithChunkSize(chunkSizeFor(pieces, p.parallelism)))
}
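// The following is an editor's sketch, not part of the upstream file: driving Until
// from plugin-style code. Each piece writes to its own slice index, so no extra
// locking is needed; the "exampleDouble" operation label is a placeholder.
func exampleParallelSum(ctx context.Context, p Parallelizer, values []int64) int64 {
	results := make([]int64, len(values))
	p.Until(ctx, len(values), func(piece int) {
		// Each invocation handles exactly one index.
		results[piece] = values[piece] * 2
	}, "exampleDouble")
	var total int64
	for _, v := range results {
		total += v
	}
	return total
}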

View File

@ -0,0 +1,3 @@
# Scheduler Framework Plugins
Moved [here](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-scheduling/scheduler_framework_plugins.md).

View File

@ -0,0 +1,63 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package defaultbinder
import (
"context"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.DefaultBinder
// DefaultBinder binds pods to nodes using a k8s client.
type DefaultBinder struct {
handle framework.Handle
}
var _ framework.BindPlugin = &DefaultBinder{}
// New creates a DefaultBinder.
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
return &DefaultBinder{handle: handle}, nil
}
// Name returns the name of the plugin.
func (b DefaultBinder) Name() string {
return Name
}
// Bind binds pods to nodes using the k8s client.
func (b DefaultBinder) Bind(ctx context.Context, state *framework.CycleState, p *v1.Pod, nodeName string) *framework.Status {
logger := klog.FromContext(ctx)
logger.V(3).Info("Attempting to bind pod to node", "pod", klog.KObj(p), "node", klog.KRef("", nodeName))
binding := &v1.Binding{
ObjectMeta: metav1.ObjectMeta{Namespace: p.Namespace, Name: p.Name, UID: p.UID},
Target: v1.ObjectReference{Kind: "Node", Name: nodeName},
}
err := b.handle.ClientSet().CoreV1().Pods(binding.Namespace).Bind(ctx, binding, metav1.CreateOptions{})
if err != nil {
return framework.AsStatus(err)
}
return nil
}

View File

@ -0,0 +1,364 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package defaultpreemption
import (
"context"
"fmt"
"math/rand"
"sort"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.DefaultPreemption
// DefaultPreemption is a PostFilter plugin that implements the preemption logic.
type DefaultPreemption struct {
fh framework.Handle
fts feature.Features
args config.DefaultPreemptionArgs
podLister corelisters.PodLister
pdbLister policylisters.PodDisruptionBudgetLister
Evaluator *preemption.Evaluator
}
var _ framework.PostFilterPlugin = &DefaultPreemption{}
var _ framework.PreEnqueuePlugin = &DefaultPreemption{}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *DefaultPreemption) Name() string {
return Name
}
// New initializes a new plugin and returns it.
func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := dpArgs.(*config.DefaultPreemptionArgs)
if !ok {
return nil, fmt.Errorf("got args of type %T, want *DefaultPreemptionArgs", dpArgs)
}
if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
return nil, err
}
podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
pdbLister := getPDBLister(fh.SharedInformerFactory())
pl := DefaultPreemption{
fh: fh,
fts: fts,
args: *args,
podLister: podLister,
pdbLister: pdbLister,
}
pl.Evaluator = preemption.NewEvaluator(Name, fh, &pl, fts.EnableAsyncPreemption)
return &pl, nil
}
// PostFilter invoked at the postFilter extension point.
func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
defer func() {
metrics.PreemptionAttempts.Inc()
}()
result, status := pl.Evaluator.Preempt(ctx, state, pod, m)
msg := status.Message()
if len(msg) > 0 {
return result, framework.NewStatus(status.Code(), "preemption: "+msg)
}
return result, status
}
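// PreEnqueue returns UnschedulableAndUnresolvable while an asynchronous preemption
// triggered for this Pod is still in flight, keeping the Pod out of activeQ until
// the preemption API calls have finished.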
func (pl *DefaultPreemption) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
if !pl.fts.EnableAsyncPreemption {
return nil
}
if pl.Evaluator.IsPodRunningPreemption(p.GetUID()) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, "waiting for the preemption for this pod to be finished")
}
return nil
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *DefaultPreemption) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// The plugin moves the preemptor Pod to activeQ/backoffQ once the preemption API calls are all done,
// and we don't need to move the Pod with any events.
return nil, nil
}
// calculateNumCandidates returns the number of candidates the FindCandidates
// method must produce from dry running based on the constraints given by
// <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
// candidates returned will never be greater than <numNodes>.
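// For example, with MinCandidateNodesPercentage=10 and MinCandidateNodesAbsolute=100,
// numNodes=500 yields 50, which is raised to the absolute floor of 100, while
// numNodes=80 is capped at 80.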
func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
n := (numNodes * pl.args.MinCandidateNodesPercentage) / 100
if n < pl.args.MinCandidateNodesAbsolute {
n = pl.args.MinCandidateNodesAbsolute
}
if n > numNodes {
n = numNodes
}
return n
}
// GetOffsetAndNumCandidates chooses a random offset and calculates the number
// of candidates that should be shortlisted for dry running preemption.
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
}
// CandidatesToVictimsMap maps each candidate's node name to its victims.
// This function is not applicable for out-of-tree preemption plugins that exercise
// different preemption candidates on the same nominated node.
func (pl *DefaultPreemption) CandidatesToVictimsMap(candidates []preemption.Candidate) map[string]*extenderv1.Victims {
m := make(map[string]*extenderv1.Victims, len(candidates))
for _, c := range candidates {
m[c.Name()] = c.Victims()
}
return m
}
// SelectVictimsOnNode finds the minimum set of pods on the given node that should be preempted in order to make enough room
// for "pod" to be scheduled.
func (pl *DefaultPreemption) SelectVictimsOnNode(
ctx context.Context,
state *framework.CycleState,
pod *v1.Pod,
nodeInfo *framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status) {
logger := klog.FromContext(ctx)
var potentialVictims []*framework.PodInfo
removePod := func(rpi *framework.PodInfo) error {
if err := nodeInfo.RemovePod(logger, rpi.Pod); err != nil {
return err
}
status := pl.fh.RunPreFilterExtensionRemovePod(ctx, state, pod, rpi, nodeInfo)
if !status.IsSuccess() {
return status.AsError()
}
return nil
}
addPod := func(api *framework.PodInfo) error {
nodeInfo.AddPodInfo(api)
status := pl.fh.RunPreFilterExtensionAddPod(ctx, state, pod, api, nodeInfo)
if !status.IsSuccess() {
return status.AsError()
}
return nil
}
// As the first step, remove all the lower priority pods from the node and
// check if the given pod can be scheduled.
podPriority := corev1helpers.PodPriority(pod)
for _, pi := range nodeInfo.Pods {
if corev1helpers.PodPriority(pi.Pod) < podPriority {
potentialVictims = append(potentialVictims, pi)
if err := removePod(pi); err != nil {
return nil, 0, framework.AsStatus(err)
}
}
}
// No potential victims are found, and so we don't need to evaluate the node again since its state didn't change.
if len(potentialVictims) == 0 {
return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, "No preemption victims found for incoming pod")
}
// If the new pod does not fit after removing all the lower priority pods,
// we are almost done and this node is not suitable for preemption. The only
// condition that we could check is if the "pod" is failing to schedule due to
// inter-pod affinity to one or more victims, but we have decided not to
// support this case for performance reasons. Having affinity to lower
// priority pods is not a recommended configuration anyway.
if status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo); !status.IsSuccess() {
return nil, 0, status
}
var victims []*v1.Pod
numViolatingVictim := 0
// Sort potentialVictims by pod priority from high to low, which ensures that
// higher priority pods are reprieved first.
sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
// Try to reprieve as many pods as possible. We first try to reprieve the PDB
// violating victims and then other non-violating ones. In both cases, we start
// from the highest priority victims.
violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs)
reprievePod := func(pi *framework.PodInfo) (bool, error) {
if err := addPod(pi); err != nil {
return false, err
}
status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
fits := status.IsSuccess()
if !fits {
if err := removePod(pi); err != nil {
return false, err
}
rpi := pi.Pod
victims = append(victims, rpi)
logger.V(5).Info("Pod is a potential preemption victim on node", "pod", klog.KObj(rpi), "node", klog.KObj(nodeInfo.Node()))
}
return fits, nil
}
for _, p := range violatingVictims {
if fits, err := reprievePod(p); err != nil {
return nil, 0, framework.AsStatus(err)
} else if !fits {
numViolatingVictim++
}
}
// Now we try to reprieve non-violating victims.
for _, p := range nonViolatingVictims {
if _, err := reprievePod(p); err != nil {
return nil, 0, framework.AsStatus(err)
}
}
// Sort victims after reprieving pods to keep the pods in the victims sorted in order of priority from high to low.
if len(violatingVictims) != 0 && len(nonViolatingVictims) != 0 {
sort.Slice(victims, func(i, j int) bool { return util.MoreImportantPod(victims[i], victims[j]) })
}
return victims, numViolatingVictim, framework.NewStatus(framework.Success)
}
// PodEligibleToPreemptOthers returns one bool and one string. The bool
// indicates whether this pod should be considered for preempting other pods or
// not. The string includes the reason if this pod isn't eligible.
// There are several reasons:
// 1. The pod has a preemptionPolicy of Never.
// 2. The pod has already preempted other pods and the victims are in their graceful termination period.
// Currently we check the node that is nominated for this pod, and as long as there are
// terminating pods on this node, we don't attempt to preempt more pods.
func (pl *DefaultPreemption) PodEligibleToPreemptOthers(_ context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string) {
if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever {
return false, "not eligible due to preemptionPolicy=Never."
}
nodeInfos := pl.fh.SnapshotSharedLister().NodeInfos()
nomNodeName := pod.Status.NominatedNodeName
if len(nomNodeName) > 0 {
// If the pod's nominated node is considered as UnschedulableAndUnresolvable by the filters,
// then the pod should be considered for preempting again.
if nominatedNodeStatus.Code() == framework.UnschedulableAndUnresolvable {
return true, ""
}
if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
podPriority := corev1helpers.PodPriority(pod)
for _, p := range nodeInfo.Pods {
if corev1helpers.PodPriority(p.Pod) < podPriority && podTerminatingByPreemption(p.Pod) {
// There is a terminating pod on the nominated node.
return false, "not eligible due to a terminating pod on the nominated node."
}
}
}
}
return true, ""
}
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
func (pl *DefaultPreemption) OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64 {
return nil
}
// podTerminatingByPreemption returns true if the pod is in the termination state caused by scheduler preemption.
func podTerminatingByPreemption(p *v1.Pod) bool {
if p.DeletionTimestamp == nil {
return false
}
for _, condition := range p.Status.Conditions {
if condition.Type == v1.DisruptionTarget {
return condition.Status == v1.ConditionTrue && condition.Reason == v1.PodReasonPreemptionByScheduler
}
}
return false
}
// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
// preempted.
// This function is stable and does not change the order of received pods. So, if it
// receives a sorted list, grouping will preserve the order of the input list.
func filterPodsWithPDBViolation(podInfos []*framework.PodInfo, pdbs []*policy.PodDisruptionBudget) (violatingPodInfos, nonViolatingPodInfos []*framework.PodInfo) {
pdbsAllowed := make([]int32, len(pdbs))
for i, pdb := range pdbs {
pdbsAllowed[i] = pdb.Status.DisruptionsAllowed
}
for _, podInfo := range podInfos {
pod := podInfo.Pod
pdbForPodIsViolated := false
// A pod with no labels will not match any PDB. So, no need to check.
if len(pod.Labels) != 0 {
for i, pdb := range pdbs {
if pdb.Namespace != pod.Namespace {
continue
}
selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
if err != nil {
// This object has an invalid selector, so it does not match the pod
continue
}
// A PDB with a nil or empty selector matches nothing.
if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
continue
}
// Existing in DisruptedPods means it has been processed by the API server,
// so we don't treat it as a violating case.
if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist {
continue
}
// Only decrement the matched pdb when it's not in its <DisruptedPods>;
// otherwise we may over-decrement the budget number.
pdbsAllowed[i]--
// We have found a matching PDB.
if pdbsAllowed[i] < 0 {
pdbForPodIsViolated = true
}
}
}
if pdbForPodIsViolated {
violatingPodInfos = append(violatingPodInfos, podInfo)
} else {
nonViolatingPodInfos = append(nonViolatingPodInfos, podInfo)
}
}
return violatingPodInfos, nonViolatingPodInfos
}
func getPDBLister(informerFactory informers.SharedInformerFactory) policylisters.PodDisruptionBudgetLister {
return informerFactory.Policy().V1().PodDisruptionBudgets().Lister()
}

View File

@ -0,0 +1,9 @@
# See the OWNERS docs at https://go.k8s.io/owners
reviewers:
- klueska
- pohly
- bart0sh
labels:
- sig/node
- wg/device-management

View File

@ -0,0 +1,175 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
"k8s.io/utils/ptr"
)
// foreachAllocatedDevice invokes the provided callback for each
// device in the claim's allocation result which was allocated
// exclusively for the claim.
//
// Devices allocated with admin access can be shared with other
// claims and are skipped without invoking the callback.
//
// foreachAllocatedDevice does nothing if the claim is not allocated.
func foreachAllocatedDevice(claim *resourceapi.ResourceClaim, cb func(deviceID structured.DeviceID)) {
if claim.Status.Allocation == nil {
return
}
for _, result := range claim.Status.Allocation.Devices.Results {
// Kubernetes 1.31 did not set this, 1.32 always does.
// Supporting 1.31 is not worth the additional code that
// would have to be written (= looking up in request) because
// it is extremely unlikely that there really is a result
// that still exists in a cluster from 1.31 where this matters.
if ptr.Deref(result.AdminAccess, false) {
// Is not considered as allocated.
continue
}
deviceID := structured.MakeDeviceID(result.Driver, result.Pool, result.Device)
// None of the users of this helper need to abort iterating,
// therefore it's not supported as it only would add overhead.
cb(deviceID)
}
}
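A minimal sketch (same package, hypothetical helper name) of how a caller might use foreachAllocatedDevice to collect a claim's exclusively allocated devices into a set:

// allocatedDeviceIDs is an illustrative helper, not part of the vendored file.
func allocatedDeviceIDs(claim *resourceapi.ResourceClaim) sets.Set[structured.DeviceID] {
	ids := sets.New[structured.DeviceID]()
	foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
		// Admin-access results were already skipped by the helper.
		ids.Insert(deviceID)
	})
	return ids
}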
// allocatedDevices reacts to events in a cache and maintains a set of all allocated devices.
// This is cheaper than repeatedly calling List, making strings unique, and building the set
// each time PreFilter is called.
//
// All methods are thread-safe. Get returns a cloned set.
type allocatedDevices struct {
logger klog.Logger
mutex sync.RWMutex
ids sets.Set[structured.DeviceID]
}
func newAllocatedDevices(logger klog.Logger) *allocatedDevices {
return &allocatedDevices{
logger: logger,
ids: sets.New[structured.DeviceID](),
}
}
func (a *allocatedDevices) Get() sets.Set[structured.DeviceID] {
a.mutex.RLock()
defer a.mutex.RUnlock()
return a.ids.Clone()
}
func (a *allocatedDevices) handlers() cache.ResourceEventHandler {
return cache.ResourceEventHandlerFuncs{
AddFunc: a.onAdd,
UpdateFunc: a.onUpdate,
DeleteFunc: a.onDelete,
}
}
func (a *allocatedDevices) onAdd(obj any) {
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onAdd")
return
}
if claim.Status.Allocation != nil {
a.addDevices(claim)
}
}
func (a *allocatedDevices) onUpdate(oldObj, newObj any) {
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onUpdate")
return
}
switch {
case originalClaim.Status.Allocation == nil && modifiedClaim.Status.Allocation != nil:
a.addDevices(modifiedClaim)
case originalClaim.Status.Allocation != nil && modifiedClaim.Status.Allocation == nil:
a.removeDevices(originalClaim)
default:
// Nothing to do. Either both nil or both non-nil, in which case the content
// also must be the same (immutable!).
}
}
func (a *allocatedDevices) onDelete(obj any) {
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
if err != nil {
// Shouldn't happen.
a.logger.Error(err, "unexpected object in allocatedDevices.onDelete")
return
}
a.removeDevices(claim)
}
func (a *allocatedDevices) addDevices(claim *resourceapi.ResourceClaim) {
if claim.Status.Allocation == nil {
return
}
// Locking of the mutex gets minimized by pre-computing what needs to be done
// without holding the lock.
deviceIDs := make([]structured.DeviceID, 0, 20)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
a.logger.V(6).Info("Observed device allocation", "device", deviceID, "claim", klog.KObj(claim))
deviceIDs = append(deviceIDs, deviceID)
})
a.mutex.Lock()
defer a.mutex.Unlock()
for _, deviceID := range deviceIDs {
a.ids.Insert(deviceID)
}
}
func (a *allocatedDevices) removeDevices(claim *resourceapi.ResourceClaim) {
if claim.Status.Allocation == nil {
return
}
// Locking of the mutex gets minimized by pre-computing what needs to be done
// without holding the lock.
deviceIDs := make([]structured.DeviceID, 0, 20)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
a.logger.V(6).Info("Observed device deallocation", "device", deviceID, "claim", klog.KObj(claim))
deviceIDs = append(deviceIDs, deviceID)
})
a.mutex.Lock()
defer a.mutex.Unlock()
for _, deviceID := range deviceIDs {
a.ids.Delete(deviceID)
}
}
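To show how the pieces above fit together, a short sketch (same package, hypothetical function) that drives the cache through its event handlers the way an informer would:

// exampleAllocatedDevices is illustrative only; in the scheduler the handlers are
// registered on the ResourceClaim assume cache (see the DRA manager below).
func exampleAllocatedDevices(logger klog.Logger, claim *resourceapi.ResourceClaim) sets.Set[structured.DeviceID] {
	allocated := newAllocatedDevices(logger)
	handlers := allocated.handlers()
	handlers.OnAdd(claim, true) // as an informer would during its initial sync
	return allocated.Get()      // cloned set of the currently allocated device IDs
}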


@ -0,0 +1,226 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"context"
"fmt"
"sync"
resourceapi "k8s.io/api/resource/v1beta1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/informers"
resourcelisters "k8s.io/client-go/listers/resource/v1beta1"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
var _ framework.SharedDRAManager = &DefaultDRAManager{}
// DefaultDRAManager is the default implementation of SharedDRAManager. It obtains the DRA objects
// from API informers, and uses an AssumeCache and a map of in-flight allocations in order
// to avoid race conditions when modifying ResourceClaims.
type DefaultDRAManager struct {
resourceClaimTracker *claimTracker
resourceSliceLister *resourceSliceLister
deviceClassLister *deviceClassLister
}
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
logger := klog.FromContext(ctx)
manager := &DefaultDRAManager{
resourceClaimTracker: &claimTracker{
cache: claimsCache,
inFlightAllocations: &sync.Map{},
allocatedDevices: newAllocatedDevices(logger),
logger: logger,
},
resourceSliceLister: &resourceSliceLister{sliceLister: informerFactory.Resource().V1beta1().ResourceSlices().Lister()},
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1beta1().DeviceClasses().Lister()},
}
// Reacting to events is more efficient than iterating over the list
// repeatedly in PreFilter.
manager.resourceClaimTracker.cache.AddEventHandler(manager.resourceClaimTracker.allocatedDevices.handlers())
return manager
}
func (s *DefaultDRAManager) ResourceClaims() framework.ResourceClaimTracker {
return s.resourceClaimTracker
}
func (s *DefaultDRAManager) ResourceSlices() framework.ResourceSliceLister {
return s.resourceSliceLister
}
func (s *DefaultDRAManager) DeviceClasses() framework.DeviceClassLister {
return s.deviceClassLister
}
var _ framework.ResourceSliceLister = &resourceSliceLister{}
type resourceSliceLister struct {
sliceLister resourcelisters.ResourceSliceLister
}
func (l *resourceSliceLister) List() ([]*resourceapi.ResourceSlice, error) {
return l.sliceLister.List(labels.Everything())
}
var _ framework.DeviceClassLister = &deviceClassLister{}
type deviceClassLister struct {
classLister resourcelisters.DeviceClassLister
}
func (l *deviceClassLister) Get(className string) (*resourceapi.DeviceClass, error) {
return l.classLister.Get(className)
}
func (l *deviceClassLister) List() ([]*resourceapi.DeviceClass, error) {
return l.classLister.List(labels.Everything())
}
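Putting the manager together: a small sketch (same package, hypothetical function, assuming the caller already built the claim assume cache and informer factory) of constructing it and reading DRA objects through its accessors:

// exampleDRAManager is illustrative only.
func exampleDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, factory informers.SharedInformerFactory) error {
	manager := NewDRAManager(ctx, claimsCache, factory)
	slices, err := manager.ResourceSlices().List()
	if err != nil {
		return err
	}
	klog.FromContext(ctx).V(5).Info("ResourceSlices visible to the scheduler", "count", len(slices))
	return nil
}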
var _ framework.ResourceClaimTracker = &claimTracker{}
type claimTracker struct {
// cache enables temporarily storing a newer claim object
// while the scheduler has allocated it and the corresponding object
// update from the apiserver has not been processed by the claim
// informer callbacks. Claims get added here in PreBind and removed by
// the informer callback (based on the "newer than" comparison in the
// assume cache).
//
// It uses cache.MetaNamespaceKeyFunc to generate object names, which
// therefore are "<namespace>/<name>".
//
// This is necessary to ensure that reconstructing the resource usage
// at the start of a pod scheduling cycle doesn't reuse the resources
// assigned to such a claim. Alternatively, claim allocation state
// could also get tracked across pod scheduling cycles, but that
// - adds complexity (need to carefully sync state with informer events
// for claims and ResourceSlices)
// - would make integration with cluster autoscaler harder because it would need
// to trigger informer callbacks.
cache *assumecache.AssumeCache
// inFlightAllocations is a map from claim UUIDs to claim objects for those claims
// for which allocation was triggered during a scheduling cycle and the
// corresponding claim status update call in PreBind has not been done
// yet. If another pod needs the claim, the pod is treated as "not
// schedulable yet". The cluster event for the claim status update will
// make it schedulable.
//
// This mechanism avoids the following problem:
// - Pod A triggers allocation for claim X.
// - Pod B shares access to that claim and gets scheduled because
// the claim is assumed to be allocated.
// - PreBind for pod B is called first, tries to update reservedFor and
// fails because the claim is not really allocated yet.
//
// We could avoid the ordering problem by allowing either pod A or pod B
// to set the allocation. But that is more complicated and leads to another
// problem:
// - Pod A and B get scheduled as above.
// - PreBind for pod A gets called first, then fails with a temporary API error.
// It removes the updated claim from the assume cache because of that.
// - PreBind for pod B gets called next and succeeds with adding the
// allocation and its own reservedFor entry.
// - The assume cache is now not reflecting that the claim is allocated,
// which could lead to reusing the same resource for some other claim.
//
// A sync.Map is used because in practice sharing of a claim between
// pods is expected to be rare compared to per-pod claims, so we end up
// hitting the "multiple goroutines read, write, and overwrite entries
// for disjoint sets of keys" case that sync.Map is optimized for.
inFlightAllocations *sync.Map
allocatedDevices *allocatedDevices
logger klog.Logger
}
func (c *claimTracker) ClaimHasPendingAllocation(claimUID types.UID) bool {
_, found := c.inFlightAllocations.Load(claimUID)
return found
}
func (c *claimTracker) SignalClaimPendingAllocation(claimUID types.UID, allocatedClaim *resourceapi.ResourceClaim) error {
c.inFlightAllocations.Store(claimUID, allocatedClaim)
// There's no reason to return an error in this implementation, but the error is helpful for other implementations.
// For example, implementations that have to deal with fake claims might want to return an error if the allocation
// is for an invalid claim.
return nil
}
func (c *claimTracker) RemoveClaimPendingAllocation(claimUID types.UID) (deleted bool) {
_, found := c.inFlightAllocations.LoadAndDelete(claimUID)
return found
}
func (c *claimTracker) Get(namespace, claimName string) (*resourceapi.ResourceClaim, error) {
obj, err := c.cache.Get(namespace + "/" + claimName)
if err != nil {
return nil, err
}
claim, ok := obj.(*resourceapi.ResourceClaim)
if !ok {
return nil, fmt.Errorf("unexpected object type %T for assumed object %s/%s", obj, namespace, claimName)
}
return claim, nil
}
func (c *claimTracker) List() ([]*resourceapi.ResourceClaim, error) {
var result []*resourceapi.ResourceClaim
// Probably not worth adding an index for?
objs := c.cache.List(nil)
for _, obj := range objs {
claim, ok := obj.(*resourceapi.ResourceClaim)
if ok {
result = append(result, claim)
}
}
return result, nil
}
func (c *claimTracker) ListAllAllocatedDevices() (sets.Set[structured.DeviceID], error) {
// Start with a fresh set that matches the current known state of the
// world according to the informers.
allocated := c.allocatedDevices.Get()
// Whatever is in flight also has to be checked.
c.inFlightAllocations.Range(func(key, value any) bool {
claim := value.(*resourceapi.ResourceClaim)
foreachAllocatedDevice(claim, func(deviceID structured.DeviceID) {
c.logger.V(6).Info("Device is in flight for allocation", "device", deviceID, "claim", klog.KObj(claim))
allocated.Insert(deviceID)
})
return true
})
// There's no reason to return an error in this implementation, but the error might be helpful for other implementations.
return allocated, nil
}
func (c *claimTracker) AssumeClaimAfterAPICall(claim *resourceapi.ResourceClaim) error {
return c.cache.Assume(claim)
}
func (c *claimTracker) AssumedClaimRestore(namespace, claimName string) {
c.cache.Restore(namespace + "/" + claimName)
}
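A rough sketch of the in-flight allocation life cycle as seen by a caller of the framework.ResourceClaimTracker interface (error handling trimmed, function name hypothetical):

// exampleInFlightAllocation walks through the happy path described in the
// inFlightAllocations comment above.
func exampleInFlightAllocation(tracker framework.ResourceClaimTracker, claim *resourceapi.ResourceClaim) error {
	// Reserve: remember the intended allocation before the API call in PreBind.
	if err := tracker.SignalClaimPendingAllocation(claim.UID, claim); err != nil {
		return err
	}
	// Other scheduling cycles now treat the claim as "being allocated".
	_ = tracker.ClaimHasPendingAllocation(claim.UID) // true
	// PreBind succeeded: keep the updated object in the assume cache ...
	if err := tracker.AssumeClaimAfterAPICall(claim); err != nil {
		return err
	}
	// ... and drop the in-flight marker.
	tracker.RemoveClaimPendingAllocation(claim.UID)
	return nil
}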


@ -0,0 +1,905 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package dynamicresources
import (
"context"
"errors"
"fmt"
"slices"
"sync"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1beta1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/util/retry"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/dynamic-resource-allocation/cel"
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/dynamic-resource-allocation/structured"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// Name is the name of the plugin used in Registry and configurations.
Name = names.DynamicResources
stateKey framework.StateKey = Name
)
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, later phases don't need to call the Write method to
// update the value.
type stateData struct {
// A copy of all claims for the Pod (i.e. 1:1 match with
// pod.Spec.ResourceClaims), initially with the status from the start
// of the scheduling cycle. Each claim instance is read-only because it
// might come from the informer cache. The instances get replaced when
// the plugin itself successfully does an Update.
//
// Empty if the Pod has no claims.
claims []*resourceapi.ResourceClaim
// Allocator handles claims with structured parameters.
allocator *structured.Allocator
// mutex must be locked while accessing any of the fields below.
mutex sync.Mutex
// The indices of all claims that:
// - are allocated
// - use delayed allocation or the builtin controller
// - were not available on at least one node
//
// Set in parallel during Filter, so write access there must be
// protected by the mutex. Used by PostFilter.
unavailableClaims sets.Set[int]
informationsForClaim []informationForClaim
// nodeAllocations caches the result of Filter for the nodes.
nodeAllocations map[string][]resourceapi.AllocationResult
}
func (d *stateData) Clone() framework.StateData {
return d
}
type informationForClaim struct {
// Node selector based on the claim status if allocated.
availableOnNodes *nodeaffinity.NodeSelector
// Set by Reserved, published by PreBind.
allocation *resourceapi.AllocationResult
}
// DynamicResources is a plugin that ensures that ResourceClaims are allocated.
type DynamicResources struct {
enabled bool
enableAdminAccess bool
enableSchedulingQueueHint bool
fh framework.Handle
clientset kubernetes.Interface
celCache *cel.Cache
draManager framework.SharedDRAManager
}
// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
if !fts.EnableDynamicResourceAllocation {
// Disabled, won't do anything.
return &DynamicResources{}, nil
}
pl := &DynamicResources{
enabled: true,
enableAdminAccess: fts.EnableDRAAdminAccess,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
fh: fh,
clientset: fh.ClientSet(),
// This is an LRU cache for compiled CEL expressions. The most
// recent 10 of them get reused across different scheduling
// cycles.
celCache: cel.NewCache(10),
draManager: fh.SharedDRAManager(),
}
return pl, nil
}
var _ framework.PreEnqueuePlugin = &DynamicResources{}
var _ framework.PreFilterPlugin = &DynamicResources{}
var _ framework.FilterPlugin = &DynamicResources{}
var _ framework.PostFilterPlugin = &DynamicResources{}
var _ framework.ReservePlugin = &DynamicResources{}
var _ framework.EnqueueExtensions = &DynamicResources{}
var _ framework.PreBindPlugin = &DynamicResources{}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *DynamicResources) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
func (pl *DynamicResources) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enabled {
return nil, nil
}
// A resource might depend on node labels for topology filtering.
// A new or updated node may make pods schedulable.
//
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// When QHint is enabled, the problematic preCheck is already removed, and we can remove UpdateNodeTaint.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
events := []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
// Adding the ResourceClaim name to the pod status makes pods waiting for their ResourceClaim schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodGeneratedResourceClaim}, QueueingHintFn: pl.isSchedulableAfterPodChange},
// A pod might be waiting for a class to get created or modified.
{Event: framework.ClusterEvent{Resource: framework.DeviceClass, ActionType: framework.Add | framework.Update}},
// Adding or updating a ResourceSlice might make a pod schedulable because new resources became available.
{Event: framework.ClusterEvent{Resource: framework.ResourceSlice, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterResourceSliceChange},
}
return events, nil
}
// PreEnqueue checks if there are known reasons why a pod currently cannot be
// scheduled. When this fails, one of the registered events can trigger another
// attempt.
func (pl *DynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
if !pl.enabled {
return nil
}
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
return statusUnschedulable(klog.FromContext(ctx), err.Error())
}
return nil
}
// isSchedulableAfterClaimChange is invoked for add and update claim events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable. It errs on the side of letting a pod scheduling attempt
// happen. The delete claim event will not invoke it, so newObj will never be nil.
func (pl *DynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
}
usesClaim := false
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
if claim.UID == modifiedClaim.UID {
usesClaim = true
}
}); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
if loggerV := logger.V(6); loggerV.Enabled() {
owner := metav1.GetControllerOf(modifiedClaim)
loggerV.Info("pod is not schedulable after resource claim change", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "claimOwner", owner, "reason", err.Error())
}
return framework.QueueSkip, nil
}
if originalClaim != nil &&
originalClaim.Status.Allocation != nil &&
modifiedClaim.Status.Allocation == nil {
// A claim with structured parameters was deallocated. This might have made
// resources available for other pods.
logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
if !usesClaim {
// This was not the claim the pod was waiting for.
logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.QueueSkip, nil
}
if originalClaim == nil {
logger.V(5).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
// Modifications may or may not be relevant. If the entire
// status is as before, then something else must have changed
// and we don't care. What happens in practice is that the
// resource driver adds the finalizer.
if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
if loggerV := logger.V(7); loggerV.Enabled() {
// Log more information.
loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
} else {
logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
}
return framework.QueueSkip, nil
}
logger.V(5).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
return framework.Queue, nil
}
// isSchedulableAfterPodChange is invoked for update pod events reported by
// an informer. It checks whether that change adds the ResourceClaim(s) that the
// pod has been waiting for.
func (pl *DynamicResources) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := schedutil.As[*v1.Pod](nil, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodChange: %w", err)
}
if pod.UID != modifiedPod.UID {
logger.V(7).Info("pod is not schedulable after change in other pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
if err := pl.foreachPodResourceClaim(modifiedPod, nil); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
logger.V(6).Info("pod is not schedulable after being updated", "pod", klog.KObj(pod))
return framework.QueueSkip, nil
}
logger.V(5).Info("pod got updated and is schedulable", "pod", klog.KObj(pod))
return framework.Queue, nil
}
// isSchedulableAfterResourceSliceChange is invoked for add and update slice events reported by
// an informer. Such changes can make an unschedulable pod schedulable when the pod requests a device
// and the change adds a suitable device.
//
// For the sake of faster execution and avoiding code duplication, isSchedulableAfterResourceSliceChange
// only checks whether the pod uses claims. All of the more detailed checks are done in the scheduling
// attempt.
//
// The delete claim event will not invoke it, so newObj will never be nil.
func (pl *DynamicResources) isSchedulableAfterResourceSliceChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedSlice, err := schedutil.As[*resourceapi.ResourceSlice](oldObj, newObj)
if err != nil {
// Shouldn't happen.
return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterResourceSliceChange: %w", err)
}
if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
// This is not an unexpected error: we know that
// foreachPodResourceClaim only returns errors for "not
// schedulable".
logger.V(6).Info("pod is not schedulable after resource slice change", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice), "reason", err.Error())
return framework.QueueSkip, nil
}
// We could check what got changed in the slice, but right now that's likely to be
// about the spec (there's no status yet...).
// We could check whether all claims use classic DRA, but that doesn't seem worth it.
// Let's assume that changing the slice may make the pod schedulable.
logger.V(5).Info("ResourceSlice change might make pod schedulable", "pod", klog.KObj(pod), "resourceSlice", klog.KObj(modifiedSlice))
return framework.Queue, nil
}
// podResourceClaims returns the ResourceClaims for all pod.Spec.ResourceClaims.
func (pl *DynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourceapi.ResourceClaim, error) {
claims := make([]*resourceapi.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourceapi.ResourceClaim) {
// We store the pointer as returned by the lister. The
// assumption is that if a claim gets modified while our code
// runs, the cache will store a new pointer, not mutate the
// existing object that we point to here.
claims = append(claims, claim)
}); err != nil {
return nil, err
}
return claims, nil
}
// foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
// It calls an optional handler for those claims that it finds.
func (pl *DynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourceapi.ResourceClaim)) error {
for _, resource := range pod.Spec.ResourceClaims {
claimName, mustCheckOwner, err := resourceclaim.Name(pod, &resource)
if err != nil {
return err
}
// The claim name might be nil if no underlying resource claim
// was generated for the referenced claim. There are valid use
// cases when this might happen, so we simply skip it.
if claimName == nil {
continue
}
claim, err := pl.draManager.ResourceClaims().Get(pod.Namespace, *claimName)
if err != nil {
return err
}
if claim.DeletionTimestamp != nil {
return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
}
if mustCheckOwner {
if err := resourceclaim.IsForPod(pod, claim); err != nil {
return err
}
}
if cb != nil {
cb(resource.Name, claim)
}
}
return nil
}
// PreFilter invoked at the prefilter extension point to check if pod has all
// immediate claims bound. UnschedulableAndUnresolvable is returned if
// the pod cannot be scheduled at the moment on any node.
func (pl *DynamicResources) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
if !pl.enabled {
return nil, framework.NewStatus(framework.Skip)
}
logger := klog.FromContext(ctx)
// If the pod does not reference any claim, we don't need to do
// anything for it. We just initialize an empty state to record that
// observation for the other functions. This gets updated below
// if we get that far.
s := &stateData{}
state.Write(stateKey, s)
claims, err := pl.podResourceClaims(pod)
if err != nil {
return nil, statusUnschedulable(logger, err.Error())
}
logger.V(5).Info("pod resource claims", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(claims))
// If the pod does not reference any claim,
// DynamicResources Filter has nothing to do with the Pod.
if len(claims) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
// All claims which the scheduler needs to allocate itself.
allocateClaims := make([]*resourceapi.ResourceClaim, 0, len(claims))
s.informationsForClaim = make([]informationForClaim, len(claims))
for index, claim := range claims {
if claim.Status.Allocation != nil &&
!resourceclaim.CanBeReserved(claim) &&
!resourceclaim.IsReservedForPod(pod, claim) {
// Resource is in use. The pod has to wait.
return nil, statusUnschedulable(logger, "resourceclaim in use", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
}
if claim.Status.Allocation != nil {
if claim.Status.Allocation.NodeSelector != nil {
nodeSelector, err := nodeaffinity.NewNodeSelector(claim.Status.Allocation.NodeSelector)
if err != nil {
return nil, statusError(logger, err)
}
s.informationsForClaim[index].availableOnNodes = nodeSelector
}
} else {
allocateClaims = append(allocateClaims, claim)
// Allocation in flight? Better wait for that
// to finish, see inFlightAllocations
// documentation for details.
if pl.draManager.ResourceClaims().ClaimHasPendingAllocation(claim.UID) {
return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s is in the process of being allocated", klog.KObj(claim)))
}
// Check all requests and device classes. If a class
// does not exist, scheduling cannot proceed, no matter
// how the claim is being allocated.
//
// When using a control plane controller, a class might
// have a node filter. This is useful for trimming the
// initial set of potential nodes before we ask the
// driver(s) for information about the specific pod.
for _, request := range claim.Spec.Devices.Requests {
if request.DeviceClassName == "" {
return nil, statusError(logger, fmt.Errorf("request %s: unsupported request type", request.Name))
}
_, err := pl.draManager.DeviceClasses().Get(request.DeviceClassName)
if err != nil {
// If the class cannot be retrieved, allocation cannot proceed.
if apierrors.IsNotFound(err) {
// Here we mark the pod as "unschedulable", so it'll sleep in
// the unschedulable queue until a DeviceClass event occurs.
return nil, statusUnschedulable(logger, fmt.Sprintf("request %s: device class %s does not exist", request.Name, request.DeviceClassName))
}
// Other error, retry with backoff.
return nil, statusError(logger, fmt.Errorf("request %s: look up device class: %w", request.Name, err))
}
}
}
}
if len(allocateClaims) > 0 {
logger.V(5).Info("Preparing allocation with structured parameters", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(allocateClaims))
// Doing this over and over again for each pod could be avoided
// by setting the allocator up once and then keeping it up-to-date
// as changes are observed.
//
// But that would cause problems for using the plugin in the
// Cluster Autoscaler. If this step here turns out to be
// expensive, we may have to maintain and update state more
// persistently.
//
// Claims (and thus their devices) are treated as "allocated" if they are in the assume cache
// or currently their allocation is in-flight. This does not change
// during filtering, so we can determine that once.
allAllocatedDevices, err := pl.draManager.ResourceClaims().ListAllAllocatedDevices()
if err != nil {
return nil, statusError(logger, err)
}
slices, err := pl.draManager.ResourceSlices().List()
if err != nil {
return nil, statusError(logger, err)
}
allocator, err := structured.NewAllocator(ctx, pl.enableAdminAccess, allocateClaims, allAllocatedDevices, pl.draManager.DeviceClasses(), slices, pl.celCache)
if err != nil {
return nil, statusError(logger, err)
}
s.allocator = allocator
s.nodeAllocations = make(map[string][]resourceapi.AllocationResult)
}
s.claims = claims
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *DynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(stateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the resources it requests,
// for both allocated and unallocated claims.
//
// For claims that are bound, it checks that the node affinity is
// satisfied by the given node.
//
// For claims that are unbound, it checks whether the claim might get allocated
// for the node.
func (pl *DynamicResources) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
var unavailableClaims []int
for index, claim := range state.claims {
logger.V(10).Info("filtering based on resource claims of the pod", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
// This node selector only gets set if the claim is allocated.
if nodeSelector := state.informationsForClaim[index].availableOnNodes; nodeSelector != nil && !nodeSelector.Match(node) {
logger.V(5).Info("allocation's node selector does not match", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
unavailableClaims = append(unavailableClaims, index)
}
}
// Use allocator to check the node and cache the result in case that the node is picked.
var allocations []resourceapi.AllocationResult
if state.allocator != nil {
allocCtx := ctx
if loggerV := logger.V(5); loggerV.Enabled() {
allocCtx = klog.NewContext(allocCtx, klog.LoggerWithValues(logger, "node", klog.KObj(node)))
}
a, err := state.allocator.Allocate(allocCtx, node)
if err != nil {
// This should only fail if there is something wrong with the claim or class.
// Return an error to abort scheduling of it.
//
// This will cause retries. It would be slightly nicer to mark it as unschedulable
// *and* abort scheduling. Then only a cluster event updating the claim or class
// with the broken CEL expression would trigger rescheduling.
//
// But we cannot do both. As this shouldn't occur often, aborting like this is
// better than the more complicated alternative (return Unschedulable here, remember
// the error, then raise it again later if needed).
return statusError(logger, err, "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
}
// Check for exact length just to be sure. In practice this is all-or-nothing.
if len(a) != len(state.allocator.ClaimsToAllocate()) {
return statusUnschedulable(logger, "cannot allocate all claims", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaims", klog.KObjSlice(state.allocator.ClaimsToAllocate()))
}
// Reserve uses this information.
allocations = a
}
// Store information in state while holding the mutex.
if state.allocator != nil || len(unavailableClaims) > 0 {
state.mutex.Lock()
defer state.mutex.Unlock()
}
if len(unavailableClaims) > 0 {
// Remember all unavailable claims. This might be observed
// concurrently, so we have to lock the state before writing.
if state.unavailableClaims == nil {
state.unavailableClaims = sets.New[int]()
}
for _, index := range unavailableClaims {
state.unavailableClaims.Insert(index)
}
return statusUnschedulable(logger, "resourceclaim not available on the node", "pod", klog.KObj(pod))
}
if state.allocator != nil {
state.nodeAllocations[node.Name] = allocations
}
return nil
}
// PostFilter checks whether there are allocated claims that could get
// deallocated to help get the Pod schedulable. If yes, it picks one and
// requests its deallocation. This only gets called when filtering found no
// suitable node.
func (pl *DynamicResources) PostFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, filteredNodeStatusMap framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
if !pl.enabled {
return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
}
logger := klog.FromContext(ctx)
state, err := getStateData(cs)
if err != nil {
return nil, statusError(logger, err)
}
if len(state.claims) == 0 {
return nil, framework.NewStatus(framework.Unschedulable, "no new claims to deallocate")
}
// Iterating over a map is random. This is intentional here: we want to
// pick one claim randomly because there is no better heuristic.
for index := range state.unavailableClaims {
claim := state.claims[index]
if len(claim.Status.ReservedFor) == 0 ||
len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
claim := claim.DeepCopy()
claim.Status.ReservedFor = nil
claim.Status.Allocation = nil
logger.V(5).Info("Deallocation of ResourceClaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
if _, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
return nil, statusError(logger, err)
}
return nil, framework.NewStatus(framework.Unschedulable, "deallocation of ResourceClaim completed")
}
}
return nil, framework.NewStatus(framework.Unschedulable, "still not schedulable")
}
// Reserve reserves claims for the pod.
func (pl *DynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (status *framework.Status) {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
numClaimsWithAllocator := 0
for _, claim := range state.claims {
if claim.Status.Allocation != nil {
// Allocated, but perhaps not reserved yet. We checked in PreFilter that
// the pod could reserve the claim. Instead of reserving here by
// updating the ResourceClaim status, we assume that reserving
// will work and only do it for real during binding. If it fails at
// that time, some other pod was faster and we have to try again.
continue
}
numClaimsWithAllocator++
}
if numClaimsWithAllocator == 0 {
// Nothing left to do.
return nil
}
// Prepare allocation of claims handled by the scheduler.
if state.allocator != nil {
// Entries in these two slices match each other.
claimsToAllocate := state.allocator.ClaimsToAllocate()
allocations, ok := state.nodeAllocations[nodeName]
if !ok {
// We checked before that the node is suitable. This shouldn't have failed,
// so treat this as an error.
return statusError(logger, errors.New("claim allocation not found for node"))
}
// Sanity check: do we have results for all pending claims?
if len(allocations) != len(claimsToAllocate) ||
len(allocations) != numClaimsWithAllocator {
return statusError(logger, fmt.Errorf("internal error, have %d allocations, %d claims to allocate, want %d claims", len(allocations), len(claimsToAllocate), numClaimsWithAllocator))
}
for i, claim := range claimsToAllocate {
index := slices.Index(state.claims, claim)
if index < 0 {
return statusError(logger, fmt.Errorf("internal error, claim %s with allocation not found", claim.Name))
}
allocation := &allocations[i]
state.informationsForClaim[index].allocation = allocation
// Strictly speaking, we don't need to store the full modified object.
// The allocation would be enough. The full object is useful for
// debugging, testing and the allocator, so let's make it realistic.
claim = claim.DeepCopy()
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
}
claim.Status.Allocation = allocation
err := pl.draManager.ResourceClaims().SignalClaimPendingAllocation(claim.UID, claim)
if err != nil {
return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name))
}
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation))
}
}
return nil
}
// Unreserve clears the ReservedFor field for all claims.
// It's idempotent, and does nothing if no state found for the given pod.
func (pl *DynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
if !pl.enabled {
return
}
state, err := getStateData(cs)
if err != nil {
return
}
if len(state.claims) == 0 {
return
}
logger := klog.FromContext(ctx)
for index, claim := range state.claims {
// If allocation was in-flight, then it's not anymore and we need to revert the
// claim object in the assume cache to what it was before.
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(state.claims[index].UID); deleted {
pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name)
}
if claim.Status.Allocation != nil &&
resourceclaim.IsReservedForPod(pod, claim) {
// Remove pod from ReservedFor. A strategic-merge-patch is used
// because that allows removing an individual entry without having
// the latest slice.
patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`,
claim.UID,
pod.UID,
)
logger.V(5).Info("unreserve", "resourceclaim", klog.KObj(claim), "pod", klog.KObj(pod))
claim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
if err != nil {
// We will get here again when pod scheduling is retried.
logger.Error(err, "unreserve", "resourceclaim", klog.KObj(claim))
}
}
}
}
// PreBind gets called in a separate goroutine after it has been determined
// that the pod should get bound to this node. Because Reserve did not actually
// reserve claims, we need to do it now. For claims with the builtin controller,
// we also handle the allocation.
//
// If anything fails, we return an error and
// the pod will have to go into the backoff queue. The scheduler will call
// Unreserve as part of the error handling.
func (pl *DynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
if !pl.enabled {
return nil
}
state, err := getStateData(cs)
if err != nil {
return statusError(klog.FromContext(ctx), err)
}
if len(state.claims) == 0 {
return nil
}
logger := klog.FromContext(ctx)
for index, claim := range state.claims {
if !resourceclaim.IsReservedForPod(pod, claim) {
claim, err := pl.bindClaim(ctx, state, index, pod, nodeName)
if err != nil {
return statusError(logger, err)
}
state.claims[index] = claim
}
}
// If we get here, we know that reserving the claim for
// the pod worked and we can proceed with binding it.
return nil
}
// bindClaim gets called by PreBind for each claim which is not reserved for the pod yet.
// It might not even be allocated. bindClaim then ensures that the allocation
// and reservation are recorded. This finishes the work started in Reserve.
func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourceapi.ResourceClaim, finalErr error) {
logger := klog.FromContext(ctx)
claim := state.claims[index].DeepCopy()
allocation := state.informationsForClaim[index].allocation
defer func() {
if allocation != nil {
// The scheduler was handling allocation. Now that has
// completed, either successfully or with a failure.
if finalErr == nil {
// This can fail, but only for reasons that are okay (concurrent delete or update).
// Shouldn't happen in this case.
if err := pl.draManager.ResourceClaims().AssumeClaimAfterAPICall(claim); err != nil {
logger.V(5).Info("Claim not stored in assume cache", "err", finalErr)
}
}
pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID)
}
}()
logger.V(5).Info("preparing claim status update", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
// We may run into a ResourceVersion conflict because there may be some
// benign concurrent changes. In that case we get the latest claim and
// try again.
refreshClaim := false
retryErr := retry.RetryOnConflict(retry.DefaultRetry, func() error {
if refreshClaim {
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(ctx, claim.Name, metav1.GetOptions{})
if err != nil {
return fmt.Errorf("get updated claim %s after conflict: %w", klog.KObj(claim), err)
}
logger.V(5).Info("retrying update after conflict", "claim", klog.KObj(claim))
claim = updatedClaim
} else {
// All future retries must get a new claim first.
refreshClaim = true
}
if claim.DeletionTimestamp != nil {
return fmt.Errorf("claim %s got deleted in the meantime", klog.KObj(claim))
}
// Do we need to store an allocation result from Reserve?
if allocation != nil {
if claim.Status.Allocation != nil {
return fmt.Errorf("claim %s got allocated elsewhere in the meantime", klog.KObj(claim))
}
// The finalizer needs to be added in a normal update.
// If we were interrupted in the past, it might already be set and we simply continue.
if !slices.Contains(claim.Finalizers, resourceapi.Finalizer) {
claim.Finalizers = append(claim.Finalizers, resourceapi.Finalizer)
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
if err != nil {
return fmt.Errorf("add finalizer to claim %s: %w", klog.KObj(claim), err)
}
claim = updatedClaim
}
claim.Status.Allocation = allocation
}
// We can simply try to add the pod here without checking
// preconditions. The apiserver will tell us with a
// non-conflict error if this isn't possible.
claim.Status.ReservedFor = append(claim.Status.ReservedFor, resourceapi.ResourceClaimConsumerReference{Resource: "pods", Name: pod.Name, UID: pod.UID})
updatedClaim, err := pl.clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
if err != nil {
if allocation != nil {
return fmt.Errorf("add allocation and reservation to claim %s: %w", klog.KObj(claim), err)
}
return fmt.Errorf("add reservation to claim %s: %w", klog.KObj(claim), err)
}
claim = updatedClaim
return nil
})
if retryErr != nil {
return nil, retryErr
}
logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim))
return claim, nil
}
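The conflict-retry loop above follows the standard client-go pattern. A condensed sketch (same package, hypothetical function, no finalizer or allocation handling) of just that pattern for adding a reservation:

// exampleReserveClaim is illustrative only; bindClaim above is the real thing.
func exampleReserveClaim(ctx context.Context, clientset kubernetes.Interface, pod *v1.Pod, claim *resourceapi.ResourceClaim) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		current, err := clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).Get(ctx, claim.Name, metav1.GetOptions{})
		if err != nil {
			return err
		}
		current.Status.ReservedFor = append(current.Status.ReservedFor,
			resourceapi.ResourceClaimConsumerReference{Resource: "pods", Name: pod.Name, UID: pod.UID})
		_, err = clientset.ResourceV1beta1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, current, metav1.UpdateOptions{})
		return err
	})
}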
// statusUnschedulable ensures that there is a log message associated with the
// line where the status originated.
func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
if loggerV := logger.V(5); loggerV.Enabled() {
helper, loggerV := loggerV.WithCallStackHelper()
helper()
kv = append(kv, "reason", reason)
// nolint: logcheck // warns because it cannot check key/values
loggerV.Info("pod unschedulable", kv...)
}
return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason)
}
// statusError ensures that there is a log message associated with the
// line where the error originated.
func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status {
if loggerV := logger.V(5); loggerV.Enabled() {
helper, loggerV := loggerV.WithCallStackHelper()
helper()
// nolint: logcheck // warns because it cannot check key/values
loggerV.Error(err, "dynamic resource plugin failed", kv...)
}
return framework.AsStatus(err)
}


@ -0,0 +1,33 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package feature
// Features carries feature gate values used by various plugins.
// This struct allows us to break the dependency of the plugins on
// the internal k8s features pkg.
type Features struct {
EnableDRAAdminAccess bool
EnableDynamicResourceAllocation bool
EnableVolumeCapacityPriority bool
EnableNodeInclusionPolicyInPodTopologySpread bool
EnableMatchLabelKeysInPodTopologySpread bool
EnableInPlacePodVerticalScaling bool
EnableSidecarContainers bool
EnableSchedulingQueueHint bool
EnableAsyncPreemption bool
EnablePodLevelResources bool
}


@ -0,0 +1,55 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import (
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// DefaultNormalizeScore generates a Normalize Score function that can normalize the
// scores from [0, max(scores)] to [0, maxPriority]. If reverse is set to true, it
// reverses the scores by subtracting each of them from maxPriority.
// Note: The input scores are always assumed to be non-negative integers.
func DefaultNormalizeScore(maxPriority int64, reverse bool, scores framework.NodeScoreList) *framework.Status {
var maxCount int64
for i := range scores {
if scores[i].Score > maxCount {
maxCount = scores[i].Score
}
}
if maxCount == 0 {
if reverse {
for i := range scores {
scores[i].Score = maxPriority
}
}
return nil
}
for i := range scores {
score := scores[i].Score
score = maxPriority * score / maxCount
if reverse {
score = maxPriority - score
}
scores[i].Score = score
}
return nil
}
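For illustration, a tiny standalone program (made-up node names and scores, assuming the vendored scheduler packages are importable) showing what the normalization does:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)

func main() {
	scores := framework.NodeScoreList{
		{Name: "node-a", Score: 10},
		{Name: "node-b", Score: 20},
		{Name: "node-c", Score: 40},
	}
	// Scales [0, 40] to [0, 100]: the scores become 25, 50 and 100.
	helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
	fmt.Println(scores)
}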


@ -0,0 +1,52 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
// FunctionShape represents a collection of FunctionShapePoint.
type FunctionShape []FunctionShapePoint
// FunctionShapePoint represents a shape point.
type FunctionShapePoint struct {
// Utilization is function argument.
Utilization int64
// Score is function value.
Score int64
}
// BuildBrokenLinearFunction creates a function which is built using linear segments. Segments are defined via shape array.
// Shape[i].Utilization slice represents points on "Utilization" axis where different segments meet.
// Shape[i].Score represents function values at meeting points.
//
// function f(p) is defined as:
//
// shape[0].Score for p < shape[0].Utilization
// shape[n-1].Score for p > shape[n-1].Utilization
//
// and linear between points (p < shape[i].Utilization)
func BuildBrokenLinearFunction(shape FunctionShape) func(int64) int64 {
return func(p int64) int64 {
for i := 0; i < len(shape); i++ {
if p <= int64(shape[i].Utilization) {
if i == 0 {
return shape[0].Score
}
return shape[i-1].Score + (shape[i].Score-shape[i-1].Score)*(p-shape[i-1].Utilization)/(shape[i].Utilization-shape[i-1].Utilization)
}
}
return shape[len(shape)-1].Score
}
}
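A worked example (hypothetical two-point shape) of the resulting function:

// exampleBrokenLinear is illustrative only (same package).
func exampleBrokenLinear() {
	f := BuildBrokenLinearFunction(FunctionShape{
		{Utilization: 0, Score: 0},
		{Utilization: 100, Score: 10},
	})
	_ = f(50)  // 5: linear interpolation between the two points
	_ = f(150) // 10: clamped to the last point's score
}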


@ -0,0 +1,116 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import (
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
appslisters "k8s.io/client-go/listers/apps/v1"
corelisters "k8s.io/client-go/listers/core/v1"
)
var (
rcKind = v1.SchemeGroupVersion.WithKind("ReplicationController")
rsKind = appsv1.SchemeGroupVersion.WithKind("ReplicaSet")
ssKind = appsv1.SchemeGroupVersion.WithKind("StatefulSet")
)
// DefaultSelector returns a selector deduced from the Services, Replication
// Controllers, Replica Sets, and Stateful Sets matching the given pod.
func DefaultSelector(
pod *v1.Pod,
sl corelisters.ServiceLister,
cl corelisters.ReplicationControllerLister,
rsl appslisters.ReplicaSetLister,
ssl appslisters.StatefulSetLister,
) labels.Selector {
labelSet := make(labels.Set)
// Since services, RCs, RSs and SSs match the pod, they won't have conflicting
// labels. Merging is safe.
if services, err := GetPodServices(sl, pod); err == nil {
for _, service := range services {
labelSet = labels.Merge(labelSet, service.Spec.Selector)
}
}
selector := labelSet.AsSelector()
owner := metav1.GetControllerOfNoCopy(pod)
if owner == nil {
return selector
}
gv, err := schema.ParseGroupVersion(owner.APIVersion)
if err != nil {
return selector
}
gvk := gv.WithKind(owner.Kind)
switch gvk {
case rcKind:
if rc, err := cl.ReplicationControllers(pod.Namespace).Get(owner.Name); err == nil {
labelSet = labels.Merge(labelSet, rc.Spec.Selector)
selector = labelSet.AsSelector()
}
case rsKind:
if rs, err := rsl.ReplicaSets(pod.Namespace).Get(owner.Name); err == nil {
if other, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector); err == nil {
if r, ok := other.Requirements(); ok {
selector = selector.Add(r...)
}
}
}
case ssKind:
if ss, err := ssl.StatefulSets(pod.Namespace).Get(owner.Name); err == nil {
if other, err := metav1.LabelSelectorAsSelector(ss.Spec.Selector); err == nil {
if r, ok := other.Requirements(); ok {
selector = selector.Add(r...)
}
}
}
default:
// Not owned by a supported controller.
}
return selector
}
// GetPodServices gets the services whose selectors match the labels on the given pod.
func GetPodServices(sl corelisters.ServiceLister, pod *v1.Pod) ([]*v1.Service, error) {
allServices, err := sl.Services(pod.Namespace).List(labels.Everything())
if err != nil {
return nil, err
}
var services []*v1.Service
for i := range allServices {
service := allServices[i]
if service.Spec.Selector == nil {
// services with nil selectors match nothing, not everything.
continue
}
selector := labels.Set(service.Spec.Selector).AsSelectorPreValidated()
if selector.Matches(labels.Set(pod.Labels)) {
services = append(services, service)
}
}
return services, nil
}


@ -0,0 +1,28 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package helper
import v1 "k8s.io/api/core/v1"
// DoNotScheduleTaintsFilterFunc returns the filter function that can
// filter out the node taints that reject scheduling Pod on a Node.
func DoNotScheduleTaintsFilterFunc() func(t *v1.Taint) bool {
return func(t *v1.Taint) bool {
// PodToleratesNodeTaints is only interested in NoSchedule and NoExecute taints.
return t.Effect == v1.TaintEffectNoSchedule || t.Effect == v1.TaintEffectNoExecute
}
}
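A short standalone illustration (made-up taint keys, assuming the vendored packages are importable):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)

func main() {
	filter := helper.DoNotScheduleTaintsFilterFunc()
	noSchedule := &v1.Taint{Key: "example.com/maintenance", Effect: v1.TaintEffectNoSchedule}
	prefer := &v1.Taint{Key: "example.com/maintenance", Effect: v1.TaintEffectPreferNoSchedule}
	fmt.Println(filter(noSchedule)) // true: this taint rejects scheduling
	fmt.Println(filter(prefer))     // false: only a preference, not filtered
}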


@ -0,0 +1,132 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package imagelocality
import (
"context"
"fmt"
"strings"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// The two thresholds are used as bounds for the image score range. They correspond to a reasonable size range for
// container images compressed and stored in registries; the 90th percentile of images on Docker Hub falls into this range.
const (
mb int64 = 1024 * 1024
minThreshold int64 = 23 * mb
maxContainerThreshold int64 = 1000 * mb
)
// ImageLocality is a score plugin that favors nodes that already have requested pod container's images.
type ImageLocality struct {
handle framework.Handle
}
var _ framework.ScorePlugin = &ImageLocality{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.ImageLocality
// Name returns name of the plugin. It is used in logs, etc.
func (pl *ImageLocality) Name() string {
return Name
}
// Score invoked at the score extension point.
func (pl *ImageLocality) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
nodeInfos, err := pl.handle.SnapshotSharedLister().NodeInfos().List()
if err != nil {
return 0, framework.AsStatus(err)
}
totalNumNodes := len(nodeInfos)
imageScores := sumImageScores(nodeInfo, pod, totalNumNodes)
score := calculatePriority(imageScores, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
return score, nil
}
// ScoreExtensions of the Score plugin.
func (pl *ImageLocality) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle) (framework.Plugin, error) {
return &ImageLocality{handle: h}, nil
}
// calculatePriority returns the priority of a node. Given the sumScores of requested images on the node, the node's
// priority is obtained by scaling the maximum priority value with a ratio proportional to the sumScores.
func calculatePriority(sumScores int64, numContainers int) int64 {
maxThreshold := maxContainerThreshold * int64(numContainers)
if sumScores < minThreshold {
sumScores = minThreshold
} else if sumScores > maxThreshold {
sumScores = maxThreshold
}
return framework.MaxNodeScore * (sumScores - minThreshold) / (maxThreshold - minThreshold)
}
// sumImageScores returns the sum of image scores of the pod's containers whose images are already present on the node.
// Each image receives a raw score equal to its size, scaled by scaledImageScore. The raw scores are later used to calculate
// the final score.
func sumImageScores(nodeInfo *framework.NodeInfo, pod *v1.Pod, totalNumNodes int) int64 {
var sum int64
for _, container := range pod.Spec.InitContainers {
if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
sum += scaledImageScore(state, totalNumNodes)
}
}
for _, container := range pod.Spec.Containers {
if state, ok := nodeInfo.ImageStates[normalizedImageName(container.Image)]; ok {
sum += scaledImageScore(state, totalNumNodes)
}
}
return sum
}
// scaledImageScore returns an adaptively scaled score for the given state of an image.
// The size of the image is used as the base score, scaled by a factor which considers how many nodes the image has "spread" to.
// This heuristic aims to mitigate the undesirable "node heating problem", i.e., pods get assigned to the same or
// a few nodes due to image locality.
func scaledImageScore(imageState *framework.ImageStateSummary, totalNumNodes int) int64 {
spread := float64(imageState.NumNodes) / float64(totalNumNodes)
return int64(float64(imageState.Size) * spread)
}
// normalizedImageName returns the CRI compliant name for a given image.
// TODO: cover the corner cases of missed matches, e.g.,
// 1. Using Docker as runtime and docker.io/library/test:tag in pod spec, but only test:tag will be present in node status
// 2. Using the implicit registry, i.e., test:tag or library/test:tag in pod spec but only docker.io/library/test:tag
// in node status; note that if users consistently use one registry format, this should not happen.
func normalizedImageName(name string) string {
if strings.LastIndex(name, ":") <= strings.LastIndex(name, "/") {
name = name + ":latest"
}
return name
}
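To make the arithmetic above concrete, here is a hedged standalone sketch that reproduces scaledImageScore and calculatePriority with made-up numbers (a 500 MB image already pulled on 2 of 10 nodes, a pod with one container); it prints 7:

package main

import "fmt"

const (
    mb                    int64 = 1024 * 1024
    minThreshold          int64 = 23 * mb
    maxContainerThreshold int64 = 1000 * mb
    maxNodeScore          int64 = 100 // stands in for framework.MaxNodeScore
)

func main() {
    // Hypothetical: a 500 MB image already pulled on 2 of 10 nodes.
    imageSize := 500 * mb
    spread := float64(2) / float64(10)
    sumScores := int64(float64(imageSize) * spread) // scaledImageScore

    // calculatePriority for a pod with a single container.
    numContainers := 1
    maxThreshold := maxContainerThreshold * int64(numContainers)
    if sumScores < minThreshold {
        sumScores = minThreshold
    } else if sumScores > maxThreshold {
        sumScores = maxThreshold
    }
    score := maxNodeScore * (sumScores - minThreshold) / (maxThreshold - minThreshold)
    fmt.Println(score) // 7
}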

View File

@ -0,0 +1,386 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const (
// preFilterStateKey is the key in CycleState to InterPodAffinity pre-computed data for Filtering.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReasonExistingAntiAffinityRulesNotMatch is used for ExistingPodsAntiAffinityRulesNotMatch predicate error.
ErrReasonExistingAntiAffinityRulesNotMatch = "node(s) didn't satisfy existing pods anti-affinity rules"
// ErrReasonAffinityRulesNotMatch is used for PodAffinityRulesNotMatch predicate error.
ErrReasonAffinityRulesNotMatch = "node(s) didn't match pod affinity rules"
// ErrReasonAntiAffinityRulesNotMatch is used for PodAntiAffinityRulesNotMatch predicate error.
ErrReasonAntiAffinityRulesNotMatch = "node(s) didn't match pod anti-affinity rules"
)
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
// A map of topology pairs to the number of existing pods that have anti-affinity terms that match the "pod".
existingAntiAffinityCounts topologyToMatchedTermCount
// A map of topology pairs to the number of existing pods that match the affinity terms of the "pod".
affinityCounts topologyToMatchedTermCount
// A map of topology pairs to the number of existing pods that match the anti-affinity terms of the "pod".
antiAffinityCounts topologyToMatchedTermCount
// podInfo of the incoming pod.
podInfo *framework.PodInfo
// A copy of the incoming pod's namespace labels.
namespaceLabels labels.Set
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
copy := preFilterState{}
copy.affinityCounts = s.affinityCounts.clone()
copy.antiAffinityCounts = s.antiAffinityCounts.clone()
copy.existingAntiAffinityCounts = s.existingAntiAffinityCounts.clone()
// No need to deep copy the podInfo because it shouldn't change.
copy.podInfo = s.podInfo
copy.namespaceLabels = s.namespaceLabels
return &copy
}
// updateWithPod updates the preFilterState counters with the (anti)affinity matches for the given podInfo.
func (s *preFilterState) updateWithPod(pInfo *framework.PodInfo, node *v1.Node, multiplier int64) {
if s == nil {
return
}
s.existingAntiAffinityCounts.updateWithAntiAffinityTerms(pInfo.RequiredAntiAffinityTerms, s.podInfo.Pod, s.namespaceLabels, node, multiplier)
s.affinityCounts.updateWithAffinityTerms(s.podInfo.RequiredAffinityTerms, pInfo.Pod, node, multiplier)
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the updated pod's namespace labels, hence passing nil for nsLabels.
s.antiAffinityCounts.updateWithAntiAffinityTerms(s.podInfo.RequiredAntiAffinityTerms, pInfo.Pod, nil, node, multiplier)
}
type topologyPair struct {
key string
value string
}
type topologyToMatchedTermCount map[topologyPair]int64
func (m topologyToMatchedTermCount) append(toAppend topologyToMatchedTermCount) {
for pair := range toAppend {
m[pair] += toAppend[pair]
}
}
func (m topologyToMatchedTermCount) clone() topologyToMatchedTermCount {
copy := make(topologyToMatchedTermCount, len(m))
copy.append(m)
return copy
}
func (m topologyToMatchedTermCount) update(node *v1.Node, tk string, value int64) {
if tv, ok := node.Labels[tk]; ok {
pair := topologyPair{key: tk, value: tv}
m[pair] += value
// value could be negative, hence we delete the entry if it is down to zero.
if m[pair] == 0 {
delete(m, pair)
}
}
}
// updates the topologyToMatchedTermCount map with the specified value
// for each affinity term if "targetPod" matches ALL terms.
func (m topologyToMatchedTermCount) updateWithAffinityTerms(
terms []framework.AffinityTerm, pod *v1.Pod, node *v1.Node, value int64) {
if podMatchesAllAffinityTerms(terms, pod) {
for _, t := range terms {
m.update(node, t.TopologyKey, value)
}
}
}
// updates the topologyToMatchedTermCount map with the specified value
// for each anti-affinity term that matches the target pod.
func (m topologyToMatchedTermCount) updateWithAntiAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, value int64) {
// Check anti-affinity terms.
for _, t := range terms {
if t.Matches(pod, nsLabels) {
m.update(node, t.TopologyKey, value)
}
}
}
// returns true IFF the given pod matches all the given terms.
func podMatchesAllAffinityTerms(terms []framework.AffinityTerm, pod *v1.Pod) bool {
if len(terms) == 0 {
return false
}
for _, t := range terms {
// The incoming pod NamespaceSelector was merged into the Namespaces set, and so
// we are not explicitly passing in namespace labels.
if !t.Matches(pod, nil) {
return false
}
}
return true
}
// calculates the following for each existing pod on each node:
// 1. Whether it has PodAntiAffinity
// 2. Whether any AntiAffinityTerm matches the incoming pod
func (pl *InterPodAffinity) getExistingAntiAffinityCounts(ctx context.Context, pod *v1.Pod, nsLabels labels.Set, nodes []*framework.NodeInfo) topologyToMatchedTermCount {
topoMaps := make([]topologyToMatchedTermCount, len(nodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := nodes[i]
node := nodeInfo.Node()
topoMap := make(topologyToMatchedTermCount)
for _, existingPod := range nodeInfo.PodsWithRequiredAntiAffinity {
topoMap.updateWithAntiAffinityTerms(existingPod.RequiredAntiAffinityTerms, pod, nsLabels, node, 1)
}
if len(topoMap) != 0 {
topoMaps[atomic.AddInt32(&index, 1)] = topoMap
}
}
pl.parallelizer.Until(ctx, len(nodes), processNode, pl.Name())
result := make(topologyToMatchedTermCount)
for i := 0; i <= int(index); i++ {
result.append(topoMaps[i])
}
return result
}
// finds existing Pods that match the incoming pod's (anti)affinity terms.
// It returns the topologyToMatchedTermCounts that are checked later by the affinity
// predicate. With these counts available, the affinity predicate does not
// need to check all the pods in the cluster.
func (pl *InterPodAffinity) getIncomingAffinityAntiAffinityCounts(ctx context.Context, podInfo *framework.PodInfo, allNodes []*framework.NodeInfo) (topologyToMatchedTermCount, topologyToMatchedTermCount) {
affinityCounts := make(topologyToMatchedTermCount)
antiAffinityCounts := make(topologyToMatchedTermCount)
if len(podInfo.RequiredAffinityTerms) == 0 && len(podInfo.RequiredAntiAffinityTerms) == 0 {
return affinityCounts, antiAffinityCounts
}
affinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
antiAffinityCountsList := make([]topologyToMatchedTermCount, len(allNodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
affinity := make(topologyToMatchedTermCount)
antiAffinity := make(topologyToMatchedTermCount)
for _, existingPod := range nodeInfo.Pods {
affinity.updateWithAffinityTerms(podInfo.RequiredAffinityTerms, existingPod.Pod, node, 1)
// The incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't lookup the existing pod's namespace labels, hence passing nil for nsLabels.
antiAffinity.updateWithAntiAffinityTerms(podInfo.RequiredAntiAffinityTerms, existingPod.Pod, nil, node, 1)
}
if len(affinity) > 0 || len(antiAffinity) > 0 {
k := atomic.AddInt32(&index, 1)
affinityCountsList[k] = affinity
antiAffinityCountsList[k] = antiAffinity
}
}
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for i := 0; i <= int(index); i++ {
affinityCounts.append(affinityCountsList[i])
antiAffinityCounts.append(antiAffinityCountsList[i])
}
return affinityCounts, antiAffinityCounts
}
// PreFilter invoked at the prefilter extension point.
func (pl *InterPodAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
var allNodes []*framework.NodeInfo
var nodesWithRequiredAntiAffinityPods []*framework.NodeInfo
var err error
if allNodes, err = pl.sharedLister.NodeInfos().List(); err != nil {
return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos: %w", err))
}
if nodesWithRequiredAntiAffinityPods, err = pl.sharedLister.NodeInfos().HavePodsWithRequiredAntiAffinityList(); err != nil {
return nil, framework.AsStatus(fmt.Errorf("failed to list NodeInfos with pods with affinity: %w", err))
}
s := &preFilterState{}
if s.podInfo, err = framework.NewPodInfo(pod); err != nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("parsing pod: %+v", err))
}
for i := range s.podInfo.RequiredAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAffinityTerms[i]); err != nil {
return nil, framework.AsStatus(err)
}
}
for i := range s.podInfo.RequiredAntiAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&s.podInfo.RequiredAntiAffinityTerms[i]); err != nil {
return nil, framework.AsStatus(err)
}
}
logger := klog.FromContext(ctx)
s.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)
s.existingAntiAffinityCounts = pl.getExistingAntiAffinityCounts(ctx, pod, s.namespaceLabels, nodesWithRequiredAntiAffinityPods)
s.affinityCounts, s.antiAffinityCounts = pl.getIncomingAffinityAntiAffinityCounts(ctx, s.podInfo, allNodes)
if len(s.existingAntiAffinityCounts) == 0 && len(s.podInfo.RequiredAffinityTerms) == 0 && len(s.podInfo.RequiredAntiAffinityTerms) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *InterPodAffinity) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// AddPod from pre-computed data in cycleState.
func (pl *InterPodAffinity) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToAdd, nodeInfo.Node(), 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *InterPodAffinity) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToRemove, nodeInfo.Node(), -1)
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to interpodaffinity.state error", c)
}
return s, nil
}
// Checks if scheduling the pod onto this node would break any anti-affinity
// terms indicated by the existing pods.
func satisfyExistingPodsAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
if len(state.existingAntiAffinityCounts) > 0 {
// Iterate over the node's topology pairs to check whether placing the pod here would
// violate any existing pod's anti-affinity terms.
for topologyKey, topologyValue := range nodeInfo.Node().Labels {
tp := topologyPair{key: topologyKey, value: topologyValue}
if state.existingAntiAffinityCounts[tp] > 0 {
return false
}
}
}
return true
}
// Checks if the node satisfies the incoming pod's anti-affinity rules.
func satisfyPodAntiAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
if len(state.antiAffinityCounts) > 0 {
for _, term := range state.podInfo.RequiredAntiAffinityTerms {
if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
tp := topologyPair{key: term.TopologyKey, value: topologyValue}
if state.antiAffinityCounts[tp] > 0 {
return false
}
}
}
}
return true
}
// Checks if the node satisfies the incoming pod's affinity rules.
func satisfyPodAffinity(state *preFilterState, nodeInfo *framework.NodeInfo) bool {
podsExist := true
for _, term := range state.podInfo.RequiredAffinityTerms {
if topologyValue, ok := nodeInfo.Node().Labels[term.TopologyKey]; ok {
tp := topologyPair{key: term.TopologyKey, value: topologyValue}
if state.affinityCounts[tp] <= 0 {
podsExist = false
}
} else {
// All topology labels must exist on the node.
return false
}
}
if !podsExist {
// This pod may be the first in a series of pods that have affinity to themselves. In order
// not to leave such pods in pending state forever, we check that if no other pod
// in the cluster matches the namespace and selector of this pod, the pod matches
// its own terms, and the node has all the requested topologies, then we allow the pod
// to pass the affinity check.
if len(state.affinityCounts) == 0 && podMatchesAllAffinityTerms(state.podInfo.RequiredAffinityTerms, state.podInfo.Pod) {
return true
}
return false
}
return true
}
// Filter invoked at the filter extension point.
// It checks if a pod can be scheduled on the specified node with pod affinity/anti-affinity configuration.
func (pl *InterPodAffinity) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if !satisfyPodAffinity(state, nodeInfo) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonAffinityRulesNotMatch)
}
if !satisfyPodAntiAffinity(state, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonAntiAffinityRulesNotMatch)
}
if !satisfyExistingPodsAntiAffinity(state, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonExistingAntiAffinityRulesNotMatch)
}
return nil
}
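A hedged sketch (the zone values are hypothetical) of how the topology-pair counters built in PreFilter gate the Filter checks above: a node whose zone already counts one matching anti-affinity term is rejected, another zone is not.

package main

import "fmt"

type topologyPair struct{ key, value string }

func main() {
    // Counters as PreFilter would build them: one existing pod in zone "a"
    // matches the incoming pod's required anti-affinity term on the zone key.
    antiAffinityCounts := map[topologyPair]int64{
        {key: "topology.kubernetes.io/zone", value: "a"}: 1,
    }

    // Filter-side check for two candidate nodes.
    for _, nodeZone := range []string{"a", "b"} {
        tp := topologyPair{key: "topology.kubernetes.io/zone", value: nodeZone}
        fmt.Printf("zone=%s fits=%v\n", nodeZone, antiAffinityCounts[tp] == 0)
        // prints: zone=a fits=false, zone=b fits=true
    }
}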

View File

@ -0,0 +1,247 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
listersv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.InterPodAffinity
var _ framework.PreFilterPlugin = &InterPodAffinity{}
var _ framework.FilterPlugin = &InterPodAffinity{}
var _ framework.PreScorePlugin = &InterPodAffinity{}
var _ framework.ScorePlugin = &InterPodAffinity{}
var _ framework.EnqueueExtensions = &InterPodAffinity{}
// InterPodAffinity is a plugin that checks inter pod affinity
type InterPodAffinity struct {
parallelizer parallelize.Parallelizer
args config.InterPodAffinityArgs
sharedLister framework.SharedLister
nsLister listersv1.NamespaceLister
enableSchedulingQueueHint bool
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *InterPodAffinity) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a failed Pod
// schedulable.
func (pl *InterPodAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeTaint event.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
// The registered Pod ActionType includes the following events:
// - Delete. An unschedulable Pod may fail due to violating an existing Pod's anti-affinity constraints;
// deleting an existing Pod may make it schedulable.
// - UpdatePodLabel. Updating an existing Pod's labels (e.g., removal) may make
// an unschedulable Pod schedulable.
// - Add. An unschedulable Pod may fail due to violating pod-affinity constraints;
// adding an assigned Pod may make it schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Add | framework.UpdatePodLabel | framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodChange},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
if h.SnapshotSharedLister() == nil {
return nil, fmt.Errorf("SnapshotSharedlister is nil")
}
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
if err := validation.ValidateInterPodAffinityArgs(nil, &args); err != nil {
return nil, err
}
pl := &InterPodAffinity{
parallelizer: h.Parallelizer(),
args: args,
sharedLister: h.SnapshotSharedLister(),
nsLister: h.SharedInformerFactory().Core().V1().Namespaces().Lister(),
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.InterPodAffinityArgs, error) {
ptr, ok := obj.(*config.InterPodAffinityArgs)
if !ok {
return config.InterPodAffinityArgs{}, fmt.Errorf("want args to be of type InterPodAffinityArgs, got %T", obj)
}
return *ptr, nil
}
// Updates Namespaces with the set of namespaces identified by NamespaceSelector.
// If successful, NamespaceSelector is set to labels.Nothing().
// The assumption is that the term is for an incoming pod, in which case
// namespaceSelector is either unrolled into Namespaces (and so the selector
// is set to Nothing()) or is Empty(), which means match everything. Therefore,
// when matching against this term, there is no need to look up the existing
// pod's namespace labels to match them against the term's namespaceSelector explicitly.
func (pl *InterPodAffinity) mergeAffinityTermNamespacesIfNotEmpty(at *framework.AffinityTerm) error {
if at.NamespaceSelector.Empty() {
return nil
}
ns, err := pl.nsLister.List(at.NamespaceSelector)
if err != nil {
return err
}
for _, n := range ns {
at.Namespaces.Insert(n.Name)
}
at.NamespaceSelector = labels.Nothing()
return nil
}
// GetNamespaceLabelsSnapshot returns a snapshot of the labels associated with
// the namespace.
func GetNamespaceLabelsSnapshot(logger klog.Logger, ns string, nsLister listersv1.NamespaceLister) (nsLabels labels.Set) {
podNS, err := nsLister.Get(ns)
if err == nil {
// Create and return snapshot of the labels.
return labels.Merge(podNS.Labels, nil)
}
logger.V(3).Info("getting namespace, assuming empty set of namespace labels", "namespace", ns, "err", err)
return
}
func (pl *InterPodAffinity) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if (modifiedPod != nil && modifiedPod.Spec.NodeName == "") || (originalPod != nil && originalPod.Spec.NodeName == "") {
logger.V(5).Info("the added/updated/deleted pod is unscheduled, so it doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
// Pod is updated. Return Queue when the updated pod matches the target pod's affinity, or no longer matches its anti-affinity.
// Note that we don't need to check each affinity term individually when the Pod has more than one,
// because the current PodAffinity looks for a **single** existing pod that can satisfy **all** the inter-pod affinity terms of an incoming pod.
if modifiedPod != nil && originalPod != nil {
if !podMatchesAllAffinityTerms(terms, originalPod) && podMatchesAllAffinityTerms(terms, modifiedPod) {
logger.V(5).Info("a scheduled pod was updated to match the target pod's affinity, and the pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
if podMatchesAllAffinityTerms(antiTerms, originalPod) && !podMatchesAllAffinityTerms(antiTerms, modifiedPod) {
logger.V(5).Info("a scheduled pod was updated not to match the target pod's anti affinity, and the pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was updated but it doesn't match the target pod's affinity or does match the target pod's anti-affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is added. Return Queue when the added pod matches the target pod's affinity.
if modifiedPod != nil {
if podMatchesAllAffinityTerms(terms, modifiedPod) {
logger.V(5).Info("a scheduled pod was added and it matches the target pod's affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was added and it doesn't match the target pod's affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is deleted. Return Queue when the deleted pod matched the target pod's anti-affinity.
if !podMatchesAllAffinityTerms(antiTerms, originalPod) {
logger.V(5).Info("a scheduled pod was deleted but it doesn't match the target pod's anti-affinity",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
logger.V(5).Info("a scheduled pod was deleted and it matches the target pod's anti-affinity. The pod may be schedulable now",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
func (pl *InterPodAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
terms, err := framework.GetAffinityTerms(pod, framework.GetPodAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
for _, term := range terms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod affinity topologyKey was added/updated and it may make pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
}
}
antiTerms, err := framework.GetAffinityTerms(pod, framework.GetPodAntiAffinityTerms(pod.Spec.Affinity))
if err != nil {
return framework.Queue, err
}
for _, term := range antiTerms {
if _, ok := modifiedNode.Labels[term.TopologyKey]; ok {
logger.V(5).Info("a node with matched pod anti-affinity topologyKey was added/updated and it may make pod schedulable",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, err
}
}
logger.V(5).Info("a node is added/updated but doesn't have any topologyKey which matches pod affinity/anti-affinity",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
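A small sketch (the label values are made up) of why GetNamespaceLabelsSnapshot above uses labels.Merge(podNS.Labels, nil): the merge returns a copy, so later changes to the namespace object do not leak into the cached snapshot.

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/labels"
)

func main() {
    // Hypothetical namespace labels as returned by the lister.
    nsLabels := labels.Set{"team": "storage"}

    // labels.Merge copies both operands into a fresh Set; merging with nil
    // is simply a copy.
    snapshot := labels.Merge(nsLabels, nil)

    // Mutating the original afterwards does not affect the snapshot.
    nsLabels["team"] = "compute"
    fmt.Println(snapshot["team"]) // storage
}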

View File

@ -0,0 +1,302 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package interpodaffinity
import (
"context"
"fmt"
"math"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// preScoreStateKey is the key in CycleState to InterPodAffinity pre-computed data for Scoring.
const preScoreStateKey = "PreScore" + Name
type scoreMap map[string]map[string]int64
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
topologyScore scoreMap
podInfo *framework.PodInfo
// A copy of the incoming pod's namespace labels.
namespaceLabels labels.Set
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
func (m scoreMap) processTerm(term *framework.AffinityTerm, weight int32, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
if term.Matches(pod, nsLabels) {
if tpValue, tpValueExist := node.Labels[term.TopologyKey]; tpValueExist {
if m[term.TopologyKey] == nil {
m[term.TopologyKey] = make(map[string]int64)
}
m[term.TopologyKey][tpValue] += int64(weight * multiplier)
}
}
}
func (m scoreMap) processTerms(terms []framework.WeightedAffinityTerm, pod *v1.Pod, nsLabels labels.Set, node *v1.Node, multiplier int32) {
for _, term := range terms {
m.processTerm(&term.AffinityTerm, term.Weight, pod, nsLabels, node, multiplier)
}
}
func (m scoreMap) append(other scoreMap) {
for topology, oScores := range other {
scores := m[topology]
if scores == nil {
m[topology] = oScores
continue
}
for k, v := range oScores {
scores[k] += v
}
}
}
func (pl *InterPodAffinity) processExistingPod(
state *preScoreState,
existingPod *framework.PodInfo,
existingPodNodeInfo *framework.NodeInfo,
incomingPod *v1.Pod,
topoScore scoreMap,
) {
existingPodNode := existingPodNodeInfo.Node()
if len(existingPodNode.Labels) == 0 {
return
}
// For every soft pod affinity term of <pod>, if <existingPod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
// Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't look up the existing pod's namespace labels, hence passing nil for nsLabels.
topoScore.processTerms(state.podInfo.PreferredAffinityTerms, existingPod.Pod, nil, existingPodNode, 1)
// For every soft pod anti-affinity term of <pod>, if <existingPod> matches the term,
// decrement <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
// Note that the incoming pod's terms have the namespaceSelector merged into the namespaces, and so
// here we don't look up the existing pod's namespace labels, hence passing nil for nsLabels.
topoScore.processTerms(state.podInfo.PreferredAntiAffinityTerms, existingPod.Pod, nil, existingPodNode, -1)
// For every hard pod affinity term of <existingPod>, if <pod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the constant <args.hardPodAffinityWeight>
if pl.args.HardPodAffinityWeight > 0 && len(existingPodNode.Labels) != 0 {
for _, t := range existingPod.RequiredAffinityTerms {
topoScore.processTerm(&t, pl.args.HardPodAffinityWeight, incomingPod, state.namespaceLabels, existingPodNode, 1)
}
}
// For every soft pod affinity term of <existingPod>, if <pod> matches the term,
// increment <p.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
topoScore.processTerms(existingPod.PreferredAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, 1)
// For every soft pod anti-affinity term of <existingPod>, if <pod> matches the term,
// decrement <pm.counts> for every node in the cluster with the same <term.TopologyKey>
// value as that of <existingPod>'s node by the term's weight.
topoScore.processTerms(existingPod.PreferredAntiAffinityTerms, incomingPod, state.namespaceLabels, existingPodNode, -1)
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *InterPodAffinity) PreScore(
pCtx context.Context,
cycleState *framework.CycleState,
pod *v1.Pod,
nodes []*framework.NodeInfo,
) *framework.Status {
if len(nodes) == 0 {
// No nodes to score.
return framework.NewStatus(framework.Skip)
}
if pl.sharedLister == nil {
return framework.NewStatus(framework.Error, "empty shared lister in InterPodAffinity PreScore")
}
affinity := pod.Spec.Affinity
hasPreferredAffinityConstraints := affinity != nil && affinity.PodAffinity != nil && len(affinity.PodAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
hasPreferredAntiAffinityConstraints := affinity != nil && affinity.PodAntiAffinity != nil && len(affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) > 0
hasConstraints := hasPreferredAffinityConstraints || hasPreferredAntiAffinityConstraints
// Optionally ignore calculating preferences of existing pods' affinity rules
// if the incoming pod has no inter-pod affinities.
if pl.args.IgnorePreferredTermsOfExistingPods && !hasConstraints {
return framework.NewStatus(framework.Skip)
}
// Unless the pod being scheduled has preferred affinity terms, we only
// need to process nodes hosting pods with affinity.
var allNodes []*framework.NodeInfo
var err error
if hasConstraints {
allNodes, err = pl.sharedLister.NodeInfos().List()
if err != nil {
return framework.AsStatus(fmt.Errorf("failed to get all nodes from shared lister: %w", err))
}
} else {
allNodes, err = pl.sharedLister.NodeInfos().HavePodsWithAffinityList()
if err != nil {
return framework.AsStatus(fmt.Errorf("failed to get pods with affinity list: %w", err))
}
}
state := &preScoreState{
topologyScore: make(map[string]map[string]int64),
}
if state.podInfo, err = framework.NewPodInfo(pod); err != nil {
// Ideally we never reach here, because errors will be caught by PreFilter
return framework.AsStatus(fmt.Errorf("failed to parse pod: %w", err))
}
for i := range state.podInfo.PreferredAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAffinityTerms[i].AffinityTerm); err != nil {
return framework.AsStatus(fmt.Errorf("updating PreferredAffinityTerms: %w", err))
}
}
for i := range state.podInfo.PreferredAntiAffinityTerms {
if err := pl.mergeAffinityTermNamespacesIfNotEmpty(&state.podInfo.PreferredAntiAffinityTerms[i].AffinityTerm); err != nil {
return framework.AsStatus(fmt.Errorf("updating PreferredAntiAffinityTerms: %w", err))
}
}
logger := klog.FromContext(pCtx)
state.namespaceLabels = GetNamespaceLabelsSnapshot(logger, pod.Namespace, pl.nsLister)
topoScores := make([]scoreMap, len(allNodes))
index := int32(-1)
processNode := func(i int) {
nodeInfo := allNodes[i]
// Unless the pod being scheduled has preferred affinity terms, we only
// need to process pods with affinity in the node.
podsToProcess := nodeInfo.PodsWithAffinity
if hasConstraints {
// We need to process all the pods.
podsToProcess = nodeInfo.Pods
}
topoScore := make(scoreMap)
for _, existingPod := range podsToProcess {
pl.processExistingPod(state, existingPod, nodeInfo, pod, topoScore)
}
if len(topoScore) > 0 {
topoScores[atomic.AddInt32(&index, 1)] = topoScore
}
}
pl.parallelizer.Until(pCtx, len(allNodes), processNode, pl.Name())
if index == -1 {
return framework.NewStatus(framework.Skip)
}
for i := 0; i <= int(index); i++ {
state.topologyScore.append(topoScores[i])
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to interpodaffinity.preScoreState error", c)
}
return s, nil
}
// Score invoked at the Score extension point.
// The "score" returned in this function is the sum of weights got from cycleState which have its topologyKey matching with the node's labels.
// it is normalized later.
// Note: the returned "score" is positive for pod-affinity, and negative for pod-antiaffinity.
func (pl *InterPodAffinity) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("failed to get node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)
if err != nil {
return 0, framework.AsStatus(err)
}
var score int64
for tpKey, tpValues := range s.topologyScore {
if v, exist := node.Labels[tpKey]; exist {
score += tpValues[v]
}
}
return score, nil
}
// NormalizeScore normalizes the score for each filteredNode.
func (pl *InterPodAffinity) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
s, err := getPreScoreState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if len(s.topologyScore) == 0 {
return nil
}
var minCount int64 = math.MaxInt64
var maxCount int64 = math.MinInt64
for i := range scores {
score := scores[i].Score
if score > maxCount {
maxCount = score
}
if score < minCount {
minCount = score
}
}
maxMinDiff := maxCount - minCount
for i := range scores {
fScore := float64(0)
if maxMinDiff > 0 {
fScore = float64(framework.MaxNodeScore) * (float64(scores[i].Score-minCount) / float64(maxMinDiff))
}
scores[i].Score = int64(fScore)
}
return nil
}
// ScoreExtensions of the Score plugin.
func (pl *InterPodAffinity) ScoreExtensions() framework.ScoreExtensions {
return pl
}
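A standalone sketch of the min-max normalization in NormalizeScore above, using made-up raw scores for three nodes (negative values can come from preferred anti-affinity matches); it prints 0, 25 and 100:

package main

import "fmt"

func main() {
    // Hypothetical raw Score outputs for three nodes.
    raw := []int64{-10, 0, 30}
    const maxNodeScore = 100 // stands in for framework.MaxNodeScore

    minCount, maxCount := raw[0], raw[0]
    for _, s := range raw {
        if s < minCount {
            minCount = s
        }
        if s > maxCount {
            maxCount = s
        }
    }
    diff := maxCount - minCount
    for i, s := range raw {
        normalized := int64(0)
        if diff > 0 {
            normalized = int64(float64(maxNodeScore) * float64(s-minCount) / float64(diff))
        }
        fmt.Printf("node%d: %d\n", i, normalized)
    }
}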

View File

@ -0,0 +1,39 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package names
const (
PrioritySort = "PrioritySort"
DefaultBinder = "DefaultBinder"
DefaultPreemption = "DefaultPreemption"
DynamicResources = "DynamicResources"
ImageLocality = "ImageLocality"
InterPodAffinity = "InterPodAffinity"
NodeAffinity = "NodeAffinity"
NodeName = "NodeName"
NodePorts = "NodePorts"
NodeResourcesBalancedAllocation = "NodeResourcesBalancedAllocation"
NodeResourcesFit = "NodeResourcesFit"
NodeUnschedulable = "NodeUnschedulable"
NodeVolumeLimits = "NodeVolumeLimits"
PodTopologySpread = "PodTopologySpread"
SchedulingGates = "SchedulingGates"
TaintToleration = "TaintToleration"
VolumeBinding = "VolumeBinding"
VolumeRestrictions = "VolumeRestrictions"
VolumeZone = "VolumeZone"
)

View File

@ -0,0 +1,372 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeaffinity
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodeAffinity is a plugin that checks if a pod node selector matches the node label.
type NodeAffinity struct {
handle framework.Handle
addedNodeSelector *nodeaffinity.NodeSelector
addedPrefSchedTerms *nodeaffinity.PreferredSchedulingTerms
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &NodeAffinity{}
var _ framework.FilterPlugin = &NodeAffinity{}
var _ framework.PreScorePlugin = &NodeAffinity{}
var _ framework.ScorePlugin = &NodeAffinity{}
var _ framework.EnqueueExtensions = &NodeAffinity{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeAffinity
// preScoreStateKey is the key in CycleState to NodeAffinity pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
// preFilterStateKey is the key in CycleState to NodeAffinity pre-compute data for Filtering.
preFilterStateKey = "PreFilter" + Name
// ErrReasonPod is the reason for Pod's node affinity/selector not matching.
ErrReasonPod = "node(s) didn't match Pod's node affinity/selector"
// errReasonEnforced is the reason for added node affinity not matching.
errReasonEnforced = "node(s) didn't match scheduler-enforced node affinity"
// errReasonConflict is the reason for pod's conflicting affinity rules.
errReasonConflict = "pod affinity terms conflict"
)
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeAffinity) Name() string {
return Name
}
type preFilterState struct {
requiredNodeSelectorAndAffinity nodeaffinity.RequiredNodeAffinity
}
// Clone just returns the same state because it is not affected by pod additions or deletions.
func (s *preFilterState) Clone() framework.StateData {
return s
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeAffinity) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence we can use UpdateNodeLabel instead of Update.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// isSchedulableAfterNodeChange is invoked whenever a node changed. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *NodeAffinity) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(modifiedNode) {
logger.V(4).Info("added or modified node didn't match scheduler-enforced node affinity and this event won't make the Pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
isMatched, err := requiredNodeAffinity.Match(modifiedNode)
if err != nil {
return framework.Queue, err
}
if !isMatched {
logger.V(5).Info("node was created or updated, but the pod's NodeAffinity doesn't match", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Since the node was added and it matches the pod's affinity criteria, we can unblock it.
if originalNode == nil {
logger.V(5).Info("node was created, and matches with the pod's NodeAffinity", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// At this point we know the operation is an update, so we can narrow down the criteria to unmatch -> match changes only
// (i.e., a necessary affinity label was added to the node).
wasMatched, err := requiredNodeAffinity.Match(originalNode)
if err != nil {
return framework.Queue, err
}
if wasMatched {
logger.V(5).Info("node updated, but the pod's NodeAffinity hasn't changed", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was updated and the pod's NodeAffinity changed to matched", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// PreFilter builds and writes cycle state used by Filter.
func (pl *NodeAffinity) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
affinity := pod.Spec.Affinity
noNodeAffinity := (affinity == nil ||
affinity.NodeAffinity == nil ||
affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil)
if noNodeAffinity && pl.addedNodeSelector == nil && pod.Spec.NodeSelector == nil {
// NodeAffinity Filter has nothing to do with the Pod.
return nil, framework.NewStatus(framework.Skip)
}
state := &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
cycleState.Write(preFilterStateKey, state)
if noNodeAffinity || len(affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms) == 0 {
return nil, nil
}
// Check if there is affinity to a specific node and return it.
terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
var nodeNames sets.Set[string]
for _, t := range terms {
var termNodeNames sets.Set[string]
for _, r := range t.MatchFields {
if r.Key == metav1.ObjectNameField && r.Operator == v1.NodeSelectorOpIn {
// The requirements represent ANDed constraints, and so we need to
// find the intersection of nodes.
s := sets.New(r.Values...)
if termNodeNames == nil {
termNodeNames = s
} else {
termNodeNames = termNodeNames.Intersection(s)
}
}
}
if termNodeNames == nil {
// If this term has no node.Name field affinity,
// then all nodes are eligible because the terms are ORed.
return nil, nil
}
nodeNames = nodeNames.Union(termNodeNames)
}
// If nodeNames is not nil but its length is 0, it means each term has conflicting affinity to node.Name;
// therefore, the pod will not match any node.
if nodeNames != nil && len(nodeNames) == 0 {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonConflict)
} else if len(nodeNames) > 0 {
return &framework.PreFilterResult{NodeNames: nodeNames}, nil
}
return nil, nil
}
// PreFilterExtensions not necessary for this plugin as state doesn't depend on pod additions or deletions.
func (pl *NodeAffinity) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter checks if the Node matches the Pod .spec.affinity.nodeAffinity and
// the plugin's added affinity.
func (pl *NodeAffinity) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
if pl.addedNodeSelector != nil && !pl.addedNodeSelector.Match(node) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReasonEnforced)
}
s, err := getPreFilterState(state)
if err != nil {
// Fallback to calculate requiredNodeSelector and requiredNodeAffinity
// here when PreFilter is disabled.
s = &preFilterState{requiredNodeSelectorAndAffinity: nodeaffinity.GetRequiredNodeAffinity(pod)}
}
// Ignore parsing errors for backwards compatibility.
match, _ := s.requiredNodeSelectorAndAffinity.Match(node)
if !match {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonPod)
}
return nil
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
preferredNodeAffinity *nodeaffinity.PreferredSchedulingTerms
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *NodeAffinity) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
if err != nil {
return framework.AsStatus(err)
}
if preferredNodeAffinity == nil && pl.addedPrefSchedTerms == nil {
// NodeAffinity Score has nothing to do with the Pod.
return framework.NewStatus(framework.Skip)
}
state := &preScoreState{
preferredNodeAffinity: preferredNodeAffinity,
}
cycleState.Write(preScoreStateKey, state)
return nil
}
// Score returns the sum of the weights of the terms that match the Node.
// Terms came from the Pod .spec.affinity.nodeAffinity and from the plugin's
// default affinity.
func (pl *NodeAffinity) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
var count int64
if pl.addedPrefSchedTerms != nil {
count += pl.addedPrefSchedTerms.Score(node)
}
s, err := getPreScoreState(state)
if err != nil {
// Fallback to calculate preferredNodeAffinity here when PreScore is disabled.
preferredNodeAffinity, err := getPodPreferredNodeAffinity(pod)
if err != nil {
return 0, framework.AsStatus(err)
}
s = &preScoreState{
preferredNodeAffinity: preferredNodeAffinity,
}
}
if s.preferredNodeAffinity != nil {
count += s.preferredNodeAffinity.Score(node)
}
return count, nil
}
// NormalizeScore invoked after scoring all nodes.
func (pl *NodeAffinity) NormalizeScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
return helper.DefaultNormalizeScore(framework.MaxNodeScore, false, scores)
}
// ScoreExtensions of the Score plugin.
func (pl *NodeAffinity) ScoreExtensions() framework.ScoreExtensions {
return pl
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
pl := &NodeAffinity{
handle: h,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
if args.AddedAffinity != nil {
if ns := args.AddedAffinity.RequiredDuringSchedulingIgnoredDuringExecution; ns != nil {
pl.addedNodeSelector, err = nodeaffinity.NewNodeSelector(ns)
if err != nil {
return nil, fmt.Errorf("parsing addedAffinity.requiredDuringSchedulingIgnoredDuringExecution: %w", err)
}
}
// TODO: parse requiredDuringSchedulingRequiredDuringExecution when it gets added to the API.
if terms := args.AddedAffinity.PreferredDuringSchedulingIgnoredDuringExecution; len(terms) != 0 {
pl.addedPrefSchedTerms, err = nodeaffinity.NewPreferredSchedulingTerms(terms)
if err != nil {
return nil, fmt.Errorf("parsing addedAffinity.preferredDuringSchedulingIgnoredDuringExecution: %w", err)
}
}
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.NodeAffinityArgs, error) {
ptr, ok := obj.(*config.NodeAffinityArgs)
if !ok {
return config.NodeAffinityArgs{}, fmt.Errorf("args are not of type NodeAffinityArgs, got %T", obj)
}
return *ptr, validation.ValidateNodeAffinityArgs(nil, ptr)
}
func getPodPreferredNodeAffinity(pod *v1.Pod) (*nodeaffinity.PreferredSchedulingTerms, error) {
affinity := pod.Spec.Affinity
if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution != nil {
return nodeaffinity.NewPreferredSchedulingTerms(affinity.NodeAffinity.PreferredDuringSchedulingIgnoredDuringExecution)
}
return nil, nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %v", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("invalid PreFilter state, got type %T", c)
}
return s, nil
}
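A hedged sketch (the node names are hypothetical) of the set algebra that PreFilter above applies to metadata.name match fields: fields within one term are ANDed (intersection), while terms are ORed (union).

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/util/sets"
)

func main() {
    // One term with two ANDed matchFields on metadata.name...
    term1Field1 := sets.New("node-a", "node-b")
    term1Field2 := sets.New("node-b", "node-c")
    // ...and a second, ORed term naming another node.
    term2 := sets.New("node-d")

    // ANDed requirements within a term: intersection.
    term1 := term1Field1.Intersection(term1Field2)
    // ORed terms: union.
    candidates := term1.Union(term2)
    fmt.Println(sets.List(candidates)) // [node-b node-d]
}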

View File

@ -0,0 +1,89 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodename
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// NodeName is a plugin that checks if a pod spec node name matches the current node.
type NodeName struct {
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &NodeName{}
var _ framework.EnqueueExtensions = &NodeName{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeName
// ErrReason returned when node name doesn't match.
ErrReason = "node(s) didn't match the requested node name"
)
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeName) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because any Node update event will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// We don't need the QueueingHintFn here because the scheduling of Pods will always be retried with backoff when this Event happens.
// (the same as Queue)
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
}, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeName) Name() string {
return Name
}
// Filter invoked at the filter extension point.
func (pl *NodeName) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !Fits(pod, nodeInfo) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReason)
}
return nil
}
// Fits actually checks if the pod fits the node.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
return len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == nodeInfo.Node().Name
}
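// exampleFitsNodeName is an illustrative sketch, not part of the upstream plugin; the
// node and pod names below are hypothetical. A pod pinned to "node-a" via spec.nodeName
// fits only the node with that exact name.
func exampleFitsNodeName() bool {
	node := &v1.Node{}
	node.Name = "node-a"
	nodeInfo := framework.NewNodeInfo()
	nodeInfo.SetNode(node)
	pod := &v1.Pod{Spec: v1.PodSpec{NodeName: "node-a"}}
	return Fits(pod, nodeInfo) // true; a pod with an empty spec.nodeName would also fit
}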
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodeName{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,215 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeports
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodePorts is a plugin that checks if a node has free ports for the requested pod ports.
type NodePorts struct {
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &NodePorts{}
var _ framework.FilterPlugin = &NodePorts{}
var _ framework.EnqueueExtensions = &NodePorts{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodePorts
// preFilterStateKey is the key in CycleState to NodePorts pre-computed data.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReason when node ports aren't available.
ErrReason = "node(s) didn't have free ports for the requested pod ports"
)
type preFilterState []*v1.ContainerPort
// Clone the prefilter state.
func (s preFilterState) Clone() framework.StateData {
// The state is not impacted by adding/removing existing pods, hence we don't need to make a deep copy.
return s
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodePorts) Name() string {
return Name
}
// getContainerPorts returns the host ports used by the given Pods; only ports with a
// HostPort set are included. It does not resolve port conflicts.
func getContainerPorts(pods ...*v1.Pod) []*v1.ContainerPort {
ports := []*v1.ContainerPort{}
for _, pod := range pods {
for j := range pod.Spec.Containers {
container := &pod.Spec.Containers[j]
for k := range container.Ports {
// Only return ports with a host port specified.
if container.Ports[k].HostPort <= 0 {
continue
}
ports = append(ports, &container.Ports[k])
}
}
}
return ports
}
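// exampleGetContainerPorts is an illustrative sketch, not part of the upstream plugin;
// the pod below is hypothetical. Only ports with HostPort set are collected, so the
// second container port is skipped.
func exampleGetContainerPorts() []*v1.ContainerPort {
	pod := &v1.Pod{Spec: v1.PodSpec{Containers: []v1.Container{{
		Ports: []v1.ContainerPort{
			{ContainerPort: 8080, HostPort: 8080, Protocol: v1.ProtocolTCP}, // kept: HostPort > 0
			{ContainerPort: 9090},                                           // skipped: no HostPort
		},
	}}}}
	return getContainerPorts(pod) // a single entry for host port 8080/TCP
}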
// PreFilter invoked at the prefilter extension point.
func (pl *NodePorts) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
s := getContainerPorts(pod)
// Skip if a pod has no ports.
if len(s) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, preFilterState(s))
return nil, nil
}
// PreFilterExtensions do not exist for this plugin.
func (pl *NodePorts) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to nodeports.preFilterState error", c)
}
return s, nil
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodePorts) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because NodeUpdated event never means to have any free ports for the Pod.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// Due to immutable fields `spec.containers[*].ports`, pod update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
// We don't need the QueueingHintFn here because the scheduling of Pods will always be retried with backoff when this Event happens.
// (the same as Queue)
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
}, nil
}
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *NodePorts) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, nil)
if err != nil {
return framework.Queue, err
}
// If the deleted pod is unscheduled, it doesn't make the target pod schedulable.
if deletedPod.Spec.NodeName == "" {
logger.V(4).Info("the deleted pod is unscheduled and it doesn't make the target pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
// Get the used host ports of the deleted pod.
usedPorts := make(framework.HostPortInfo)
for _, container := range deletedPod.Spec.Containers {
for _, podPort := range container.Ports {
if podPort.HostPort > 0 {
usedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
}
}
}
// If the deleted pod doesn't use any host ports, it doesn't make the target pod schedulable.
if len(usedPorts) == 0 {
return framework.QueueSkip, nil
}
// Construct a fake NodeInfo that only has the deleted Pod.
// If we can schedule `pod` to this fake node, it means that `pod` and the deleted pod don't have any common port(s).
// So, deleting that pod couldn't make `pod` schedulable.
nodeInfo := framework.NodeInfo{UsedPorts: usedPorts}
if Fits(pod, &nodeInfo) {
logger.V(4).Info("the deleted pod and the target pod don't have any common port(s), returning QueueSkip as deleting this Pod won't make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
logger.V(4).Info("the deleted pod and the target pod have any common port(s), returning Queue as deleting this Pod may make the Pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
// Filter invoked at the filter extension point.
func (pl *NodePorts) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
wantPorts, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
fits := fitsPorts(wantPorts, nodeInfo)
if !fits {
return framework.NewStatus(framework.Unschedulable, ErrReason)
}
return nil
}
// Fits checks if the pod fits the node.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
return fitsPorts(getContainerPorts(pod), nodeInfo)
}
func fitsPorts(wantPorts []*v1.ContainerPort, nodeInfo *framework.NodeInfo) bool {
// try to see whether existingPorts and wantPorts will conflict or not
existingPorts := nodeInfo.UsedPorts
for _, cp := range wantPorts {
if existingPorts.CheckConflict(cp.HostIP, string(cp.Protocol), cp.HostPort) {
return false
}
}
return true
}
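// exampleHostPortConflict is an illustrative sketch, not part of the upstream plugin;
// the port values are hypothetical. A node that already uses host port 8080/TCP
// conflicts with a pod requesting the same host port.
func exampleHostPortConflict() bool {
	used := make(framework.HostPortInfo)
	used.Add("", string(v1.ProtocolTCP), 8080)
	nodeInfo := framework.NodeInfo{UsedPorts: used}
	want := []*v1.ContainerPort{{HostPort: 8080, Protocol: v1.ProtocolTCP}}
	return fitsPorts(want, &nodeInfo) // false: the requested host port is already in use
}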
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodePorts{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}

View File

@ -0,0 +1,173 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// BalancedAllocation is a score plugin that calculates the difference between the cpu and memory fraction
// of capacity, and prioritizes the host based on how close the two metrics are to each other.
type BalancedAllocation struct {
handle framework.Handle
resourceAllocationScorer
}
var _ framework.PreScorePlugin = &BalancedAllocation{}
var _ framework.ScorePlugin = &BalancedAllocation{}
// BalancedAllocationName is the name of the plugin used in the plugin registry and configurations.
const (
BalancedAllocationName = names.NodeResourcesBalancedAllocation
// balancedAllocationPreScoreStateKey is the key in CycleState to NodeResourcesBalancedAllocation pre-computed data for Scoring.
balancedAllocationPreScoreStateKey = "PreScore" + BalancedAllocationName
)
// balancedAllocationPreScoreState computed at PreScore and used at Score.
type balancedAllocationPreScoreState struct {
// podRequests has the same order as the resources defined in NodeResourcesBalancedAllocationArgs.Resources;
// the same ordering applies everywhere else we store such a list.
podRequests []int64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *balancedAllocationPreScoreState) Clone() framework.StateData {
return s
}
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state.
func (ba *BalancedAllocation) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
state := &balancedAllocationPreScoreState{
podRequests: ba.calculatePodResourceRequestList(pod, ba.resources),
}
cycleState.Write(balancedAllocationPreScoreStateKey, state)
return nil
}
func getBalancedAllocationPreScoreState(cycleState *framework.CycleState) (*balancedAllocationPreScoreState, error) {
c, err := cycleState.Read(balancedAllocationPreScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", balancedAllocationPreScoreStateKey, err)
}
s, ok := c.(*balancedAllocationPreScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (ba *BalancedAllocation) Name() string {
return BalancedAllocationName
}
// Score invoked at the score extension point.
func (ba *BalancedAllocation) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := ba.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
s, err := getBalancedAllocationPreScoreState(state)
if err != nil {
s = &balancedAllocationPreScoreState{podRequests: ba.calculatePodResourceRequestList(pod, ba.resources)}
}
// ba.score favors nodes with balanced resource usage rate.
// It calculates the standard deviation for those resources and prioritizes the node based on how close the usage of those resources is to each other.
// Detail: score = (1 - std) * MaxNodeScore, where std is the square root of Σ((fraction(i)-mean)^2)/len(resources)
// The algorithm is partly inspired by:
// "Wei Huang et al. An Energy Efficient Virtual Machine Placement Algorithm with Balanced Resource Utilization"
return ba.score(ctx, pod, nodeInfo, s.podRequests)
}
// ScoreExtensions of the Score plugin.
func (ba *BalancedAllocation) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// NewBalancedAllocation initializes a new plugin and returns it.
func NewBalancedAllocation(_ context.Context, baArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := baArgs.(*config.NodeResourcesBalancedAllocationArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type NodeResourcesBalancedAllocationArgs, got %T", baArgs)
}
if err := validation.ValidateNodeResourcesBalancedAllocationArgs(nil, args); err != nil {
return nil, err
}
return &BalancedAllocation{
handle: h,
resourceAllocationScorer: resourceAllocationScorer{
Name: BalancedAllocationName,
scorer: balancedResourceScorer,
useRequested: true,
resources: args.Resources,
},
}, nil
}
func balancedResourceScorer(requested, allocable []int64) int64 {
var resourceToFractions []float64
var totalFraction float64
for i := range requested {
if allocable[i] == 0 {
continue
}
fraction := float64(requested[i]) / float64(allocable[i])
if fraction > 1 {
fraction = 1
}
totalFraction += fraction
resourceToFractions = append(resourceToFractions, fraction)
}
std := 0.0
// In most cases resources are limited to cpu and memory, so the std can be simplified to std := |fraction1-fraction2|/2.
// For len(fractions) > 2, calculate the std with the well-known formula: the square root of Σ((fraction(i)-mean)^2)/len(fractions).
// Otherwise, leaving the std at zero is enough.
if len(resourceToFractions) == 2 {
std = math.Abs((resourceToFractions[0] - resourceToFractions[1]) / 2)
} else if len(resourceToFractions) > 2 {
mean := totalFraction / float64(len(resourceToFractions))
var sum float64
for _, fraction := range resourceToFractions {
sum = sum + (fraction-mean)*(fraction-mean)
}
std = math.Sqrt(sum / float64(len(resourceToFractions)))
}
// STD (standard deviation) is always non-negative. 1-std gives a higher score to the node with the least deviation,
// and multiplying by `MaxNodeScore` provides the required scaling factor.
return int64((1 - std) * float64(framework.MaxNodeScore))
}
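// exampleBalancedScore is an illustrative sketch, not part of the upstream plugin; the
// requested and allocatable values below are hypothetical. With CPU at 500m of 1000m
// (fraction 0.5) and memory at 7Gi of 10Gi (fraction 0.7), std = |0.5-0.7|/2 = 0.1,
// so the node scores (1-0.1)*MaxNodeScore = 90.
func exampleBalancedScore() int64 {
	requested := []int64{500, 7 << 30}   // milliCPU, bytes of memory
	allocable := []int64{1000, 10 << 30} // milliCPU, bytes of memory
	return balancedResourceScorer(requested, allocable) // 90
}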

View File

@ -0,0 +1,596 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
"fmt"
"strings"
"github.com/google/go-cmp/cmp"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/resource"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
var _ framework.PreFilterPlugin = &Fit{}
var _ framework.FilterPlugin = &Fit{}
var _ framework.EnqueueExtensions = &Fit{}
var _ framework.PreScorePlugin = &Fit{}
var _ framework.ScorePlugin = &Fit{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.NodeResourcesFit
// preFilterStateKey is the key in CycleState to NodeResourcesFit pre-computed data.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// preScoreStateKey is the key in CycleState to NodeResourcesFit pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
)
// nodeResourceStrategyTypeMap maps strategy to scorer implementation
var nodeResourceStrategyTypeMap = map[config.ScoringStrategyType]scorer{
config.LeastAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.LeastAllocated),
scorer: leastResourceScorer(resources),
resources: resources,
}
},
config.MostAllocated: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.MostAllocated),
scorer: mostResourceScorer(resources),
resources: resources,
}
},
config.RequestedToCapacityRatio: func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer {
resources := args.ScoringStrategy.Resources
return &resourceAllocationScorer{
Name: string(config.RequestedToCapacityRatio),
scorer: requestedToCapacityRatioScorer(resources, args.ScoringStrategy.RequestedToCapacityRatio.Shape),
resources: resources,
}
},
}
// Fit is a plugin that checks if a node has sufficient resources.
type Fit struct {
ignoredResources sets.Set[string]
ignoredResourceGroups sets.Set[string]
enableInPlacePodVerticalScaling bool
enableSidecarContainers bool
enableSchedulingQueueHint bool
enablePodLevelResources bool
handle framework.Handle
resourceAllocationScorer
}
// ScoreExtensions of the Score plugin.
func (f *Fit) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
framework.Resource
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
return s
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
// podRequests has the same order as the resources defined in NodeResourcesFitArgs.Resources;
// the same ordering applies everywhere else we store such a list.
podRequests []int64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// PreScore calculates the incoming pod's resource requests and writes them to the cycle state.
func (f *Fit) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
state := &preScoreState{
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("invalid PreScore state, got type %T", c)
}
return s, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (f *Fit) Name() string {
return Name
}
// NewFit initializes a new plugin and returns it.
func NewFit(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := plArgs.(*config.NodeResourcesFitArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", plArgs)
}
if err := validation.ValidateNodeResourcesFitArgs(nil, args); err != nil {
return nil, err
}
if args.ScoringStrategy == nil {
return nil, fmt.Errorf("scoring strategy not specified")
}
strategy := args.ScoringStrategy.Type
scorePlugin, exists := nodeResourceStrategyTypeMap[strategy]
if !exists {
return nil, fmt.Errorf("scoring strategy %s is not supported", strategy)
}
return &Fit{
ignoredResources: sets.New(args.IgnoredResources...),
ignoredResourceGroups: sets.New(args.IgnoredResourceGroups...),
enableInPlacePodVerticalScaling: fts.EnableInPlacePodVerticalScaling,
enableSidecarContainers: fts.EnableSidecarContainers,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
handle: h,
enablePodLevelResources: fts.EnablePodLevelResources,
resourceAllocationScorer: *scorePlugin(args),
}, nil
}
type ResourceRequestsOptions struct {
EnablePodLevelResources bool
}
// computePodResourceRequest returns a framework.Resource that covers the largest
// width in each resource dimension. Because init-containers run sequentially, we collect
// the max in each dimension iteratively. In contrast, we sum the resource vectors for
// regular containers since they run simultaneously.
//
// # The resources defined for Overhead should be added to the calculated Resource request sum
//
// Example:
//
// Pod:
//
// InitContainers
// IC1:
// CPU: 2
// Memory: 1G
// IC2:
// CPU: 2
// Memory: 3G
// Containers
// C1:
// CPU: 2
// Memory: 1G
// C2:
// CPU: 1
// Memory: 1G
//
// Result: CPU: 3, Memory: 3G
// TODO(ndixita): modify computePodResourceRequest to accept opts of type
// ResourceRequestOptions as the second parameter.
func computePodResourceRequest(pod *v1.Pod, opts ResourceRequestsOptions) *preFilterState {
// The pod hasn't been scheduled yet, so we don't need to worry about InPlacePodVerticalScaling.
reqs := resource.PodRequests(pod, resource.PodResourcesOptions{
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !opts.EnablePodLevelResources,
})
result := &preFilterState{}
result.SetMaxResource(reqs)
return result
}
// PreFilter invoked at the prefilter extension point.
func (f *Fit) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
if !f.enableSidecarContainers && hasRestartableInitContainer(pod) {
// Scheduler will calculate resources usage for a Pod containing
// restartable init containers that will be equal or more than kubelet will
// require to run the Pod. So there will be no overbooking. However, to
// avoid the inconsistency in resource calculation between the scheduler
// and the older (before v1.28) kubelet, make the Pod unschedulable.
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "Pod has a restartable init container and the SidecarContainers feature is disabled")
}
cycleState.Write(preFilterStateKey, computePodResourceRequest(pod, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}))
return nil, nil
}
// PreFilterExtensions returns prefilter extensions; this plugin does not implement any.
func (f *Fit) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to NodeResourcesFit.preFilterState error", c)
}
return s, nil
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (f *Fit) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
podActionType := framework.Delete
if f.enableInPlacePodVerticalScaling {
// If InPlacePodVerticalScaling (KEP 1287) is enabled, then UpdatePodScaleDown event should be registered
// for this plugin since a Pod update may free up resources that make other Pods schedulable.
podActionType |= framework.UpdatePodScaleDown
}
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeAllocatable, because only a resource update could change this plugin's result.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeAllocatable | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if f.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add | framework.UpdateNodeAllocatable
}
return []framework.ClusterEventWithHint{
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: f.isSchedulableAfterPodEvent},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}, QueueingHintFn: f.isSchedulableAfterNodeChange},
}, nil
}
// isSchedulableAfterPodEvent is invoked whenever a pod is deleted or scaled down. It checks whether
// that change made a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterPodEvent(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := schedutil.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if modifiedPod == nil {
if originalPod.Spec.NodeName == "" {
logger.V(5).Info("the deleted pod was unscheduled and it wouldn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
// any deletion event to a scheduled pod could make the unscheduled pod schedulable.
logger.V(5).Info("another scheduled pod was deleted, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.Queue, nil
}
if !f.enableInPlacePodVerticalScaling {
// If InPlacePodVerticalScaling (KEP 1287) is disabled, the pod scale down event cannot free up any resources.
logger.V(5).Info("another pod was modified, but InPlacePodVerticalScaling is disabled, so it doesn't make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
if !f.isSchedulableAfterPodScaleDown(pod, originalPod, modifiedPod) {
if loggerV := logger.V(10); loggerV.Enabled() {
// Log more information.
loggerV.Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod), "diff", cmp.Diff(originalPod, modifiedPod))
} else {
logger.V(5).Info("pod got scaled down, but the modification isn't related to the resource requests of the target pod", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
}
return framework.QueueSkip, nil
}
logger.V(5).Info("another scheduled pod or the target pod itself got scaled down, and it may make the unscheduled pod schedulable", "pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
// isSchedulableAfterPodScaleDown checks whether the scale down event may make the target pod schedulable. Specifically:
// - Returns true when the update event is for the target pod itself.
// - Returns true when the update event shows a scheduled pod's resource request that the target pod also requests got reduced.
func (f *Fit) isSchedulableAfterPodScaleDown(targetPod, originalPod, modifiedPod *v1.Pod) bool {
if modifiedPod.UID == targetPod.UID {
// If the scaling down event is for targetPod, it would make targetPod schedulable.
return true
}
if modifiedPod.Spec.NodeName == "" {
// If the update event is for an unscheduled Pod,
// it wouldn't make targetPod schedulable.
return false
}
// the other pod was scheduled, so modification or deletion may free up some resources.
originalMaxResourceReq, modifiedMaxResourceReq := &framework.Resource{}, &framework.Resource{}
originalMaxResourceReq.SetMaxResource(resource.PodRequests(originalPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
modifiedMaxResourceReq.SetMaxResource(resource.PodRequests(modifiedPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling}))
// check whether the resource request of the modified pod is less than the original pod.
podRequests := resource.PodRequests(targetPod, resource.PodResourcesOptions{UseStatusResources: f.enableInPlacePodVerticalScaling})
for rName, rValue := range podRequests {
if rValue.IsZero() {
// We only care about the resources requested by the pod we are trying to schedule.
continue
}
switch rName {
case v1.ResourceCPU:
if originalMaxResourceReq.MilliCPU > modifiedMaxResourceReq.MilliCPU {
return true
}
case v1.ResourceMemory:
if originalMaxResourceReq.Memory > modifiedMaxResourceReq.Memory {
return true
}
case v1.ResourceEphemeralStorage:
if originalMaxResourceReq.EphemeralStorage > modifiedMaxResourceReq.EphemeralStorage {
return true
}
default:
if schedutil.IsScalarResourceName(rName) && originalMaxResourceReq.ScalarResources[rName] > modifiedMaxResourceReq.ScalarResources[rName] {
return true
}
}
}
return false
}
// isSchedulableAfterNodeChange is invoked whenever a node is added or changed. It checks whether
// that change could make a previously unschedulable pod schedulable.
func (f *Fit) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := schedutil.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// Leaving in the queue, since the pod won't fit into the modified node anyway.
if !isFit(pod, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
logger.V(5).Info("node was created or updated, but it doesn't have enough resource(s) to accommodate this pod", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// The pod will fit, so since it's an Add event, unblock scheduling.
if originalNode == nil {
logger.V(5).Info("node was added and it might fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// The pod will fit, but since there was no increase in available resources, the change won't make the pod schedulable.
if !haveAnyRequestedResourcesIncreased(pod, originalNode, modifiedNode, ResourceRequestsOptions{EnablePodLevelResources: f.enablePodLevelResources}) {
logger.V(5).Info("node was updated, but haven't changed the pod's resource requestments fit assessment", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("node was updated, and may now fit the pod's resource requests", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
// haveAnyRequestedResourcesIncreased returns true if the node's allocatable amount of any resource requested by the pod has increased, or if the allowed pod number has increased.
func haveAnyRequestedResourcesIncreased(pod *v1.Pod, originalNode, modifiedNode *v1.Node, opts ResourceRequestsOptions) bool {
podRequest := computePodResourceRequest(pod, opts)
originalNodeInfo := framework.NewNodeInfo()
originalNodeInfo.SetNode(originalNode)
modifiedNodeInfo := framework.NewNodeInfo()
modifiedNodeInfo.SetNode(modifiedNode)
if modifiedNodeInfo.Allocatable.AllowedPodNumber > originalNodeInfo.Allocatable.AllowedPodNumber {
return true
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return false
}
if (podRequest.MilliCPU > 0 && modifiedNodeInfo.Allocatable.MilliCPU > originalNodeInfo.Allocatable.MilliCPU) ||
(podRequest.Memory > 0 && modifiedNodeInfo.Allocatable.Memory > originalNodeInfo.Allocatable.Memory) ||
(podRequest.EphemeralStorage > 0 && modifiedNodeInfo.Allocatable.EphemeralStorage > originalNodeInfo.Allocatable.EphemeralStorage) {
return true
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip in case request quantity is zero
if rQuant == 0 {
continue
}
if modifiedNodeInfo.Allocatable.ScalarResources[rName] > originalNodeInfo.Allocatable.ScalarResources[rName] {
return true
}
}
return false
}
// isFit checks if the pod fits the node. If the node is nil, it returns false.
// It constructs a fake NodeInfo object for the node and checks if the pod fits the node.
func isFit(pod *v1.Pod, node *v1.Node, opts ResourceRequestsOptions) bool {
if node == nil {
return false
}
nodeInfo := framework.NewNodeInfo()
nodeInfo.SetNode(node)
return len(Fits(pod, nodeInfo, opts)) == 0
}
// Filter invoked at the filter extension point.
// Checks if a node has sufficient resources, such as cpu, memory, gpu, opaque int resources etc to run a pod.
// It returns a list of insufficient resources, if empty, then the node has all the resources requested by the pod.
func (f *Fit) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
insufficientResources := fitsRequest(s, nodeInfo, f.ignoredResources, f.ignoredResourceGroups)
if len(insufficientResources) != 0 {
// We will keep all failure reasons.
failureReasons := make([]string, 0, len(insufficientResources))
for i := range insufficientResources {
failureReasons = append(failureReasons, insufficientResources[i].Reason)
}
return framework.NewStatus(framework.Unschedulable, failureReasons...)
}
return nil
}
func hasRestartableInitContainer(pod *v1.Pod) bool {
for _, c := range pod.Spec.InitContainers {
if c.RestartPolicy != nil && *c.RestartPolicy == v1.ContainerRestartPolicyAlways {
return true
}
}
return false
}
// InsufficientResource describes what kind of resource limit is hit and caused the pod to not fit the node.
type InsufficientResource struct {
ResourceName v1.ResourceName
// We explicitly have a parameter for reason to avoid formatting a message on the fly
// for common resources, which is expensive for cluster autoscaler simulations.
Reason string
Requested int64
Used int64
Capacity int64
}
// Fits checks if the node has enough resources to host the pod.
func Fits(pod *v1.Pod, nodeInfo *framework.NodeInfo, opts ResourceRequestsOptions) []InsufficientResource {
return fitsRequest(computePodResourceRequest(pod, opts), nodeInfo, nil, nil)
}
func fitsRequest(podRequest *preFilterState, nodeInfo *framework.NodeInfo, ignoredExtendedResources, ignoredResourceGroups sets.Set[string]) []InsufficientResource {
insufficientResources := make([]InsufficientResource, 0, 4)
allowedPodNumber := nodeInfo.Allocatable.AllowedPodNumber
if len(nodeInfo.Pods)+1 > allowedPodNumber {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourcePods,
Reason: "Too many pods",
Requested: 1,
Used: int64(len(nodeInfo.Pods)),
Capacity: int64(allowedPodNumber),
})
}
if podRequest.MilliCPU == 0 &&
podRequest.Memory == 0 &&
podRequest.EphemeralStorage == 0 &&
len(podRequest.ScalarResources) == 0 {
return insufficientResources
}
if podRequest.MilliCPU > 0 && podRequest.MilliCPU > (nodeInfo.Allocatable.MilliCPU-nodeInfo.Requested.MilliCPU) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceCPU,
Reason: "Insufficient cpu",
Requested: podRequest.MilliCPU,
Used: nodeInfo.Requested.MilliCPU,
Capacity: nodeInfo.Allocatable.MilliCPU,
})
}
if podRequest.Memory > 0 && podRequest.Memory > (nodeInfo.Allocatable.Memory-nodeInfo.Requested.Memory) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceMemory,
Reason: "Insufficient memory",
Requested: podRequest.Memory,
Used: nodeInfo.Requested.Memory,
Capacity: nodeInfo.Allocatable.Memory,
})
}
if podRequest.EphemeralStorage > 0 &&
podRequest.EphemeralStorage > (nodeInfo.Allocatable.EphemeralStorage-nodeInfo.Requested.EphemeralStorage) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: v1.ResourceEphemeralStorage,
Reason: "Insufficient ephemeral-storage",
Requested: podRequest.EphemeralStorage,
Used: nodeInfo.Requested.EphemeralStorage,
Capacity: nodeInfo.Allocatable.EphemeralStorage,
})
}
for rName, rQuant := range podRequest.ScalarResources {
// Skip in case request quantity is zero
if rQuant == 0 {
continue
}
if v1helper.IsExtendedResourceName(rName) {
// If this resource is one of the extended resources that should be ignored, we will skip checking it.
// rName is guaranteed to have a slash due to API validation.
var rNamePrefix string
if ignoredResourceGroups.Len() > 0 {
rNamePrefix = strings.Split(string(rName), "/")[0]
}
if ignoredExtendedResources.Has(string(rName)) || ignoredResourceGroups.Has(rNamePrefix) {
continue
}
}
if rQuant > (nodeInfo.Allocatable.ScalarResources[rName] - nodeInfo.Requested.ScalarResources[rName]) {
insufficientResources = append(insufficientResources, InsufficientResource{
ResourceName: rName,
Reason: fmt.Sprintf("Insufficient %v", rName),
Requested: podRequest.ScalarResources[rName],
Used: nodeInfo.Requested.ScalarResources[rName],
Capacity: nodeInfo.Allocatable.ScalarResources[rName],
})
}
}
return insufficientResources
}
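// exampleInsufficientCPU is an illustrative sketch, not part of the upstream plugin; the
// capacities below are hypothetical. A request of 2000 milliCPU against a node with 1000
// milliCPU allocatable (500 already requested) yields a single "Insufficient cpu" entry.
func exampleInsufficientCPU() []InsufficientResource {
	nodeInfo := framework.NewNodeInfo()
	nodeInfo.Allocatable.MilliCPU = 1000
	nodeInfo.Allocatable.AllowedPodNumber = 110
	nodeInfo.Requested.MilliCPU = 500
	podRequest := &preFilterState{Resource: framework.Resource{MilliCPU: 2000}}
	return fitsRequest(podRequest, nodeInfo, nil, nil) // one InsufficientResource for CPU
}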
// Score invoked at the Score extension point.
func (f *Fit) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := f.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
s, err := getPreScoreState(state)
if err != nil {
s = &preScoreState{
podRequests: f.calculatePodResourceRequestList(pod, f.resources),
}
}
return f.score(ctx, pod, nodeInfo, s.podRequests)
}

View File

@ -0,0 +1,61 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// leastResourceScorer favors nodes with fewer requested resources.
// It calculates the percentage of memory, CPU and other resources requested by pods scheduled on the node, and
// prioritizes based on the minimum of the average of the fraction of requested to capacity.
//
// Details:
// (cpu((capacity-requested)*MaxNodeScore*cpuWeight/capacity) + memory((capacity-requested)*MaxNodeScore*memoryWeight/capacity) + ...)/weightSum
func leastResourceScorer(resources []config.ResourceSpec) func([]int64, []int64) int64 {
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := leastRequestedScore(requested[i], allocable[i])
nodeScore += resourceScore * weight
weightSum += weight
}
if weightSum == 0 {
return 0
}
return nodeScore / weightSum
}
}
// The unused capacity is calculated on a scale of 0-MaxNodeScore
// 0 being the lowest priority and `MaxNodeScore` being the highest.
// The more unused resources the higher the score is.
func leastRequestedScore(requested, capacity int64) int64 {
if capacity == 0 {
return 0
}
if requested > capacity {
return 0
}
return ((capacity - requested) * framework.MaxNodeScore) / capacity
}
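// exampleLeastAllocatedScore is an illustrative sketch, not part of the upstream plugin;
// the values are hypothetical. With 250m of 1000m CPU requested the CPU score is 75, and
// with 512 of 1024 memory units requested the memory score is 50; the weighted average
// (both weights 1) is 62 after integer division.
func exampleLeastAllocatedScore() int64 {
	resources := []config.ResourceSpec{{Name: "cpu", Weight: 1}, {Name: "memory", Weight: 1}}
	score := leastResourceScorer(resources)
	return score([]int64{250, 512}, []int64{1000, 1024}) // (75+50)/2 = 62
}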

View File

@ -0,0 +1,65 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// mostResourceScorer favors nodes with most requested resources.
// It calculates the percentage of memory and CPU requested by pods scheduled on the node, and prioritizes
// based on the maximum of the average of the fraction of requested to capacity.
//
// Details:
// (cpu(MaxNodeScore * requested * cpuWeight / capacity) + memory(MaxNodeScore * requested * memoryWeight / capacity) + ...) / weightSum
func mostResourceScorer(resources []config.ResourceSpec) func(requested, allocable []int64) int64 {
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := mostRequestedScore(requested[i], allocable[i])
nodeScore += resourceScore * weight
weightSum += weight
}
if weightSum == 0 {
return 0
}
return nodeScore / weightSum
}
}
// The used capacity is calculated on a scale of 0-MaxNodeScore (MaxNodeScore is
// constant with value set to 100).
// 0 being the lowest priority and 100 being the highest.
// The more resources are used the higher the score is. This function
// is almost a reversed version of noderesources.leastRequestedScore.
func mostRequestedScore(requested, capacity int64) int64 {
if capacity == 0 {
return 0
}
if requested > capacity {
// `requested` might be greater than `capacity` because pods with no
// requests get minimum values.
requested = capacity
}
return (requested * framework.MaxNodeScore) / capacity
}
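// exampleMostAllocatedScore is an illustrative sketch, not part of the upstream plugin;
// the values are hypothetical. With 250m of 1000m CPU requested the CPU score is 25, and
// with 512 of 1024 memory units requested the memory score is 50; the weighted average
// (both weights 1) is 37 after integer division.
func exampleMostAllocatedScore() int64 {
	resources := []config.ResourceSpec{{Name: "cpu", Weight: 1}, {Name: "memory", Weight: 1}}
	score := mostResourceScorer(resources)
	return score([]int64{250, 512}, []int64{1000, 1024}) // (25+50)/2 = 37
}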

View File

@ -0,0 +1,73 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"math"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)
const maxUtilization = 100
// buildRequestedToCapacityRatioScorerFunction allows users to apply bin packing
// on core resources like CPU, Memory as well as extended resources like accelerators.
func buildRequestedToCapacityRatioScorerFunction(scoringFunctionShape helper.FunctionShape, resources []config.ResourceSpec) func([]int64, []int64) int64 {
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
resourceScoringFunction := func(requested, capacity int64) int64 {
if capacity == 0 || requested > capacity {
return rawScoringFunction(maxUtilization)
}
return rawScoringFunction(requested * maxUtilization / capacity)
}
return func(requested, allocable []int64) int64 {
var nodeScore, weightSum int64
for i := range requested {
if allocable[i] == 0 {
continue
}
weight := resources[i].Weight
resourceScore := resourceScoringFunction(requested[i], allocable[i])
if resourceScore > 0 {
nodeScore += resourceScore * weight
weightSum += weight
}
}
if weightSum == 0 {
return 0
}
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
}
}
func requestedToCapacityRatioScorer(resources []config.ResourceSpec, shape []config.UtilizationShapePoint) func([]int64, []int64) int64 {
shapes := make([]helper.FunctionShapePoint, 0, len(shape))
for _, point := range shape {
shapes = append(shapes, helper.FunctionShapePoint{
Utilization: int64(point.Utilization),
// MaxCustomPriorityScore may diverge from the max score used in the scheduler and defined by MaxNodeScore,
// therefore we need to scale the score returned by requested to capacity ratio to the score range
// used by the scheduler.
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
})
}
return buildRequestedToCapacityRatioScorerFunction(shapes, resources)
}
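// exampleBinPackingScore is an illustrative sketch, not part of the upstream plugin; the
// shape and resource values are hypothetical. The shape below rewards higher utilization
// (0% -> score 0, 100% -> score 10, i.e. MaxCustomPriorityScore), so a node that is 50%
// utilized for CPU scores 50 after scaling to MaxNodeScore.
func exampleBinPackingScore() int64 {
	shape := []config.UtilizationShapePoint{
		{Utilization: 0, Score: 0},
		{Utilization: 100, Score: 10},
	}
	resources := []config.ResourceSpec{{Name: "cpu", Weight: 1}}
	score := requestedToCapacityRatioScorer(resources, shape)
	return score([]int64{500}, []int64{1000}) // 50
}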

View File

@ -0,0 +1,148 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/klog/v2"
resourcehelper "k8s.io/component-helpers/resource"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/framework"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)
// scorer is a decorator for resourceAllocationScorer.
type scorer func(args *config.NodeResourcesFitArgs) *resourceAllocationScorer
// resourceAllocationScorer contains information to calculate resource allocation score.
type resourceAllocationScorer struct {
Name string
// used to decide whether to use Requested or NonZeroRequested for
// cpu and memory.
useRequested bool
scorer func(requested, allocable []int64) int64
resources []config.ResourceSpec
}
// score will use `scorer` function to calculate the score.
func (r *resourceAllocationScorer) score(
ctx context.Context,
pod *v1.Pod,
nodeInfo *framework.NodeInfo,
podRequests []int64) (int64, *framework.Status) {
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
// No resources are configured for scoring; nothing to do.
if len(r.resources) == 0 {
return 0, framework.NewStatus(framework.Error, "resources not found")
}
requested := make([]int64, len(r.resources))
allocatable := make([]int64, len(r.resources))
for i := range r.resources {
alloc, req := r.calculateResourceAllocatableRequest(logger, nodeInfo, v1.ResourceName(r.resources[i].Name), podRequests[i])
// Only fill the extended resource entry when it's non-zero.
if alloc == 0 {
continue
}
allocatable[i] = alloc
requested[i] = req
}
score := r.scorer(requested, allocatable)
if loggerV := logger.V(10); loggerV.Enabled() { // Serializing these maps is costly.
loggerV.Info("Listed internal info for allocatable resources, requested resources and score", "pod",
klog.KObj(pod), "node", klog.KObj(node), "resourceAllocationScorer", r.Name,
"allocatableResource", allocatable, "requestedResource", requested, "resourceScore", score,
)
}
return score, nil
}
// calculateResourceAllocatableRequest returns 2 parameters:
// - 1st param: quantity of allocatable resource on the node.
// - 2nd param: aggregated quantity of requested resource on the node.
// Note: if it's an extended resource, and the pod doesn't request it, (0, 0) is returned.
func (r *resourceAllocationScorer) calculateResourceAllocatableRequest(logger klog.Logger, nodeInfo *framework.NodeInfo, resource v1.ResourceName, podRequest int64) (int64, int64) {
requested := nodeInfo.NonZeroRequested
if r.useRequested {
requested = nodeInfo.Requested
}
// If it's an extended resource and the pod doesn't request it, we return (0, 0)
// to bypass scoring on this resource.
if podRequest == 0 && schedutil.IsScalarResourceName(resource) {
return 0, 0
}
switch resource {
case v1.ResourceCPU:
return nodeInfo.Allocatable.MilliCPU, (requested.MilliCPU + podRequest)
case v1.ResourceMemory:
return nodeInfo.Allocatable.Memory, (requested.Memory + podRequest)
case v1.ResourceEphemeralStorage:
return nodeInfo.Allocatable.EphemeralStorage, (nodeInfo.Requested.EphemeralStorage + podRequest)
default:
if _, exists := nodeInfo.Allocatable.ScalarResources[resource]; exists {
return nodeInfo.Allocatable.ScalarResources[resource], (nodeInfo.Requested.ScalarResources[resource] + podRequest)
}
}
logger.V(10).Info("Requested resource is omitted for node score calculation", "resourceName", resource)
return 0, 0
}
// calculatePodResourceRequest returns the total non-zero requests. If Overhead is defined for the pod
// the Overhead is added to the result.
func (r *resourceAllocationScorer) calculatePodResourceRequest(pod *v1.Pod, resourceName v1.ResourceName) int64 {
opts := resourcehelper.PodResourcesOptions{
UseStatusResources: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
// SkipPodLevelResources is set to false when PodLevelResources feature is enabled.
SkipPodLevelResources: !utilfeature.DefaultFeatureGate.Enabled(features.PodLevelResources),
}
if !r.useRequested {
opts.NonMissingContainerRequests = v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(schedutil.DefaultMilliCPURequest, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(schedutil.DefaultMemoryRequest, resource.DecimalSI),
}
}
requests := resourcehelper.PodRequests(pod, opts)
quantity := requests[resourceName]
if resourceName == v1.ResourceCPU {
return quantity.MilliValue()
}
return quantity.Value()
}
func (r *resourceAllocationScorer) calculatePodResourceRequestList(pod *v1.Pod, resources []config.ResourceSpec) []int64 {
podRequests := make([]int64, len(resources))
for i := range resources {
podRequests[i] = r.calculatePodResourceRequest(pod, v1.ResourceName(resources[i].Name))
}
return podRequests
}

View File

@ -0,0 +1,57 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesources
import (
"github.com/google/go-cmp/cmp/cmpopts"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/validation/field"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
)
var (
ignoreBadValueDetail = cmpopts.IgnoreFields(field.Error{}, "BadValue", "Detail")
defaultResources = []config.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
}
extendedRes = "abc.com/xyz"
extendedResourceSet = []config.ResourceSpec{
{Name: string(v1.ResourceCPU), Weight: 1},
{Name: string(v1.ResourceMemory), Weight: 1},
{Name: extendedRes, Weight: 1},
}
)
func makeNode(node string, milliCPU, memory int64, extendedResource map[string]int64) *v1.Node {
resourceList := make(map[v1.ResourceName]resource.Quantity)
for res, quantity := range extendedResource {
resourceList[v1.ResourceName(res)] = *resource.NewQuantity(quantity, resource.DecimalSI)
}
resourceList[v1.ResourceCPU] = *resource.NewMilliQuantity(milliCPU, resource.DecimalSI)
resourceList[v1.ResourceMemory] = *resource.NewQuantity(memory, resource.BinarySI)
return &v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Capacity: resourceList,
Allocatable: resourceList,
},
}
}
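// exampleNode is an illustrative sketch, not part of the upstream helpers; the values
// are hypothetical: a node with 4 CPUs, 8Gi of memory and two "abc.com/xyz" devices,
// where capacity equals allocatable.
func exampleNode() *v1.Node {
	return makeNode("node-a", 4000, 8*1024*1024*1024, map[string]int64{extendedRes: 2})
}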

View File

@ -0,0 +1,154 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodeunschedulable
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// NodeUnschedulable plugin filters nodes that set node.Spec.Unschedulable=true unless
// the pod tolerates {key=node.kubernetes.io/unschedulable, effect:NoSchedule} taint.
type NodeUnschedulable struct {
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &NodeUnschedulable{}
var _ framework.EnqueueExtensions = &NodeUnschedulable{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.NodeUnschedulable
const (
// ErrReasonUnknownCondition is used for NodeUnknownCondition predicate error.
ErrReasonUnknownCondition = "node(s) had unknown conditions"
// ErrReasonUnschedulable is used for NodeUnschedulable predicate error.
ErrReasonUnschedulable = "node(s) were unschedulable"
)
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *NodeUnschedulable) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enableSchedulingQueueHint {
return []framework.ClusterEventWithHint{
// A note about UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
return []framework.ClusterEventWithHint{
// When QueueingHint is enabled, we don't use preCheck and we don't need to register UpdateNodeLabel event.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's toleration changed.
func (pl *NodeUnschedulable) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pod.UID == modifiedPod.UID {
// Note: we don't need to check whether oldPod tolerates the taint because:
// - The taint can be added, but can't be modified or removed.
// - If the Pod already had the toleration, it wouldn't have been rejected by this plugin in the first place.
// Meaning: this Pod has been rejected by this plugin, and hence it shouldn't have the toleration yet.
if v1helper.TolerationsTolerateTaint(modifiedPod.Spec.Tolerations, &v1.Taint{
Key: v1.TaintNodeUnschedulable,
Effect: v1.TaintEffectNoSchedule,
}) {
// This update makes the pod tolerate the unschedulable taint.
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a new toleration is added for the unschedulable Pod, but it's an unrelated toleration", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// isSchedulableAfterNodeChange is invoked for all node events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable.
func (pl *NodeUnschedulable) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
// We queue this Pod when -
// 1. the node is updated from unschedulable to schedulable.
// 2. the node is added and is schedulable.
if (originalNode != nil && originalNode.Spec.Unschedulable && !modifiedNode.Spec.Unschedulable) ||
(originalNode == nil && !modifiedNode.Spec.Unschedulable) {
logger.V(5).Info("node was created or updated, pod may be schedulable now", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
logger.V(5).Info("node was created or updated, but it doesn't make this pod schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *NodeUnschedulable) Name() string {
return Name
}
// Filter invoked at the filter extension point.
func (pl *NodeUnschedulable) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
if !node.Spec.Unschedulable {
return nil
}
// If the pod tolerates the unschedulable taint, it also tolerates `node.Spec.Unschedulable`.
podToleratesUnschedulable := v1helper.TolerationsTolerateTaint(pod.Spec.Tolerations, &v1.Taint{
Key: v1.TaintNodeUnschedulable,
Effect: v1.TaintEffectNoSchedule,
})
if !podToleratesUnschedulable {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonUnschedulable)
}
return nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &NodeUnschedulable{enableSchedulingQueueHint: fts.EnableSchedulingQueueHint}, nil
}
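The toleration check above is the whole of the plugin's Filter decision. As a minimal, self-contained sketch (not part of the vendored file; the pod and values below are illustrative), the same check can be reproduced directly with the component-helpers call used in Filter:

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    v1helper "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
    // Hypothetical pod that tolerates the unschedulable taint (e.g. a DaemonSet-style pod).
    pod := &v1.Pod{
        Spec: v1.PodSpec{
            Tolerations: []v1.Toleration{
                {Key: v1.TaintNodeUnschedulable, Operator: v1.TolerationOpExists, Effect: v1.TaintEffectNoSchedule},
            },
        },
    }
    tolerated := v1helper.TolerationsTolerateTaint(pod.Spec.Tolerations, &v1.Taint{
        Key:    v1.TaintNodeUnschedulable,
        Effect: v1.TaintEffectNoSchedule,
    })
    fmt.Println(tolerated) // true: such a pod passes Filter even on a cordoned node
}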

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,539 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodevolumelimits
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/rand"
corelisters "k8s.io/client-go/listers/core/v1"
storagelisters "k8s.io/client-go/listers/storage/v1"
ephemeral "k8s.io/component-helpers/storage/ephemeral"
storagehelpers "k8s.io/component-helpers/storage/volume"
csitrans "k8s.io/csi-translation-lib"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// ErrReasonMaxVolumeCountExceeded is used for MaxVolumeCount predicate error.
ErrReasonMaxVolumeCountExceeded = "node(s) exceed max volume count"
)
// InTreeToCSITranslator contains methods required to check migratable status
// and perform translations from InTree PV's to CSI
type InTreeToCSITranslator interface {
IsPVMigratable(pv *v1.PersistentVolume) bool
IsInlineMigratable(vol *v1.Volume) bool
IsMigratableIntreePluginByName(inTreePluginName string) bool
GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
GetCSINameFromInTreeName(pluginName string) (string, error)
TranslateInTreePVToCSI(logger klog.Logger, pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
TranslateInTreeInlineVolumeToCSI(logger klog.Logger, volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error)
}
// CSILimits is a plugin that checks node volume limits.
type CSILimits struct {
csiNodeLister storagelisters.CSINodeLister
pvLister corelisters.PersistentVolumeLister
pvcLister corelisters.PersistentVolumeClaimLister
scLister storagelisters.StorageClassLister
vaLister storagelisters.VolumeAttachmentLister
randomVolumeIDPrefix string
translator InTreeToCSITranslator
}
var _ framework.PreFilterPlugin = &CSILimits{}
var _ framework.FilterPlugin = &CSILimits{}
var _ framework.EnqueueExtensions = &CSILimits{}
// CSIName is the name of the plugin used in the plugin registry and configurations.
const CSIName = names.NodeVolumeLimits
// Name returns name of the plugin. It is used in logs, etc.
func (pl *CSILimits) Name() string {
return CSIName
}
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
func (pl *CSILimits) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
return []framework.ClusterEventWithHint{
// We don't register any `QueueingHintFn` intentionally
// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterPVCAdded},
{Event: framework.ClusterEvent{Resource: framework.VolumeAttachment, ActionType: framework.Delete}},
}, nil
}
func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
}
if len(deletedPod.Spec.Volumes) == 0 {
return framework.QueueSkip, nil
}
if deletedPod.Spec.NodeName == "" {
return framework.QueueSkip, nil
}
for _, vol := range deletedPod.Spec.Volumes {
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(&vol) {
return framework.Queue, nil
}
}
logger.V(5).Info("The deleted pod does not impact the scheduling of the unscheduled pod", "deletedPod", klog.KObj(pod), "pod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
func (pl *CSILimits) isSchedulableAfterPVCAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, addedPvc, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPVCAdded: %w", err)
}
if addedPvc.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
for _, volumes := range pod.Spec.Volumes {
var pvcName string
switch {
case volumes.PersistentVolumeClaim != nil:
pvcName = volumes.PersistentVolumeClaim.ClaimName
case volumes.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &volumes)
default:
// Volume is not using a PVC, ignore
continue
}
if pvcName == addedPvc.Name {
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
return framework.Queue, nil
}
}
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(addedPvc))
return framework.QueueSkip, nil
}
// PreFilter invoked at the prefilter extension point
//
// If the pod doesn't have any of those volume types, we'll skip the Filter phase.
func (pl *CSILimits) PreFilter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
volumes := pod.Spec.Volumes
for i := range volumes {
vol := &volumes[i]
if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(vol) {
return nil, nil
}
}
return nil, framework.NewStatus(framework.Skip)
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *CSILimits) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter invoked at the filter extension point.
func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
// If the new pod doesn't have any volume attached to it, the predicate will always be true
if len(pod.Spec.Volumes) == 0 {
return nil
}
node := nodeInfo.Node()
logger := klog.FromContext(ctx)
csiNode, err := pl.csiNodeLister.Get(node.Name)
if err != nil {
// TODO: return the error once CSINode is created by default (2 releases)
logger.V(5).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
}
// Count CSI volumes from the new pod
newVolumes := make(map[string]string)
if err := pl.filterAttachableVolumes(logger, pod, csiNode, true /* new pod */, newVolumes); err != nil {
if apierrors.IsNotFound(err) {
// PVC is not found. This Pod will never be schedulable until PVC is created.
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return framework.AsStatus(err)
}
// If the pod doesn't have any new CSI volumes, the predicate will always be true
if len(newVolumes) == 0 {
return nil
}
// If the node doesn't have volume limits, the predicate will always be true
nodeVolumeLimits := getVolumeLimits(csiNode)
if len(nodeVolumeLimits) == 0 {
return nil
}
// Count CSI volumes from existing pods
attachedVolumes := make(map[string]string)
for _, existingPod := range nodeInfo.Pods {
if err := pl.filterAttachableVolumes(logger, existingPod.Pod, csiNode, false /* existing pod */, attachedVolumes); err != nil {
return framework.AsStatus(err)
}
}
attachedVolumeCount := map[string]int{}
for volumeUniqueName, driverName := range attachedVolumes {
// Don't count single volume used in multiple pods more than once
delete(newVolumes, volumeUniqueName)
attachedVolumeCount[driverName]++
}
// Count CSI volumes from VolumeAttachments
volumeAttachments, err := pl.getNodeVolumeAttachmentInfo(logger, node.Name)
if err != nil {
return framework.AsStatus(err)
}
for volumeUniqueName, driverName := range volumeAttachments {
// Avoid double-counting volumes already used by existing pods
if _, exists := attachedVolumes[volumeUniqueName]; !exists {
attachedVolumeCount[driverName]++
}
}
// Count the new volumes count per driver
newVolumeCount := map[string]int{}
for _, driverName := range newVolumes {
newVolumeCount[driverName]++
}
for driverName, count := range newVolumeCount {
maxVolumeLimit, ok := nodeVolumeLimits[driverName]
if ok {
currentVolumeCount := attachedVolumeCount[driverName]
logger.V(5).Info("Found plugin volume limits", "node", node.Name, "driverName", driverName,
"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
"pod", klog.KObj(pod))
if currentVolumeCount+count > int(maxVolumeLimit) {
return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
}
}
}
return nil
}
// filterAttachableVolumes filters the attachable volumes from the pod and adds them to the result map.
// The result map is a map of volumeUniqueName to driver name. The volumeUniqueName is a unique name for
// the volume in the format of "driverName/volumeHandle". And driver name is the CSI driver name.
func (pl *CSILimits) filterAttachableVolumes(
logger klog.Logger, pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
for _, vol := range pod.Spec.Volumes {
pvcName := ""
isEphemeral := false
switch {
case vol.PersistentVolumeClaim != nil:
// Normal CSI volume can only be used through PVC
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
// Generic ephemeral inline volumes also use a PVC,
// just with a computed name and certain ownership.
// That is checked below once the pvc object is
// retrieved.
pvcName = ephemeral.VolumeClaimName(pod, &vol)
isEphemeral = true
default:
// Inline Volume does not have PVC.
// Need to check if CSI migration is enabled for this inline volume.
// - If the volume is migratable and CSI migration is enabled, it needs to be counted
// as well.
// - If the volume is not migratable, it will be counted by the non-CSI filter.
if err := pl.checkAttachableInlineVolume(logger, &vol, csiNode, pod, result); err != nil {
return err
}
continue
}
if pvcName == "" {
return fmt.Errorf("PersistentVolumeClaim had no name")
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if err != nil {
if newPod {
// The PVC is required to proceed with
// scheduling of a new pod because it cannot
// run without it. Bail out immediately.
return fmt.Errorf("looking up PVC %s/%s: %w", pod.Namespace, pvcName, err)
}
// If the PVC is invalid, we don't count the volume because
// there's no guarantee that it belongs to the running predicate.
logger.V(5).Info("Unable to look up PVC info", "pod", klog.KObj(pod), "PVC", klog.KRef(pod.Namespace, pvcName))
continue
}
// The PVC for an ephemeral volume must be owned by the pod.
if isEphemeral {
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
return err
}
}
driverName, volumeHandle := pl.getCSIDriverInfo(logger, csiNode, pvc)
if driverName == "" || volumeHandle == "" {
logger.V(5).Info("Could not find a CSI driver name or volume handle, not counting volume")
continue
}
volumeUniqueName := getVolumeUniqueName(driverName, volumeHandle)
result[volumeUniqueName] = driverName
}
return nil
}
// checkAttachableInlineVolume takes an inline volume and adds it to the result map if the
// volume is migratable and CSI migration for this plugin has been enabled.
func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Volume, csiNode *storagev1.CSINode,
pod *v1.Pod, result map[string]string) error {
if !pl.translator.IsInlineMigratable(vol) {
return nil
}
// Check if the intree provisioner CSI migration has been enabled.
inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, vol)
if err != nil {
return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
}
if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
csiNodeName := ""
if csiNode != nil {
csiNodeName = csiNode.Name
}
logger.V(5).Info("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
"pod", klog.KObj(pod), "csiNode", csiNodeName)
return nil
}
// Do translation for the in-tree volume.
translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(logger, vol, pod.Namespace)
if err != nil || translatedPV == nil {
return fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
}
driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
if err != nil {
return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
}
// TranslateInTreeInlineVolumeToCSI should translate inline volume to CSI. If it is not set,
// the volume does not support inline. Skip the count.
if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
return nil
}
volumeUniqueName := getVolumeUniqueName(driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
result[volumeUniqueName] = driverName
return nil
}
// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
// If the PVC is from a migrated in-tree plugin, this function will return
// the information of the CSI driver that the plugin has been migrated to.
func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
pvName := pvc.Spec.VolumeName
if pvName == "" {
logger.V(5).Info("Persistent volume had no name for claim", "PVC", klog.KObj(pvc))
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
}
pv, err := pl.pvLister.Get(pvName)
if err != nil {
logger.V(5).Info("Unable to look up PV info for PVC and PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvName))
// If we can't fetch the PV associated with the PVC, maybe it got deleted
// or the PVC was prebound to a PV that hasn't been created yet.
// Fall back to using the StorageClass for volume counting.
return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
}
csiSource := pv.Spec.PersistentVolumeSource.CSI
if csiSource == nil {
// We make a fast path for non-CSI volumes that aren't migratable
if !pl.translator.IsPVMigratable(pv) {
return "", ""
}
pluginName, err := pl.translator.GetInTreePluginNameFromSpec(pv, nil)
if err != nil {
logger.V(5).Info("Unable to look up plugin name from PV spec", "err", err)
return "", ""
}
if !isCSIMigrationOn(csiNode, pluginName) {
logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
return "", ""
}
csiPV, err := pl.translator.TranslateInTreePVToCSI(logger, pv)
if err != nil {
logger.V(5).Info("Unable to translate in-tree volume to CSI", "err", err)
return "", ""
}
if csiPV.Spec.PersistentVolumeSource.CSI == nil {
logger.V(5).Info("Unable to get a valid volume source for translated PV", "PV", pvName)
return "", ""
}
csiSource = csiPV.Spec.PersistentVolumeSource.CSI
}
return csiSource.Driver, csiSource.VolumeHandle
}
// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
namespace := pvc.Namespace
pvcName := pvc.Name
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
// If StorageClass is not set or not found, then PVC must be using immediate binding mode
// and hence it must be bound before scheduling. So it is safe to not count it.
if scName == "" {
logger.V(5).Info("PVC has no StorageClass", "PVC", klog.KObj(pvc))
return "", ""
}
storageClass, err := pl.scLister.Get(scName)
if err != nil {
logger.V(5).Info("Could not get StorageClass for PVC", "PVC", klog.KObj(pvc), "err", err)
return "", ""
}
// We use a random prefix to avoid conflicts with volume IDs. If the PVC is bound during the execution of the
// predicate and there is another pod on the same node that uses the same volume, then we will over-count
// the volume and consider both volumes as different.
volumeHandle := fmt.Sprintf("%s-%s/%s", pl.randomVolumeIDPrefix, namespace, pvcName)
provisioner := storageClass.Provisioner
if pl.translator.IsMigratableIntreePluginByName(provisioner) {
if !isCSIMigrationOn(csiNode, provisioner) {
logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
return "", ""
}
driverName, err := pl.translator.GetCSINameFromInTreeName(provisioner)
if err != nil {
logger.V(5).Info("Unable to look up driver name from provisioner name", "provisioner", provisioner, "err", err)
return "", ""
}
return driverName, volumeHandle
}
return provisioner, volumeHandle
}
// NewCSI initializes a new plugin and returns it.
func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister()
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
vaLister := informerFactory.Storage().V1().VolumeAttachments().Lister()
csiTranslator := csitrans.New()
return &CSILimits{
csiNodeLister: csiNodesLister,
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
vaLister: vaLister,
randomVolumeIDPrefix: rand.String(32),
translator: csiTranslator,
}, nil
}
// getVolumeLimits reads the volume limits from CSINode object and returns a map of volume limits.
// The key is the driver name and the value is the maximum number of volumes that can be attached to the node.
// If a key is not found in the map, it means there is no limit for the driver on the node.
func getVolumeLimits(csiNode *storagev1.CSINode) map[string]int64 {
nodeVolumeLimits := make(map[string]int64)
if csiNode == nil {
return nodeVolumeLimits
}
for _, d := range csiNode.Spec.Drivers {
if d.Allocatable != nil && d.Allocatable.Count != nil {
nodeVolumeLimits[d.Name] = int64(*d.Allocatable.Count)
}
}
return nodeVolumeLimits
}
// getNodeVolumeAttachmentInfo returns a map of volumeID to driver name for the given node.
func (pl *CSILimits) getNodeVolumeAttachmentInfo(logger klog.Logger, nodeName string) (map[string]string, error) {
volumeAttachments := make(map[string]string)
vas, err := pl.vaLister.List(labels.Everything())
if err != nil {
return nil, err
}
for _, va := range vas {
if va.Spec.NodeName == nodeName {
if va.Spec.Attacher == "" {
logger.V(5).Info("VolumeAttachment has no attacher", "VolumeAttachment", klog.KObj(va))
continue
}
if va.Spec.Source.PersistentVolumeName == nil {
logger.V(5).Info("VolumeAttachment has no PV name", "VolumeAttachment", klog.KObj(va))
continue
}
pv, err := pl.pvLister.Get(*va.Spec.Source.PersistentVolumeName)
if err != nil {
logger.V(5).Info("Unable to get PV for VolumeAttachment", "VolumeAttachment", klog.KObj(va), "err", err)
continue
}
if pv.Spec.CSI == nil {
logger.V(5).Info("PV is not a CSI volume", "PV", klog.KObj(pv))
continue
}
volumeID := getVolumeUniqueName(va.Spec.Attacher, pv.Spec.CSI.VolumeHandle)
volumeAttachments[volumeID] = va.Spec.Attacher
}
}
return volumeAttachments, nil
}
func getVolumeUniqueName(driverName, volumeHandle string) string {
return fmt.Sprintf("%s/%s", driverName, volumeHandle)
}
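For intuition, the per-driver admission check that Filter performs above boils down to comparing attached-plus-requested counts against the CSINode limits. A minimal sketch of that arithmetic (not part of the vendored file; the driver name and numbers are made up):

package main

import "fmt"

// exceedsLimit mirrors the counting in CSILimits.Filter: for every driver the new
// pod needs, the already-attached count plus the requested count must stay within
// the node's reported limit (drivers without a limit are unconstrained).
func exceedsLimit(limits map[string]int64, attached, requested map[string]int) bool {
    for driver, n := range requested {
        if limit, ok := limits[driver]; ok && attached[driver]+n > int(limit) {
            return true
        }
    }
    return false
}

func main() {
    limits := map[string]int64{"ebs.csi.example.com": 25} // hypothetical CSINode allocatable count
    attached := map[string]int{"ebs.csi.example.com": 24}
    requested := map[string]int{"ebs.csi.example.com": 2}
    fmt.Println(exceedsLimit(limits, attached, requested)) // true -> ErrReasonMaxVolumeCountExceeded
}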

View File

@ -0,0 +1,73 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package nodevolumelimits
import (
"strings"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
"k8s.io/apimachinery/pkg/util/sets"
utilfeature "k8s.io/apiserver/pkg/util/feature"
csilibplugins "k8s.io/csi-translation-lib/plugins"
"k8s.io/kubernetes/pkg/features"
)
// isCSIMigrationOn returns a boolean value indicating whether
// the CSI migration has been enabled for a particular storage plugin.
func isCSIMigrationOn(csiNode *storagev1.CSINode, pluginName string) bool {
if csiNode == nil || len(pluginName) == 0 {
return false
}
// In-tree storage to CSI driver migration feature should be enabled,
// along with the plugin-specific one
switch pluginName {
case csilibplugins.AWSEBSInTreePluginName:
return true
case csilibplugins.PortworxVolumePluginName:
if !utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) {
return false
}
case csilibplugins.GCEPDInTreePluginName:
return true
case csilibplugins.AzureDiskInTreePluginName:
return true
case csilibplugins.CinderInTreePluginName:
return true
default:
return false
}
// The plugin name should be listed in the CSINode object annotation.
// This indicates that the plugin has been migrated to a CSI driver in the node.
csiNodeAnn := csiNode.GetAnnotations()
if csiNodeAnn == nil {
return false
}
var mpaSet sets.Set[string]
mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
if len(mpa) == 0 {
mpaSet = sets.New[string]()
} else {
tok := strings.Split(mpa, ",")
mpaSet = sets.New(tok...)
}
return mpaSet.Has(pluginName)
}
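As a small usage sketch (not part of the vendored file; the annotation value and plugin names are hypothetical examples), the annotation handling above reduces to splitting a comma-separated list into a set and checking membership:

package main

import (
    "fmt"
    "strings"

    "k8s.io/apimachinery/pkg/util/sets"
)

func main() {
    // Hypothetical value of the v1.MigratedPluginsAnnotationKey annotation on a CSINode.
    mpa := "kubernetes.io/aws-ebs,kubernetes.io/gce-pd"
    mpaSet := sets.New(strings.Split(mpa, ",")...)
    fmt.Println(mpaSet.Has("kubernetes.io/gce-pd"))          // true: treated as migrated on this node
    fmt.Println(mpaSet.Has("kubernetes.io/portworx-volume")) // false: not migrated on this node
}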

View File

@ -0,0 +1,174 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/utils/ptr"
)
type topologyPair struct {
key string
value string
}
// topologySpreadConstraint is an internal representation of v1.TopologySpreadConstraint
// in which the selector is already parsed.
// Fields are exported for comparison during testing.
type topologySpreadConstraint struct {
MaxSkew int32
TopologyKey string
Selector labels.Selector
MinDomains int32
NodeAffinityPolicy v1.NodeInclusionPolicy
NodeTaintsPolicy v1.NodeInclusionPolicy
}
func (tsc *topologySpreadConstraint) matchNodeInclusionPolicies(pod *v1.Pod, node *v1.Node, require nodeaffinity.RequiredNodeAffinity) bool {
if tsc.NodeAffinityPolicy == v1.NodeInclusionPolicyHonor {
// We ignore parsing errors here for backwards compatibility.
if match, _ := require.Match(node); !match {
return false
}
}
if tsc.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
if _, untolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc()); untolerated {
return false
}
}
return true
}
// buildDefaultConstraints builds the constraints for a pod using
// .DefaultConstraints and the selectors from the services, replication
// controllers, replica sets and stateful sets that match the pod.
func (pl *PodTopologySpread) buildDefaultConstraints(p *v1.Pod, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
constraints, err := pl.filterTopologySpreadConstraints(pl.defaultConstraints, p.Labels, action)
if err != nil || len(constraints) == 0 {
return nil, err
}
selector := helper.DefaultSelector(p, pl.services, pl.replicationCtrls, pl.replicaSets, pl.statefulSets)
if selector.Empty() {
return nil, nil
}
for i := range constraints {
constraints[i].Selector = selector
}
return constraints, nil
}
// nodeLabelsMatchSpreadConstraints checks if ALL topology keys in spread Constraints are present in node labels.
func nodeLabelsMatchSpreadConstraints(nodeLabels map[string]string, constraints []topologySpreadConstraint) bool {
for _, c := range constraints {
if _, ok := nodeLabels[c.TopologyKey]; !ok {
return false
}
}
return true
}
func (pl *PodTopologySpread) filterTopologySpreadConstraints(constraints []v1.TopologySpreadConstraint, podLabels map[string]string, action v1.UnsatisfiableConstraintAction) ([]topologySpreadConstraint, error) {
var result []topologySpreadConstraint
for _, c := range constraints {
if c.WhenUnsatisfiable == action {
selector, err := metav1.LabelSelectorAsSelector(c.LabelSelector)
if err != nil {
return nil, err
}
if pl.enableMatchLabelKeysInPodTopologySpread && len(c.MatchLabelKeys) > 0 {
matchLabels := make(labels.Set)
for _, labelKey := range c.MatchLabelKeys {
if value, ok := podLabels[labelKey]; ok {
matchLabels[labelKey] = value
}
}
if len(matchLabels) > 0 {
selector = mergeLabelSetWithSelector(matchLabels, selector)
}
}
tsc := topologySpreadConstraint{
MaxSkew: c.MaxSkew,
TopologyKey: c.TopologyKey,
Selector: selector,
MinDomains: ptr.Deref(c.MinDomains, 1), // If MinDomains is nil, we treat MinDomains as 1.
NodeAffinityPolicy: v1.NodeInclusionPolicyHonor, // If NodeAffinityPolicy is nil, we treat NodeAffinityPolicy as "Honor".
NodeTaintsPolicy: v1.NodeInclusionPolicyIgnore, // If NodeTaintsPolicy is nil, we treat NodeTaintsPolicy as "Ignore".
}
if pl.enableNodeInclusionPolicyInPodTopologySpread {
if c.NodeAffinityPolicy != nil {
tsc.NodeAffinityPolicy = *c.NodeAffinityPolicy
}
if c.NodeTaintsPolicy != nil {
tsc.NodeTaintsPolicy = *c.NodeTaintsPolicy
}
}
result = append(result, tsc)
}
}
return result, nil
}
func mergeLabelSetWithSelector(matchLabels labels.Set, s labels.Selector) labels.Selector {
mergedSelector := labels.SelectorFromSet(matchLabels)
requirements, ok := s.Requirements()
if !ok {
return s
}
for _, r := range requirements {
mergedSelector = mergedSelector.Add(r)
}
return mergedSelector
}
func countPodsMatchSelector(podInfos []*framework.PodInfo, selector labels.Selector, ns string) int {
if selector.Empty() {
return 0
}
count := 0
for _, p := range podInfos {
// Bypass terminating Pod (see #87621).
if p.Pod.DeletionTimestamp != nil || p.Pod.Namespace != ns {
continue
}
if selector.Matches(labels.Set(p.Pod.Labels)) {
count++
}
}
return count
}
// podLabelsMatchSpreadConstraints returns whether the labels match the selector of any topologySpreadConstraint.
func podLabelsMatchSpreadConstraints(constraints []topologySpreadConstraint, labels labels.Set) bool {
for _, c := range constraints {
if c.Selector.Matches(labels) {
return true
}
}
return false
}
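To make the matchLabelKeys handling above concrete, here is a minimal sketch (not part of the vendored file; the selectors and labels are invented) of how mergeLabelSetWithSelector narrows a constraint's selector:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/labels"
)

func main() {
    // Selector taken from the constraint's labelSelector (hypothetical).
    base, _ := labels.Parse("app=web")
    // Labels copied from the incoming pod for its matchLabelKeys (hypothetical).
    matchLabels := labels.Set{"pod-template-hash": "abc123"}

    // Same merge as mergeLabelSetWithSelector: start from the matchLabels set and
    // add every requirement of the original selector.
    merged := labels.SelectorFromSet(matchLabels)
    if reqs, ok := base.Requirements(); ok {
        for _, r := range reqs {
            merged = merged.Add(r)
        }
    }

    fmt.Println(merged.Matches(labels.Set{"app": "web", "pod-template-hash": "abc123"})) // true
    fmt.Println(merged.Matches(labels.Set{"app": "web"}))                                // false
}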

View File

@ -0,0 +1,371 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
"math"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const preFilterStateKey = "PreFilter" + Name
// preFilterState computed at PreFilter and used at Filter.
// It combines TpKeyToCriticalPaths and TpPairToMatchNum to represent:
// (1) critical paths where the fewest pods are matched on each spread constraint.
// (2) the number of pods matched on each spread constraint.
// A nil preFilterState denotes it's not set at all (in PreFilter phase);
// An empty preFilterState object denotes it's a legit state and is set in PreFilter phase.
// Fields are exported for comparison during testing.
type preFilterState struct {
Constraints []topologySpreadConstraint
// We record 2 critical paths instead of all critical paths here.
// criticalPaths[0].MatchNum always holds the minimum matching number.
// criticalPaths[1].MatchNum is always greater or equal to criticalPaths[0].MatchNum, but
// it's not guaranteed to be the 2nd minimum match number.
TpKeyToCriticalPaths map[string]*criticalPaths
// TpKeyToDomainsNum is keyed with topologyKey, and valued with the number of domains.
TpKeyToDomainsNum map[string]int
// TpPairToMatchNum is keyed with topologyPair, and valued with the number of matching pods.
TpPairToMatchNum map[topologyPair]int
}
// minMatchNum returns the global minimum for the calculation of skew while taking MinDomains into account.
func (s *preFilterState) minMatchNum(tpKey string, minDomains int32) (int, error) {
paths, ok := s.TpKeyToCriticalPaths[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve path by topology key")
}
minMatchNum := paths[0].MatchNum
domainsNum, ok := s.TpKeyToDomainsNum[tpKey]
if !ok {
return 0, fmt.Errorf("failed to retrieve the number of domains by topology key")
}
if domainsNum < int(minDomains) {
// When the number of eligible domains with matching topology keys is less than `minDomains`,
// it treats "global minimum" as 0.
minMatchNum = 0
}
return minMatchNum, nil
}
// Clone makes a copy of the given state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
copy := preFilterState{
// Constraints are shared because they don't change.
Constraints: s.Constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(s.TpKeyToCriticalPaths)),
// The number of domains does not change as a result of AddPod/RemovePod methods on PreFilter Extensions
TpKeyToDomainsNum: s.TpKeyToDomainsNum,
TpPairToMatchNum: make(map[topologyPair]int, len(s.TpPairToMatchNum)),
}
for tpKey, paths := range s.TpKeyToCriticalPaths {
copy.TpKeyToCriticalPaths[tpKey] = &criticalPaths{paths[0], paths[1]}
}
for tpPair, matchNum := range s.TpPairToMatchNum {
copy.TpPairToMatchNum[tpPair] = matchNum
}
return &copy
}
// CAVEAT: the reason that `[2]criticalPath` can work is based on the implementation of current
// preemption algorithm, in particular the following 2 facts:
// Fact 1: we only preempt pods on the same node, instead of pods on multiple nodes.
// Fact 2: each node is evaluated on a separate copy of the preFilterState during its preemption cycle.
// If we plan to turn to a more complex algorithm like "arbitrary pods on multiple nodes", this
// structure needs to be revisited.
// Fields are exported for comparison during testing.
type criticalPaths [2]struct {
// TopologyValue denotes the topology value mapping to topology key.
TopologyValue string
// MatchNum denotes the number of matching pods.
MatchNum int
}
func newCriticalPaths() *criticalPaths {
return &criticalPaths{{MatchNum: math.MaxInt32}, {MatchNum: math.MaxInt32}}
}
func (p *criticalPaths) update(tpVal string, num int) {
// first verify if `tpVal` exists or not
i := -1
if tpVal == p[0].TopologyValue {
i = 0
} else if tpVal == p[1].TopologyValue {
i = 1
}
if i >= 0 {
// `tpVal` exists
p[i].MatchNum = num
if p[0].MatchNum > p[1].MatchNum {
// swap paths[0] and paths[1]
p[0], p[1] = p[1], p[0]
}
} else {
// `tpVal` doesn't exist
if num < p[0].MatchNum {
// update paths[1] with paths[0]
p[1] = p[0]
// update paths[0]
p[0].TopologyValue, p[0].MatchNum = tpVal, num
} else if num < p[1].MatchNum {
// update paths[1]
p[1].TopologyValue, p[1].MatchNum = tpVal, num
}
}
}
// PreFilter invoked at the prefilter extension point.
func (pl *PodTopologySpread) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
s, err := pl.calPreFilterState(ctx, pod)
if err != nil {
return nil, framework.AsStatus(err)
} else if s != nil && len(s.Constraints) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *PodTopologySpread) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// AddPod from pre-computed data in cycleState.
func (pl *PodTopologySpread) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
pl.updateWithPod(s, podInfoToAdd.Pod, podToSchedule, nodeInfo.Node(), 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *PodTopologySpread) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
pl.updateWithPod(s, podInfoToRemove.Pod, podToSchedule, nodeInfo.Node(), -1)
return nil
}
func (pl *PodTopologySpread) updateWithPod(s *preFilterState, updatedPod, preemptorPod *v1.Pod, node *v1.Node, delta int) {
if s == nil || updatedPod.Namespace != preemptorPod.Namespace || node == nil {
return
}
if !nodeLabelsMatchSpreadConstraints(node.Labels, s.Constraints) {
return
}
requiredSchedulingTerm := nodeaffinity.GetRequiredNodeAffinity(preemptorPod)
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// spreading is applied to nodes that pass those filters.
// Ignore parsing errors for backwards compatibility.
if match, _ := requiredSchedulingTerm.Match(node); !match {
return
}
}
podLabelSet := labels.Set(updatedPod.Labels)
for _, constraint := range s.Constraints {
if !constraint.Selector.Matches(podLabelSet) {
continue
}
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!constraint.matchNodeInclusionPolicies(preemptorPod, node, requiredSchedulingTerm) {
continue
}
k, v := constraint.TopologyKey, node.Labels[constraint.TopologyKey]
pair := topologyPair{key: k, value: v}
s.TpPairToMatchNum[pair] += delta
s.TpKeyToCriticalPaths[k].update(v, s.TpPairToMatchNum[pair])
}
}
// getPreFilterState fetches a pre-computed preFilterState.
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("reading %q from cycleState: %w", preFilterStateKey, err)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to podtopologyspread.preFilterState error", c)
}
return s, nil
}
// calPreFilterState computes preFilterState describing how pods are spread on topologies.
func (pl *PodTopologySpread) calPreFilterState(ctx context.Context, pod *v1.Pod) (*preFilterState, error) {
constraints, err := pl.getConstraints(pod)
if err != nil {
return nil, fmt.Errorf("get constraints from pod: %w", err)
}
if len(constraints) == 0 {
return &preFilterState{}, nil
}
allNodes, err := pl.sharedLister.NodeInfos().List()
if err != nil {
return nil, fmt.Errorf("listing NodeInfos: %w", err)
}
s := preFilterState{
Constraints: constraints,
TpKeyToCriticalPaths: make(map[string]*criticalPaths, len(constraints)),
TpPairToMatchNum: make(map[topologyPair]int, sizeHeuristic(len(allNodes), constraints)),
}
tpCountsByNode := make([]map[topologyPair]int, len(allNodes))
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// spreading is applied to nodes that pass those filters.
// Ignore parsing errors for backwards compatibility.
if match, _ := requiredNodeAffinity.Match(node); !match {
return
}
}
// Ensure current node's labels contains all topologyKeys in 'Constraints'.
if !nodeLabelsMatchSpreadConstraints(node.Labels, constraints) {
return
}
tpCounts := make(map[topologyPair]int, len(constraints))
for _, c := range constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
tpCounts[pair] = count
}
tpCountsByNode[i] = tpCounts
}
pl.parallelizer.Until(ctx, len(allNodes), processNode, pl.Name())
for _, tpCounts := range tpCountsByNode {
for tp, count := range tpCounts {
s.TpPairToMatchNum[tp] += count
}
}
s.TpKeyToDomainsNum = make(map[string]int, len(constraints))
for tp := range s.TpPairToMatchNum {
s.TpKeyToDomainsNum[tp.key]++
}
// calculate min match for each topology pair
for i := 0; i < len(constraints); i++ {
key := constraints[i].TopologyKey
s.TpKeyToCriticalPaths[key] = newCriticalPaths()
}
for pair, num := range s.TpPairToMatchNum {
s.TpKeyToCriticalPaths[pair.key].update(pair.value, num)
}
return &s, nil
}
// Filter invoked at the filter extension point.
func (pl *PodTopologySpread) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
// However, "empty" preFilterState is legit which tolerates every toSchedule Pod.
if len(s.Constraints) == 0 {
return nil
}
logger := klog.FromContext(ctx)
podLabelSet := labels.Set(pod.Labels)
for _, c := range s.Constraints {
tpKey := c.TopologyKey
tpVal, ok := node.Labels[c.TopologyKey]
if !ok {
logger.V(5).Info("Node doesn't have required label", "node", klog.KObj(node), "label", tpKey)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonNodeLabelNotMatch)
}
// judging criteria:
// 'existing matching num' + 'if self-match (1 or 0)' - 'global minimum' <= 'maxSkew'
minMatchNum, err := s.minMatchNum(tpKey, c.MinDomains)
if err != nil {
logger.Error(err, "Internal error occurred while retrieving value precalculated in PreFilter", "topologyKey", tpKey, "paths", s.TpKeyToCriticalPaths)
continue
}
selfMatchNum := 0
if c.Selector.Matches(podLabelSet) {
selfMatchNum = 1
}
pair := topologyPair{key: tpKey, value: tpVal}
matchNum := 0
if tpCount, ok := s.TpPairToMatchNum[pair]; ok {
matchNum = tpCount
}
skew := matchNum + selfMatchNum - minMatchNum
if skew > int(c.MaxSkew) {
logger.V(5).Info("Node failed spreadConstraint: matchNum + selfMatchNum - minMatchNum > maxSkew", "node", klog.KObj(node), "topologyKey", tpKey, "matchNum", matchNum, "selfMatchNum", selfMatchNum, "minMatchNum", minMatchNum, "maxSkew", c.MaxSkew)
return framework.NewStatus(framework.Unschedulable, ErrReasonConstraintsNotMatch)
}
}
return nil
}
func sizeHeuristic(nodes int, constraints []topologySpreadConstraint) int {
for _, c := range constraints {
if c.TopologyKey == v1.LabelHostname {
return nodes
}
}
return 0
}
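The skew condition checked in Filter above ('existing matching num' + 'self-match (1 or 0)' - 'global minimum' <= maxSkew) is easiest to see on small numbers. A minimal sketch with invented counts:

package main

import "fmt"

// fitsSkew mirrors the per-constraint check in PodTopologySpread.Filter.
func fitsSkew(matchNum, selfMatchNum, minMatchNum, maxSkew int) bool {
    return matchNum+selfMatchNum-minMatchNum <= maxSkew
}

func main() {
    // Hypothetical zone with 3 matching pods, a global minimum of 1, maxSkew of 2,
    // and an incoming pod that matches its own selector.
    fmt.Println(fitsSkew(3, 1, 1, 2)) // false -> nodes in that zone are Unschedulable
    // A node in the minimum zone keeps the spread within maxSkew.
    fmt.Println(fitsSkew(1, 1, 1, 2)) // true
}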

View File

@ -0,0 +1,351 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/informers"
appslisters "k8s.io/client-go/listers/apps/v1"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
// ErrReasonConstraintsNotMatch is used for PodTopologySpread filter error.
ErrReasonConstraintsNotMatch = "node(s) didn't match pod topology spread constraints"
// ErrReasonNodeLabelNotMatch is used when the node doesn't hold the required label.
ErrReasonNodeLabelNotMatch = ErrReasonConstraintsNotMatch + " (missing required label)"
)
var systemDefaultConstraints = []v1.TopologySpreadConstraint{
{
TopologyKey: v1.LabelHostname,
WhenUnsatisfiable: v1.ScheduleAnyway,
MaxSkew: 3,
},
{
TopologyKey: v1.LabelTopologyZone,
WhenUnsatisfiable: v1.ScheduleAnyway,
MaxSkew: 5,
},
}
// PodTopologySpread is a plugin that ensures pod's topologySpreadConstraints is satisfied.
type PodTopologySpread struct {
systemDefaulted bool
parallelizer parallelize.Parallelizer
defaultConstraints []v1.TopologySpreadConstraint
sharedLister framework.SharedLister
services corelisters.ServiceLister
replicationCtrls corelisters.ReplicationControllerLister
replicaSets appslisters.ReplicaSetLister
statefulSets appslisters.StatefulSetLister
enableNodeInclusionPolicyInPodTopologySpread bool
enableMatchLabelKeysInPodTopologySpread bool
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &PodTopologySpread{}
var _ framework.FilterPlugin = &PodTopologySpread{}
var _ framework.PreScorePlugin = &PodTopologySpread{}
var _ framework.ScorePlugin = &PodTopologySpread{}
var _ framework.EnqueueExtensions = &PodTopologySpread{}
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.PodTopologySpread
// Name returns name of the plugin. It is used in logs, etc.
func (pl *PodTopologySpread) Name() string {
return Name
}
// New initializes a new plugin and returns it.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
if h.SnapshotSharedLister() == nil {
return nil, fmt.Errorf("SnapshotSharedlister is nil")
}
args, err := getArgs(plArgs)
if err != nil {
return nil, err
}
if err := validation.ValidatePodTopologySpreadArgs(nil, &args); err != nil {
return nil, err
}
pl := &PodTopologySpread{
parallelizer: h.Parallelizer(),
sharedLister: h.SnapshotSharedLister(),
defaultConstraints: args.DefaultConstraints,
enableNodeInclusionPolicyInPodTopologySpread: fts.EnableNodeInclusionPolicyInPodTopologySpread,
enableMatchLabelKeysInPodTopologySpread: fts.EnableMatchLabelKeysInPodTopologySpread,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}
if args.DefaultingType == config.SystemDefaulting {
pl.defaultConstraints = systemDefaultConstraints
pl.systemDefaulted = true
}
if len(pl.defaultConstraints) != 0 {
if h.SharedInformerFactory() == nil {
return nil, fmt.Errorf("SharedInformerFactory is nil")
}
pl.setListers(h.SharedInformerFactory())
}
return pl, nil
}
func getArgs(obj runtime.Object) (config.PodTopologySpreadArgs, error) {
ptr, ok := obj.(*config.PodTopologySpreadArgs)
if !ok {
return config.PodTopologySpreadArgs{}, fmt.Errorf("want args to be of type PodTopologySpreadArgs, got %T", obj)
}
return *ptr, nil
}
func (pl *PodTopologySpread) setListers(factory informers.SharedInformerFactory) {
pl.services = factory.Core().V1().Services().Lister()
pl.replicationCtrls = factory.Core().V1().ReplicationControllers().Lister()
pl.replicaSets = factory.Apps().V1().ReplicaSets().Lister()
pl.statefulSets = factory.Apps().V1().StatefulSets().Lister()
}
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
func (pl *PodTopologySpread) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
podActionType := framework.Add | framework.UpdatePodLabel | framework.Delete
if pl.enableSchedulingQueueHint {
// When the QueueingHint feature is enabled, the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
// (If not, the scheduling queue always retries the unschedulable Pods when they're updated.)
//
// The Pod rejected by this plugin can be schedulable when the Pod has a spread constraint with NodeTaintsPolicy:Honor
// and has got a new toleration.
// So, we add UpdatePodTolerations here only when QHint is enabled.
podActionType = framework.Add | framework.UpdatePodLabel | framework.UpdatePodTolerations | framework.Delete
}
return []framework.ClusterEventWithHint{
// All ActionType includes the following events:
// - Add. An unschedulable Pod may fail due to violating topology spread constraints,
// adding an assigned Pod may make it schedulable.
// - UpdatePodLabel. Updating on an existing Pod's labels (e.g., removal) may make
// an unschedulable Pod schedulable.
// - Delete. An unschedulable Pod may fail due to violating an existing Pod's topology spread constraints,
// deleting an existing Pod may make it schedulable.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: podActionType}, QueueingHintFn: pl.isSchedulableAfterPodChange},
// A Node add/delete/update may change a topology key or its value,
// and make a Pod waiting for scheduling schedulable or unschedulable.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.Delete | framework.UpdateNodeLabel | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
}, nil
}
// involvedInTopologySpreading returns true if the incomingPod is involved in the topology spreading of podWithSpreading.
func involvedInTopologySpreading(incomingPod, podWithSpreading *v1.Pod) bool {
return incomingPod.UID == podWithSpreading.UID ||
(incomingPod.Spec.NodeName != "" && incomingPod.Namespace == podWithSpreading.Namespace)
}
// hasConstraintWithNodeTaintsPolicyHonor returns true if any constraint has `NodeTaintsPolicy: Honor`.
func hasConstraintWithNodeTaintsPolicyHonor(constraints []topologySpreadConstraint) bool {
for _, c := range constraints {
if c.NodeTaintsPolicy == v1.NodeInclusionPolicyHonor {
return true
}
}
return false
}
func (pl *PodTopologySpread) isSchedulableAfterPodChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPod, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if (modifiedPod != nil && !involvedInTopologySpreading(modifiedPod, pod)) || (originalPod != nil && !involvedInTopologySpreading(originalPod, pod)) {
logger.V(5).Info("the added/updated/deleted pod is unscheduled or has different namespace with target pod, so it doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "originalPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
constraints, err := pl.getConstraints(pod)
if err != nil {
return framework.Queue, err
}
// Pod is modified. Return Queue when the label(s) matching topologySpread's selector is added, changed, or deleted.
if modifiedPod != nil && originalPod != nil {
if pod.UID == modifiedPod.UID && !equality.Semantic.DeepEqual(modifiedPod.Spec.Tolerations, originalPod.Spec.Tolerations) && hasConstraintWithNodeTaintsPolicyHonor(constraints) {
// If any constraint has `NodeTaintsPolicy: Honor`, we can return Queue when the target Pod has got a new toleration.
logger.V(5).Info("the unschedulable pod has got a new toleration, which could make it schedulable",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
if equality.Semantic.DeepEqual(modifiedPod.Labels, originalPod.Labels) {
logger.V(5).Info("the pod's update doesn't include the label update, which doesn't make the target pod schedulable",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
for _, c := range constraints {
if c.Selector.Matches(labels.Set(originalPod.Labels)) != c.Selector.Matches(labels.Set(modifiedPod.Labels)) {
// This modification makes this Pod match(or not match) with this constraint.
// Maybe now the scheduling result of topology spread gets changed by this change.
logger.V(5).Info("a scheduled pod's label was updated and it makes the updated pod match or unmatch the pod's topology spread constraints",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
}
// This label modification doesn't change whether this Pod matches the selector of any constraint.
logger.V(5).Info("a scheduled pod's label was updated, but it's a change unrelated to the pod's topology spread constraints",
"pod", klog.KObj(pod), "modifiedPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is added. Return Queue when the added Pod has a label that matches with topologySpread's selector.
if modifiedPod != nil {
if podLabelsMatchSpreadConstraints(constraints, modifiedPod.Labels) {
logger.V(5).Info("a scheduled pod was created and it matches with the pod's topology spread constraints",
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was created, but it doesn't matches with the pod's topology spread constraints",
"pod", klog.KObj(pod), "createdPod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
// Pod is deleted. Return Queue when the deleted Pod has a label that matches with topologySpread's selector.
if podLabelsMatchSpreadConstraints(constraints, originalPod.Labels) {
logger.V(5).Info("a scheduled pod which matches with the pod's topology spread constraints was deleted, and the pod may be schedulable now",
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.Queue, nil
}
logger.V(5).Info("a scheduled pod was deleted, but it's unrelated to the pod's topology spread constraints",
"pod", klog.KObj(pod), "deletedPod", klog.KObj(originalPod))
return framework.QueueSkip, nil
}
// getConstraints extracts topologySpreadConstraint(s) from the Pod spec.
// If the Pod doesn't have any topologySpreadConstraint, it returns default constraints.
func (pl *PodTopologySpread) getConstraints(pod *v1.Pod) ([]topologySpreadConstraint, error) {
var constraints []topologySpreadConstraint
var err error
if len(pod.Spec.TopologySpreadConstraints) > 0 {
// We have feature gating in APIServer to strip the spec
// so don't need to re-check feature gate, just check length of Constraints.
constraints, err = pl.filterTopologySpreadConstraints(
pod.Spec.TopologySpreadConstraints,
pod.Labels,
v1.DoNotSchedule,
)
if err != nil {
return nil, fmt.Errorf("obtaining pod's hard topology spread constraints: %w", err)
}
} else {
constraints, err = pl.buildDefaultConstraints(pod, v1.DoNotSchedule)
if err != nil {
return nil, fmt.Errorf("setting default hard topology spread constraints: %w", err)
}
}
return constraints, nil
}
// isSchedulableAfterNodeChange returns Queue when node has topologyKey in its labels, else return QueueSkip.
func (pl *PodTopologySpread) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
constraints, err := pl.getConstraints(pod)
if err != nil {
return framework.Queue, err
}
var originalNodeMatching, modifiedNodeMatching bool
if originalNode != nil {
originalNodeMatching = nodeLabelsMatchSpreadConstraints(originalNode.Labels, constraints)
}
if modifiedNode != nil {
modifiedNodeMatching = nodeLabelsMatchSpreadConstraints(modifiedNode.Labels, constraints)
}
// We return Queue in the following cases:
// 1. Node/UpdateNodeLabel:
// - The original node matched the pod's topology spread constraints, but the modified node does not.
// - The modified node matches the pod's topology spread constraints, but the original node does not.
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different label values for any topologyKey.
// 2. Node/UpdateNodeTaint:
// - The modified node matches the pod's topology spread constraints, and the original node and the modified node have different taints.
// 3. Node/Add: The created node matches the pod's topology spread constraints.
// 4. Node/Delete: The original node matched the pod's topology spread constraints.
if originalNode != nil && modifiedNode != nil {
if originalNodeMatching != modifiedNodeMatching {
logger.V(5).Info("the node is updated and now pod topology spread constraints has changed, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode), "originalMatching", originalNodeMatching, "newMatching", modifiedNodeMatching)
return framework.Queue, nil
}
if modifiedNodeMatching && (checkTopologyKeyLabelsChanged(originalNode.Labels, modifiedNode.Labels, constraints) || !equality.Semantic.DeepEqual(originalNode.Spec.Taints, modifiedNode.Spec.Taints)) {
logger.V(5).Info("the node is updated and now has different taints or labels, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
return framework.QueueSkip, nil
}
if modifiedNode != nil {
if !modifiedNodeMatching {
logger.V(5).Info("the created node doesn't match pod topology spread constraints",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("the created node matches topology spread constraints, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
if !originalNodeMatching {
logger.V(5).Info("the deleted node doesn't match pod topology spread constraints", "pod", klog.KObj(pod), "node", klog.KObj(originalNode))
return framework.QueueSkip, nil
}
logger.V(5).Info("the deleted node matches topology spread constraints, and the pod may be schedulable now",
"pod", klog.KObj(pod), "node", klog.KObj(originalNode))
return framework.Queue, nil
}
// checkTopologyKeyLabelsChanged checks if any of the labels specified as topologyKey in the constraints have changed.
func checkTopologyKeyLabelsChanged(originalLabels, modifiedLabels map[string]string, constraints []topologySpreadConstraint) bool {
for _, constraint := range constraints {
topologyKey := constraint.TopologyKey
if originalLabels[topologyKey] != modifiedLabels[topologyKey] {
return true
}
}
return false
}
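Aside (not part of the vendored file above): a minimal, self-contained sketch of the topologyKey comparison that checkTopologyKeyLabelsChanged performs. The constraint struct is a simplified stand-in for the plugin's internal topologySpreadConstraint type, and the zone label values are made up.

package main

import "fmt"

// constraint is a simplified stand-in for topologySpreadConstraint.
type constraint struct{ TopologyKey string }

// topologyKeyLabelsChanged mirrors checkTopologyKeyLabelsChanged above.
func topologyKeyLabelsChanged(oldLabels, newLabels map[string]string, constraints []constraint) bool {
	for _, c := range constraints {
		if oldLabels[c.TopologyKey] != newLabels[c.TopologyKey] {
			return true
		}
	}
	return false
}

func main() {
	constraints := []constraint{{TopologyKey: "topology.kubernetes.io/zone"}}
	oldLabels := map[string]string{"topology.kubernetes.io/zone": "zone-a"}
	newLabels := map[string]string{"topology.kubernetes.io/zone": "zone-b"}
	// Prints true: the zone label changed, so the node event is worth re-queueing (Queue).
	fmt.Println(topologyKeyLabelsChanged(oldLabels, newLabels, constraints))
}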

View File

@ -0,0 +1,305 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package podtopologyspread
import (
"context"
"fmt"
"math"
"sync/atomic"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
const preScoreStateKey = "PreScore" + Name
const invalidScore = -1
// preScoreState computed at PreScore and used at Score.
// Fields are exported for comparison during testing.
type preScoreState struct {
Constraints []topologySpreadConstraint
// IgnoredNodes is a set of node names which miss some Constraints[*].topologyKey.
IgnoredNodes sets.Set[string]
// TopologyPairToPodCounts is keyed with topologyPair, and valued with the number of matching pods.
TopologyPairToPodCounts map[topologyPair]*int64
// TopologyNormalizingWeight is the weight we give to the counts per topology.
// This allows the pod counts of smaller topologies to not be watered down by
// bigger ones.
TopologyNormalizingWeight []float64
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// initPreScoreState iterates "filteredNodes" to filter out the nodes which
// don't have required topologyKey(s), and initializes:
// 1) s.TopologyPairToPodCounts: keyed with both eligible topology pair and node names.
// 2) s.IgnoredNodes: the set of nodes that shouldn't be scored.
// 3) s.TopologyNormalizingWeight: The weight to be given to each constraint based on the number of values in a topology.
func (pl *PodTopologySpread) initPreScoreState(s *preScoreState, pod *v1.Pod, filteredNodes []*framework.NodeInfo, requireAllTopologies bool) error {
var err error
if len(pod.Spec.TopologySpreadConstraints) > 0 {
s.Constraints, err = pl.filterTopologySpreadConstraints(
pod.Spec.TopologySpreadConstraints,
pod.Labels,
v1.ScheduleAnyway,
)
if err != nil {
return fmt.Errorf("obtaining pod's soft topology spread constraints: %w", err)
}
} else {
s.Constraints, err = pl.buildDefaultConstraints(pod, v1.ScheduleAnyway)
if err != nil {
return fmt.Errorf("setting default soft topology spread constraints: %w", err)
}
}
if len(s.Constraints) == 0 {
return nil
}
topoSize := make([]int, len(s.Constraints))
for _, node := range filteredNodes {
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Node().Labels, s.Constraints) {
// Nodes which don't have all required topologyKeys present are ignored
// when scoring later.
s.IgnoredNodes.Insert(node.Node().Name)
continue
}
for i, constraint := range s.Constraints {
// per-node counts are calculated during Score.
if constraint.TopologyKey == v1.LabelHostname {
continue
}
pair := topologyPair{key: constraint.TopologyKey, value: node.Node().Labels[constraint.TopologyKey]}
if s.TopologyPairToPodCounts[pair] == nil {
s.TopologyPairToPodCounts[pair] = new(int64)
topoSize[i]++
}
}
}
s.TopologyNormalizingWeight = make([]float64, len(s.Constraints))
for i, c := range s.Constraints {
sz := topoSize[i]
if c.TopologyKey == v1.LabelHostname {
sz = len(filteredNodes) - len(s.IgnoredNodes)
}
s.TopologyNormalizingWeight[i] = topologyNormalizingWeight(sz)
}
return nil
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *PodTopologySpread) PreScore(
ctx context.Context,
cycleState *framework.CycleState,
pod *v1.Pod,
filteredNodes []*framework.NodeInfo,
) *framework.Status {
allNodes, err := pl.sharedLister.NodeInfos().List()
if err != nil {
return framework.AsStatus(fmt.Errorf("getting all nodes: %w", err))
}
if len(allNodes) == 0 {
// No need to score.
return framework.NewStatus(framework.Skip)
}
state := &preScoreState{
IgnoredNodes: sets.New[string](),
TopologyPairToPodCounts: make(map[topologyPair]*int64),
}
// Only require that nodes have all the topology labels if using
// non-system-default spreading rules. This allows nodes that don't have a
// zone label to still have hostname spreading.
requireAllTopologies := len(pod.Spec.TopologySpreadConstraints) > 0 || !pl.systemDefaulted
err = pl.initPreScoreState(state, pod, filteredNodes, requireAllTopologies)
if err != nil {
return framework.AsStatus(fmt.Errorf("calculating preScoreState: %w", err))
}
// return Skip if incoming pod doesn't have soft topology spread Constraints.
if len(state.Constraints) == 0 {
return framework.NewStatus(framework.Skip)
}
// Ignore parsing errors for backwards compatibility.
requiredNodeAffinity := nodeaffinity.GetRequiredNodeAffinity(pod)
processAllNode := func(i int) {
nodeInfo := allNodes[i]
node := nodeInfo.Node()
if !pl.enableNodeInclusionPolicyInPodTopologySpread {
// `node` should satisfy incoming pod's NodeSelector/NodeAffinity
if match, _ := requiredNodeAffinity.Match(node); !match {
return
}
}
// All topologyKeys need to be present in `node`
if requireAllTopologies && !nodeLabelsMatchSpreadConstraints(node.Labels, state.Constraints) {
return
}
for _, c := range state.Constraints {
if pl.enableNodeInclusionPolicyInPodTopologySpread &&
!c.matchNodeInclusionPolicies(pod, node, requiredNodeAffinity) {
continue
}
pair := topologyPair{key: c.TopologyKey, value: node.Labels[c.TopologyKey]}
// If current topology pair is not associated with any candidate node,
// continue to avoid unnecessary calculation.
// Per-node counts are also skipped, as they are done during Score.
tpCount := state.TopologyPairToPodCounts[pair]
if tpCount == nil {
continue
}
count := countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace)
atomic.AddInt64(tpCount, int64(count))
}
}
pl.parallelizer.Until(ctx, len(allNodes), processAllNode, pl.Name())
cycleState.Write(preScoreStateKey, state)
return nil
}
// Score invoked at the Score extension point.
// The "score" returned in this function is the matching number of pods on the `nodeName`,
// it is normalized later.
func (pl *PodTopologySpread) Score(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.sharedLister.NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(cycleState)
if err != nil {
return 0, framework.AsStatus(err)
}
// Return if the node is not qualified.
if s.IgnoredNodes.Has(node.Name) {
return 0, nil
}
// For each present <pair>, current node gets a credit of <matchSum>.
// And we sum up <matchSum> and return it as this node's score.
var score float64
for i, c := range s.Constraints {
if tpVal, ok := node.Labels[c.TopologyKey]; ok {
var cnt int64
if c.TopologyKey == v1.LabelHostname {
cnt = int64(countPodsMatchSelector(nodeInfo.Pods, c.Selector, pod.Namespace))
} else {
pair := topologyPair{key: c.TopologyKey, value: tpVal}
cnt = *s.TopologyPairToPodCounts[pair]
}
score += scoreForCount(cnt, c.MaxSkew, s.TopologyNormalizingWeight[i])
}
}
return int64(math.Round(score)), nil
}
// NormalizeScore invoked after scoring all nodes.
func (pl *PodTopologySpread) NormalizeScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
s, err := getPreScoreState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
if s == nil {
return nil
}
// Calculate <minScore> and <maxScore>
var minScore int64 = math.MaxInt64
var maxScore int64
for i, score := range scores {
// it's mandatory to check if <score.Name> is present in s.IgnoredNodes
if s.IgnoredNodes.Has(score.Name) {
scores[i].Score = invalidScore
continue
}
if score.Score < minScore {
minScore = score.Score
}
if score.Score > maxScore {
maxScore = score.Score
}
}
for i := range scores {
if scores[i].Score == invalidScore {
scores[i].Score = 0
continue
}
if maxScore == 0 {
scores[i].Score = framework.MaxNodeScore
continue
}
s := scores[i].Score
scores[i].Score = framework.MaxNodeScore * (maxScore + minScore - s) / maxScore
}
return nil
}
// ScoreExtensions of the Score plugin.
func (pl *PodTopologySpread) ScoreExtensions() framework.ScoreExtensions {
return pl
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("error reading %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to podtopologyspread.preScoreState error", c)
}
return s, nil
}
// topologyNormalizingWeight calculates the weight for the topology, based on
// the number of values that exist for a topology.
// Since <size> is at least 1 (all nodes that passed the Filters are in the
// same topology), and k8s supports 5k nodes, the result is in the interval
// <1.09, 8.52>.
//
// Note: <size> could also be zero when no nodes have the required topologies,
// however we don't care about topology weight in this case as we return a 0
// score for all nodes.
func topologyNormalizingWeight(size int) float64 {
return math.Log(float64(size + 2))
}
// scoreForCount calculates the score based on number of matching pods in a
// topology domain, the constraint's maxSkew and the topology weight.
// `maxSkew-1` is added to the score so that differences between topology
// domains get watered down, controlling the tolerance of the score to skews.
func scoreForCount(cnt int64, maxSkew int32, tpWeight float64) float64 {
return float64(cnt)*tpWeight + float64(maxSkew-1)
}
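Aside (illustrative numbers, not part of the vendored file): how topologyNormalizingWeight and scoreForCount combine. Smaller topologies get a smaller log-based weight so their pod counts aren't drowned out by larger ones, and maxSkew-1 is added to soften small skews.

package main

import (
	"fmt"
	"math"
)

// Standalone copies of the two helpers above, for an arithmetic check.
func normalizingWeight(size int) float64 { return math.Log(float64(size + 2)) }

func scoreForCount(cnt int64, maxSkew int32, tpWeight float64) float64 {
	return float64(cnt)*tpWeight + float64(maxSkew-1)
}

func main() {
	// Assume 3 zones and 10 filtered nodes, maxSkew=1, 4 matching pods in this
	// node's zone and 1 matching pod on the node itself.
	zoneWeight := normalizingWeight(3)  // ln(5)  ≈ 1.61
	nodeWeight := normalizingWeight(10) // ln(12) ≈ 2.48
	raw := scoreForCount(4, 1, zoneWeight) + scoreForCount(1, 1, nodeWeight)
	fmt.Println(int64(math.Round(raw))) // ≈ 9; NormalizeScore later inverts it
}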

View File

@ -0,0 +1,53 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package queuesort
import (
"context"
"k8s.io/apimachinery/pkg/runtime"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
)
// Name is the name of the plugin used in the plugin registry and configurations.
const Name = names.PrioritySort
// PrioritySort is a plugin that implements Priority based sorting.
type PrioritySort struct{}
var _ framework.QueueSortPlugin = &PrioritySort{}
// Name returns name of the plugin.
func (pl *PrioritySort) Name() string {
return Name
}
// Less is the function used by the activeQ heap algorithm to sort pods.
// It sorts pods based on their priority. When priorities are equal, it uses
// PodQueueInfo.timestamp.
func (pl *PrioritySort) Less(pInfo1, pInfo2 *framework.QueuedPodInfo) bool {
p1 := corev1helpers.PodPriority(pInfo1.Pod)
p2 := corev1helpers.PodPriority(pInfo2.Pod)
return (p1 > p2) || (p1 == p2 && pInfo1.Timestamp.Before(pInfo2.Timestamp))
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle) (framework.Plugin, error) {
return &PrioritySort{}, nil
}
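Aside (plain values, no scheduler types): the ordering rule Less implements, i.e. higher priority first, with the earlier queue timestamp as the tie-breaker.

package main

import (
	"fmt"
	"time"
)

type queuedPod struct {
	priority int32
	enqueued time.Time
}

// less mirrors PrioritySort.Less on simplified inputs.
func less(a, b queuedPod) bool {
	return a.priority > b.priority || (a.priority == b.priority && a.enqueued.Before(b.enqueued))
}

func main() {
	now := time.Now()
	high := queuedPod{priority: 1000, enqueued: now}
	old := queuedPod{priority: 0, enqueued: now.Add(-time.Minute)}
	fmt.Println(less(high, old)) // true: priority beats queue age
}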

View File

@ -0,0 +1,84 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package plugins
import (
"k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources"
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone"
"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)
// NewInTreeRegistry builds the registry with all the in-tree plugins.
// A scheduler that runs out-of-tree plugins can register additional plugins
// through the WithFrameworkOutOfTreeRegistry option.
func NewInTreeRegistry() runtime.Registry {
fts := plfeature.Features{
EnableDRAAdminAccess: feature.DefaultFeatureGate.Enabled(features.DRAAdminAccess),
EnableDynamicResourceAllocation: feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
EnableVolumeCapacityPriority: feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
EnableMatchLabelKeysInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
EnableInPlacePodVerticalScaling: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
EnableSidecarContainers: feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
EnableSchedulingQueueHint: feature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
EnableAsyncPreemption: feature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption),
EnablePodLevelResources: feature.DefaultFeatureGate.Enabled(features.PodLevelResources),
}
registry := runtime.Registry{
dynamicresources.Name: runtime.FactoryAdapter(fts, dynamicresources.New),
imagelocality.Name: imagelocality.New,
tainttoleration.Name: runtime.FactoryAdapter(fts, tainttoleration.New),
nodename.Name: runtime.FactoryAdapter(fts, nodename.New),
nodeports.Name: runtime.FactoryAdapter(fts, nodeports.New),
nodeaffinity.Name: runtime.FactoryAdapter(fts, nodeaffinity.New),
podtopologyspread.Name: runtime.FactoryAdapter(fts, podtopologyspread.New),
nodeunschedulable.Name: runtime.FactoryAdapter(fts, nodeunschedulable.New),
noderesources.Name: runtime.FactoryAdapter(fts, noderesources.NewFit),
noderesources.BalancedAllocationName: runtime.FactoryAdapter(fts, noderesources.NewBalancedAllocation),
volumebinding.Name: runtime.FactoryAdapter(fts, volumebinding.New),
volumerestrictions.Name: runtime.FactoryAdapter(fts, volumerestrictions.New),
volumezone.Name: runtime.FactoryAdapter(fts, volumezone.New),
nodevolumelimits.CSIName: runtime.FactoryAdapter(fts, nodevolumelimits.NewCSI),
interpodaffinity.Name: runtime.FactoryAdapter(fts, interpodaffinity.New),
queuesort.Name: queuesort.New,
defaultbinder.Name: defaultbinder.New,
defaultpreemption.Name: runtime.FactoryAdapter(fts, defaultpreemption.New),
schedulinggates.Name: runtime.FactoryAdapter(fts, schedulinggates.New),
}
return registry
}
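Aside (hedged sketch, hypothetical wrapper): one way out-of-tree plugin factories could be merged with the in-tree registry before the framework is built. The only assumption used here is that runtime.Registry is a map from plugin name to factory, as in the file above; buildRegistry itself is not part of the source.

package example

import (
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins"
	"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)

// buildRegistry starts from the in-tree registry and overlays the caller's
// out-of-tree factories, keyed by plugin name.
func buildRegistry(outOfTree runtime.Registry) runtime.Registry {
	registry := plugins.NewInTreeRegistry()
	for name, factory := range outOfTree {
		registry[name] = factory
	}
	return registry
}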

View File

@ -0,0 +1,94 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package schedulinggates
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Name of the plugin used in the plugin registry and configurations.
const Name = names.SchedulingGates
// SchedulingGates checks if a Pod carries .spec.schedulingGates.
type SchedulingGates struct {
enableSchedulingQueueHint bool
}
var _ framework.PreEnqueuePlugin = &SchedulingGates{}
var _ framework.EnqueueExtensions = &SchedulingGates{}
func (pl *SchedulingGates) Name() string {
return Name
}
func (pl *SchedulingGates) PreEnqueue(ctx context.Context, p *v1.Pod) *framework.Status {
if len(p.Spec.SchedulingGates) == 0 {
return nil
}
gates := make([]string, 0, len(p.Spec.SchedulingGates))
for _, gate := range p.Spec.SchedulingGates {
gates = append(gates, gate.Name)
}
return framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("waiting for scheduling gates: %v", gates))
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *SchedulingGates) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if !pl.enableSchedulingQueueHint {
return nil, nil
}
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
return []framework.ClusterEventWithHint{
// Pods can become schedulable once their scheduling gates are removed
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodSchedulingGatesEliminated}, QueueingHintFn: pl.isSchedulableAfterUpdatePodSchedulingGatesEliminated},
}, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, _ framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &SchedulingGates{
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
func (pl *SchedulingGates) isSchedulableAfterUpdatePodSchedulingGatesEliminated(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if modifiedPod.UID != pod.UID {
// If the update event is not for targetPod, it wouldn't make targetPod schedulable.
return framework.QueueSkip, nil
}
return framework.Queue, nil
}
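Aside (hedged sketch): exercising the PreEnqueue behaviour above with a gated pod. The gate name is made up and error handling is trimmed for brevity.

package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
)

func main() {
	plugin, _ := schedulinggates.New(context.Background(), nil, nil, feature.Features{})
	pl := plugin.(framework.PreEnqueuePlugin)

	pod := &v1.Pod{Spec: v1.PodSpec{
		SchedulingGates: []v1.PodSchedulingGate{{Name: "example.com/provisioning"}},
	}}
	status := pl.PreEnqueue(context.Background(), pod)
	// Expect UnschedulableAndUnresolvable with "waiting for scheduling gates: ...".
	fmt.Println(status.Code(), status.Message())
}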

View File

@ -0,0 +1,236 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package tainttoleration
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
v1helper "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// TaintToleration is a plugin that checks if a pod tolerates a node's taints.
type TaintToleration struct {
handle framework.Handle
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &TaintToleration{}
var _ framework.PreScorePlugin = &TaintToleration{}
var _ framework.ScorePlugin = &TaintToleration{}
var _ framework.EnqueueExtensions = &TaintToleration{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.TaintToleration
// preScoreStateKey is the key in CycleState to TaintToleration pre-computed data for Scoring.
preScoreStateKey = "PreScore" + Name
// ErrReasonNotMatch is the Filter reason status when not matching.
ErrReasonNotMatch = "node(s) had taints that the pod didn't tolerate"
)
// Name returns name of the plugin. It is used in logs, etc.
func (pl *TaintToleration) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *TaintToleration) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
if pl.enableSchedulingQueueHint {
return []framework.ClusterEventWithHint{
// When the QueueingHint feature is enabled, preCheck is eliminated and we don't need additional UpdateNodeLabel.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// When the QueueingHint feature is enabled,
// the scheduling queue uses Pod/Update Queueing Hint
// to determine whether a Pod's update makes the Pod schedulable or not.
// https://github.com/kubernetes/kubernetes/pull/122234
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.UpdatePodTolerations}, QueueingHintFn: pl.isSchedulableAfterPodTolerationChange},
}, nil
}
return []framework.ClusterEventWithHint{
// A note about UpdateNodeLabel event:
// Ideally, it's supposed to register only Add | UpdateNodeTaint because UpdateNodeLabel will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel}, QueueingHintFn: pl.isSchedulableAfterNodeChange},
// No need to register the Pod event; the update to the unschedulable Pods already triggers the scheduling retry when QHint is disabled.
}, nil
}
// isSchedulableAfterNodeChange is invoked for all node events reported by
// an informer. It checks whether that change made a previously unschedulable
// pod schedulable.
func (pl *TaintToleration) isSchedulableAfterNodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalNode, modifiedNode, err := util.As[*v1.Node](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
wasUntolerated := true
if originalNode != nil {
_, wasUntolerated = v1helper.FindMatchingUntoleratedTaint(originalNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
}
_, isUntolerated := v1helper.FindMatchingUntoleratedTaint(modifiedNode.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
if wasUntolerated && !isUntolerated {
logger.V(5).Info("node was created or updated, and this may make the Pod rejected by TaintToleration plugin in the previous scheduling cycle schedulable", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.Queue, nil
}
logger.V(5).Info("node was created or updated, but it doesn't change the TaintToleration plugin's decision", "pod", klog.KObj(pod), "node", klog.KObj(modifiedNode))
return framework.QueueSkip, nil
}
// Filter invoked at the filter extension point.
func (pl *TaintToleration) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
node := nodeInfo.Node()
taint, isUntolerated := v1helper.FindMatchingUntoleratedTaint(node.Spec.Taints, pod.Spec.Tolerations, helper.DoNotScheduleTaintsFilterFunc())
if !isUntolerated {
return nil
}
errReason := fmt.Sprintf("node(s) had untolerated taint {%s: %s}", taint.Key, taint.Value)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, errReason)
}
// preScoreState computed at PreScore and used at Score.
type preScoreState struct {
tolerationsPreferNoSchedule []v1.Toleration
}
// Clone implements the mandatory Clone interface. We don't really copy the data since
// there is no need for that.
func (s *preScoreState) Clone() framework.StateData {
return s
}
// getAllTolerationPreferNoSchedule gets the list of all Tolerations with Effect PreferNoSchedule or with no effect.
func getAllTolerationPreferNoSchedule(tolerations []v1.Toleration) (tolerationList []v1.Toleration) {
for _, toleration := range tolerations {
// Empty effect means all effects which includes PreferNoSchedule, so we need to collect it as well.
if len(toleration.Effect) == 0 || toleration.Effect == v1.TaintEffectPreferNoSchedule {
tolerationList = append(tolerationList, toleration)
}
}
return
}
// PreScore builds and writes cycle state used by Score and NormalizeScore.
func (pl *TaintToleration) PreScore(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if len(nodes) == 0 {
return nil
}
tolerationsPreferNoSchedule := getAllTolerationPreferNoSchedule(pod.Spec.Tolerations)
state := &preScoreState{
tolerationsPreferNoSchedule: tolerationsPreferNoSchedule,
}
cycleState.Write(preScoreStateKey, state)
return nil
}
func getPreScoreState(cycleState *framework.CycleState) (*preScoreState, error) {
c, err := cycleState.Read(preScoreStateKey)
if err != nil {
return nil, fmt.Errorf("failed to read %q from cycleState: %w", preScoreStateKey, err)
}
s, ok := c.(*preScoreState)
if !ok {
return nil, fmt.Errorf("%+v convert to tainttoleration.preScoreState error", c)
}
return s, nil
}
// countIntolerableTaintsPreferNoSchedule gives the count of intolerable taints of a pod with effect PreferNoSchedule
func countIntolerableTaintsPreferNoSchedule(taints []v1.Taint, tolerations []v1.Toleration) (intolerableTaints int) {
for _, taint := range taints {
// check only on taints that have effect PreferNoSchedule
if taint.Effect != v1.TaintEffectPreferNoSchedule {
continue
}
if !v1helper.TolerationsTolerateTaint(tolerations, &taint) {
intolerableTaints++
}
}
return
}
// Score invoked at the Score extension point.
func (pl *TaintToleration) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
nodeInfo, err := pl.handle.SnapshotSharedLister().NodeInfos().Get(nodeName)
if err != nil {
return 0, framework.AsStatus(fmt.Errorf("getting node %q from Snapshot: %w", nodeName, err))
}
node := nodeInfo.Node()
s, err := getPreScoreState(state)
if err != nil {
return 0, framework.AsStatus(err)
}
score := int64(countIntolerableTaintsPreferNoSchedule(node.Spec.Taints, s.tolerationsPreferNoSchedule))
return score, nil
}
// NormalizeScore invoked after scoring all nodes.
func (pl *TaintToleration) NormalizeScore(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, scores framework.NodeScoreList) *framework.Status {
return helper.DefaultNormalizeScore(framework.MaxNodeScore, true, scores)
}
// ScoreExtensions of the Score plugin.
func (pl *TaintToleration) ScoreExtensions() framework.ScoreExtensions {
return pl
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, h framework.Handle, fts feature.Features) (framework.Plugin, error) {
return &TaintToleration{
handle: h,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
// isSchedulableAfterPodTolerationChange is invoked whenever a pod's tolerations change.
func (pl *TaintToleration) isSchedulableAfterPodTolerationChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPod, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if pod.UID == modifiedPod.UID {
// The updated Pod is the unschedulable Pod.
logger.V(5).Info("a new toleration is added for the unschedulable Pod, and it may make it schedulable", "pod", klog.KObj(modifiedPod))
return framework.Queue, nil
}
logger.V(5).Info("a new toleration is added for a Pod, but it's an unrelated Pod and wouldn't change the TaintToleration plugin's decision", "pod", klog.KObj(modifiedPod))
return framework.QueueSkip, nil
}
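Aside (made-up taints and tolerations): the quantity the Score above is built on, i.e. how many PreferNoSchedule taints the pod does not tolerate; fewer intolerable taints means a better node once NormalizeScore reverses the values.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	v1helper "k8s.io/component-helpers/scheduling/corev1"
)

func main() {
	taints := []v1.Taint{
		{Key: "dedicated", Value: "db", Effect: v1.TaintEffectPreferNoSchedule},
		{Key: "spot", Value: "true", Effect: v1.TaintEffectPreferNoSchedule},
	}
	tolerations := []v1.Toleration{
		{Key: "dedicated", Operator: v1.TolerationOpEqual, Value: "db", Effect: v1.TaintEffectPreferNoSchedule},
	}

	intolerable := 0
	for i := range taints {
		if taints[i].Effect != v1.TaintEffectPreferNoSchedule {
			continue
		}
		if !v1helper.TolerationsTolerateTaint(tolerations, &taints[i]) {
			intolerable++
		}
	}
	fmt.Println("intolerable PreferNoSchedule taints:", intolerable) // 1 (the "spot" taint)
}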

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,131 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"fmt"
v1 "k8s.io/api/core/v1"
storagehelpers "k8s.io/component-helpers/storage/volume"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
)
// PVAssumeCache is an AssumeCache for PersistentVolume objects
type PVAssumeCache struct {
*assumecache.AssumeCache
logger klog.Logger
}
func pvStorageClassIndexFunc(obj interface{}) ([]string, error) {
if pv, ok := obj.(*v1.PersistentVolume); ok {
return []string{storagehelpers.GetPersistentVolumeClass(pv)}, nil
}
return []string{""}, fmt.Errorf("object is not a v1.PersistentVolume: %v", obj)
}
// NewPVAssumeCache creates a PV assume cache.
func NewPVAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVAssumeCache {
logger = klog.LoggerWithName(logger, "PV Cache")
return &PVAssumeCache{
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolume", "storageclass", pvStorageClassIndexFunc),
logger: logger,
}
}
func (c *PVAssumeCache) GetPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.Get(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
}
return pv, nil
}
func (c *PVAssumeCache) GetAPIPV(pvName string) (*v1.PersistentVolume, error) {
obj, err := c.GetAPIObj(pvName)
if err != nil {
return nil, err
}
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}
}
return pv, nil
}
func (c *PVAssumeCache) ListPVs(storageClassName string) []*v1.PersistentVolume {
objs := c.List(&v1.PersistentVolume{
Spec: v1.PersistentVolumeSpec{
StorageClassName: storageClassName,
},
})
pvs := []*v1.PersistentVolume{}
for _, obj := range objs {
pv, ok := obj.(*v1.PersistentVolume)
if !ok {
c.logger.Error(&assumecache.WrongTypeError{TypeName: "v1.PersistentVolume", Object: obj}, "ListPVs")
continue
}
pvs = append(pvs, pv)
}
return pvs
}
// PVCAssumeCache is an AssumeCache for PersistentVolumeClaim objects
type PVCAssumeCache struct {
*assumecache.AssumeCache
logger klog.Logger
}
// NewPVCAssumeCache creates a PVC assume cache.
func NewPVCAssumeCache(logger klog.Logger, informer assumecache.Informer) *PVCAssumeCache {
logger = klog.LoggerWithName(logger, "PVC Cache")
return &PVCAssumeCache{
AssumeCache: assumecache.NewAssumeCache(logger, informer, "v1.PersistentVolumeClaim", "", nil),
logger: logger,
}
}
func (c *PVCAssumeCache) GetPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.Get(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
}
return pvc, nil
}
func (c *PVCAssumeCache) GetAPIPVC(pvcKey string) (*v1.PersistentVolumeClaim, error) {
obj, err := c.GetAPIObj(pvcKey)
if err != nil {
return nil, err
}
pvc, ok := obj.(*v1.PersistentVolumeClaim)
if !ok {
return nil, &assumecache.WrongTypeError{TypeName: "v1.PersistentVolumeClaim", Object: obj}
}
return pvc, nil
}
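Aside (hedged sketch): wiring a PVAssumeCache from a client-go shared informer and listing PVs by storage class. The fake clientset, the zero resync period and the "fast-ssd" class name are illustrative, and the sketch assumes the shared index informer satisfies the assumecache.Informer interface used by NewAssumeCache.

package main

import (
	"fmt"

	"k8s.io/client-go/informers"
	"k8s.io/client-go/kubernetes/fake"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
)

func main() {
	client := fake.NewSimpleClientset()
	factory := informers.NewSharedInformerFactory(client, 0)
	pvInformer := factory.Core().V1().PersistentVolumes().Informer()

	cache := volumebinding.NewPVAssumeCache(klog.Background(), pvInformer)
	// Empty here since nothing was added; with real data this lists PVs by class.
	for _, pv := range cache.ListPVs("fast-ssd") {
		fmt.Println(pv.Name)
	}
}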

File diff suppressed because it is too large

View File

@ -0,0 +1,75 @@
/*
Copyright 2017 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"context"
v1 "k8s.io/api/core/v1"
"k8s.io/klog/v2"
)
// FakeVolumeBinderConfig holds configurations for fake volume binder.
type FakeVolumeBinderConfig struct {
AllBound bool
FindReasons ConflictReasons
FindErr error
AssumeErr error
BindErr error
}
// NewFakeVolumeBinder sets up all the caches needed for the scheduler to make
// topology-aware volume binding decisions.
func NewFakeVolumeBinder(config *FakeVolumeBinderConfig) *FakeVolumeBinder {
return &FakeVolumeBinder{
config: config,
}
}
// FakeVolumeBinder represents a fake volume binder for testing.
type FakeVolumeBinder struct {
config *FakeVolumeBinderConfig
AssumeCalled bool
BindCalled bool
}
var _ SchedulerVolumeBinder = &FakeVolumeBinder{}
// GetPodVolumeClaims implements SchedulerVolumeBinder.GetPodVolumeClaims.
func (b *FakeVolumeBinder) GetPodVolumeClaims(_ klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
return &PodVolumeClaims{}, nil
}
// FindPodVolumes implements SchedulerVolumeBinder.FindPodVolumes.
func (b *FakeVolumeBinder) FindPodVolumes(_ klog.Logger, pod *v1.Pod, _ *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
return nil, b.config.FindReasons, b.config.FindErr
}
// AssumePodVolumes implements SchedulerVolumeBinder.AssumePodVolumes.
func (b *FakeVolumeBinder) AssumePodVolumes(_ klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (bool, error) {
b.AssumeCalled = true
return b.config.AllBound, b.config.AssumeErr
}
// RevertAssumedPodVolumes implements SchedulerVolumeBinder.RevertAssumedPodVolumes
func (b *FakeVolumeBinder) RevertAssumedPodVolumes(_ *PodVolumes) {}
// BindPodVolumes implements SchedulerVolumeBinder.BindPodVolumes.
func (b *FakeVolumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error {
b.BindCalled = true
return b.config.BindErr
}
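Aside (hedged test-style sketch): using the fake binder in place of the real SchedulerVolumeBinder. The conflict reason string is illustrative, and nil pods/nodes are acceptable because the fake ignores its inputs.

package main

import (
	"fmt"

	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
)

func main() {
	binder := volumebinding.NewFakeVolumeBinder(&volumebinding.FakeVolumeBinderConfig{
		AllBound:    true,
		FindReasons: volumebinding.ConflictReasons{"example conflict"},
	})

	_, reasons, _ := binder.FindPodVolumes(klog.Background(), nil, nil, nil)
	fmt.Println("conflict reasons:", reasons)

	allBound, _ := binder.AssumePodVolumes(klog.Background(), nil, "node-a", nil)
	fmt.Println("all bound:", allBound, "assume called:", binder.AssumeCalled)
}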

View File

@ -0,0 +1,55 @@
/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
)
// VolumeSchedulerSubsystem - subsystem name used by scheduler
const VolumeSchedulerSubsystem = "scheduler_volume"
var (
// VolumeBindingRequestSchedulerBinderCache tracks the number of volume binder cache operations.
VolumeBindingRequestSchedulerBinderCache = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "binder_cache_requests_total",
Help: "Total number for request volume binding cache",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
// VolumeSchedulingStageFailed tracks the number of failed volume scheduling operations.
VolumeSchedulingStageFailed = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: VolumeSchedulerSubsystem,
Name: "scheduling_stage_error_total",
Help: "Volume scheduling stage error count",
StabilityLevel: metrics.ALPHA,
},
[]string{"operation"},
)
)
// RegisterVolumeSchedulingMetrics is used for scheduler, because the volume binding cache is a library
// used by scheduler process.
func RegisterVolumeSchedulingMetrics() {
legacyregistry.MustRegister(VolumeBindingRequestSchedulerBinderCache)
legacyregistry.MustRegister(VolumeSchedulingStageFailed)
}
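Aside (hedged sketch; the import path is assumed from the surrounding package layout): registering the counters once and bumping the binder cache counter with an illustrative "assume" operation label.

package main

import (
	volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
)

func main() {
	volumebindingmetrics.RegisterVolumeSchedulingMetrics()
	// Count one binder cache request; "assume" is an illustrative operation label.
	volumebindingmetrics.VolumeBindingRequestSchedulerBinderCache.WithLabelValues("assume").Inc()
}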

View File

@ -0,0 +1,54 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"math"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
)
// classResourceMap holds a map of storage class to resource.
type classResourceMap map[string]*StorageResource
// volumeCapacityScorer calculates the score based on class storage resource information.
type volumeCapacityScorer func(classResourceMap) int64
// buildScorerFunction builds volumeCapacityScorer from the scoring function shape.
func buildScorerFunction(scoringFunctionShape helper.FunctionShape) volumeCapacityScorer {
rawScoringFunction := helper.BuildBrokenLinearFunction(scoringFunctionShape)
f := func(requested, capacity int64) int64 {
if capacity == 0 || requested > capacity {
return rawScoringFunction(maxUtilization)
}
return rawScoringFunction(requested * maxUtilization / capacity)
}
return func(classResources classResourceMap) int64 {
var nodeScore int64
// in alpha stage, all classes have the same weight
weightSum := len(classResources)
if weightSum == 0 {
return 0
}
for _, resource := range classResources {
classScore := f(resource.Requested, resource.Capacity)
nodeScore += classScore
}
return int64(math.Round(float64(nodeScore) / float64(weightSum)))
}
}
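Aside (illustrative numbers, with a linear stand-in for the configured shape function): the class-averaging logic of buildScorerFunction. Each storage class is scored by its utilization, then the per-class scores are averaged with equal weight.

package main

import (
	"fmt"
	"math"
)

func main() {
	const maxUtilization, maxScore = 100, 10
	score := func(requested, capacity int64) int64 {
		if capacity == 0 || requested > capacity {
			return maxScore // treated as full utilization
		}
		util := requested * maxUtilization / capacity
		return util * maxScore / maxUtilization // linear stand-in for the shape
	}
	// Two storage classes on this node: 30% and 90% utilized.
	classScores := []int64{score(30, 100), score(90, 100)}
	var sum int64
	for _, s := range classScores {
		sum += s
	}
	fmt.Println(int64(math.Round(float64(sum) / float64(len(classScores))))) // 6
}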

View File

@ -0,0 +1,217 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/component-helpers/storage/volume"
"k8s.io/utils/ptr"
)
type nodeBuilder struct {
*v1.Node
}
func makeNode(name string) nodeBuilder {
return nodeBuilder{Node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Labels: map[string]string{
v1.LabelHostname: name,
},
},
}}
}
func (nb nodeBuilder) withLabel(key, value string) nodeBuilder {
if nb.Node.ObjectMeta.Labels == nil {
nb.Node.ObjectMeta.Labels = map[string]string{}
}
nb.Node.ObjectMeta.Labels[key] = value
return nb
}
type pvBuilder struct {
*v1.PersistentVolume
}
func makePV(name, className string) pvBuilder {
return pvBuilder{PersistentVolume: &v1.PersistentVolume{
ObjectMeta: metav1.ObjectMeta{
Name: name,
},
Spec: v1.PersistentVolumeSpec{
StorageClassName: className,
},
}}
}
func (pvb pvBuilder) withNodeAffinity(keyValues map[string][]string) pvBuilder {
matchExpressions := make([]v1.NodeSelectorRequirement, 0)
for key, values := range keyValues {
matchExpressions = append(matchExpressions, v1.NodeSelectorRequirement{
Key: key,
Operator: v1.NodeSelectorOpIn,
Values: values,
})
}
pvb.PersistentVolume.Spec.NodeAffinity = &v1.VolumeNodeAffinity{
Required: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: matchExpressions,
},
},
},
}
return pvb
}
func (pvb pvBuilder) withVersion(version string) pvBuilder {
pvb.PersistentVolume.ObjectMeta.ResourceVersion = version
return pvb
}
func (pvb pvBuilder) withCapacity(capacity resource.Quantity) pvBuilder {
pvb.PersistentVolume.Spec.Capacity = v1.ResourceList{
v1.ResourceName(v1.ResourceStorage): capacity,
}
return pvb
}
func (pvb pvBuilder) withPhase(phase v1.PersistentVolumePhase) pvBuilder {
pvb.PersistentVolume.Status = v1.PersistentVolumeStatus{
Phase: phase,
}
return pvb
}
type pvcBuilder struct {
*v1.PersistentVolumeClaim
}
func makePVC(name string, storageClassName string) pvcBuilder {
return pvcBuilder{PersistentVolumeClaim: &v1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: v1.NamespaceDefault,
},
Spec: v1.PersistentVolumeClaimSpec{
StorageClassName: ptr.To(storageClassName),
},
}}
}
func (pvcb pvcBuilder) withBoundPV(pvName string) pvcBuilder {
pvcb.PersistentVolumeClaim.Spec.VolumeName = pvName
metav1.SetMetaDataAnnotation(&pvcb.PersistentVolumeClaim.ObjectMeta, volume.AnnBindCompleted, "true")
return pvcb
}
func (pvcb pvcBuilder) withRequestStorage(request resource.Quantity) pvcBuilder {
pvcb.PersistentVolumeClaim.Spec.Resources = v1.VolumeResourceRequirements{
Requests: v1.ResourceList{
v1.ResourceName(v1.ResourceStorage): request,
},
}
return pvcb
}
func (pvcb pvcBuilder) withPhase(phase v1.PersistentVolumeClaimPhase) pvcBuilder {
pvcb.PersistentVolumeClaim.Status = v1.PersistentVolumeClaimStatus{
Phase: phase,
}
return pvcb
}
type podBuilder struct {
*v1.Pod
}
func makePod(name string) podBuilder {
pb := podBuilder{Pod: &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: name,
Namespace: v1.NamespaceDefault,
},
}}
pb.Pod.Spec.Volumes = make([]v1.Volume, 0)
return pb
}
func (pb podBuilder) withNodeName(name string) podBuilder {
pb.Pod.Spec.NodeName = name
return pb
}
func (pb podBuilder) withNamespace(name string) podBuilder {
pb.Pod.ObjectMeta.Namespace = name
return pb
}
func (pb podBuilder) withPVCVolume(pvcName, name string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
Name: name,
VolumeSource: v1.VolumeSource{
PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{
ClaimName: pvcName,
},
},
})
return pb
}
func (pb podBuilder) withPVCSVolume(pvcs []*v1.PersistentVolumeClaim) podBuilder {
for i, pvc := range pvcs {
pb.withPVCVolume(pvc.Name, fmt.Sprintf("vol%v", i))
}
return pb
}
func (pb podBuilder) withEmptyDirVolume() podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{},
},
})
return pb
}
func (pb podBuilder) withGenericEphemeralVolume(name string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
Name: name,
VolumeSource: v1.VolumeSource{
Ephemeral: &v1.EphemeralVolumeSource{},
},
})
return pb
}
func (pb podBuilder) withCSI(driver string) podBuilder {
pb.Pod.Spec.Volumes = append(pb.Pod.Spec.Volumes, v1.Volume{
VolumeSource: v1.VolumeSource{
CSI: &v1.CSIVolumeSource{
Driver: driver,
},
},
})
return pb
}
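Aside (sketch assuming it lives in the same volumebinding package as the builders above; names and sizes are made up): how the builders are chained to assemble test fixtures.

package volumebinding

import (
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// examplePodWithClaim assembles a pod, a claim, and a node-affine PV using
// the fluent builders defined above.
func examplePodWithClaim() (*v1.Pod, *v1.PersistentVolumeClaim, *v1.PersistentVolume) {
	pv := makePV("pv-a", "wait-sc").
		withCapacity(resource.MustParse("10Gi")).
		withNodeAffinity(map[string][]string{v1.LabelHostname: {"node-a"}}).
		PersistentVolume
	pvc := makePVC("pvc-a", "wait-sc").
		withRequestStorage(resource.MustParse("5Gi")).
		PersistentVolumeClaim
	pod := makePod("pod-a").withPVCVolume("pvc-a", "vol0").Pod
	return pod, pvc, pv
}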

View File

@ -0,0 +1,602 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumebinding
import (
"context"
"errors"
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/component-helpers/storage/ephemeral"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/apis/config"
"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
const (
stateKey framework.StateKey = Name
maxUtilization = 100
)
// stateData is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, the later phases don't need to call the Write method
// to update the value.
type stateData struct {
allBound bool
// podVolumesByNode holds the pod's volume information found in the Filter
// phase for each node
// it's initialized in the PreFilter phase
podVolumesByNode map[string]*PodVolumes
podVolumeClaims *PodVolumeClaims
// hasStaticBindings declares whether the pod contains one or more StaticBinding.
// If not, volumeBinding will skip the score extension point.
hasStaticBindings bool
sync.Mutex
}
func (d *stateData) Clone() framework.StateData {
return d
}
// VolumeBinding is a plugin that binds pod volumes in scheduling.
// In the Filter phase, pod binding cache is created for the pod and used in
// Reserve and PreBind phases.
type VolumeBinding struct {
Binder SchedulerVolumeBinder
PVCLister corelisters.PersistentVolumeClaimLister
scorer volumeCapacityScorer
fts feature.Features
}
var _ framework.PreFilterPlugin = &VolumeBinding{}
var _ framework.FilterPlugin = &VolumeBinding{}
var _ framework.ReservePlugin = &VolumeBinding{}
var _ framework.PreBindPlugin = &VolumeBinding{}
var _ framework.PreScorePlugin = &VolumeBinding{}
var _ framework.ScorePlugin = &VolumeBinding{}
var _ framework.EnqueueExtensions = &VolumeBinding{}
// Name is the name of the plugin used in Registry and configurations.
const Name = names.VolumeBinding
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeBinding) Name() string {
return Name
}
// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *VolumeBinding) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// Pods may fail to find available PVs because the node labels do not
// match the storage class's allowed topologies or PV's node affinity.
// A new or updated node may make pods schedulable.
//
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.fts.EnableSchedulingQueueHint {
// When scheduling queue hint is enabled, we don't use the problematic preCheck and don't need to register UpdateNodeTaint event.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
events := []framework.ClusterEventWithHint{
// Pods may fail because of missing or mis-configured storage class
// (e.g., allowedTopologies, volumeBindingMode), and hence may become
// schedulable upon StorageClass Add or Update events.
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterStorageClassChange},
// We bind PVCs with PVs, so any changes may make the pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// We rely on CSI node to translate in-tree PV to CSI.
// TODO: kube-scheduler will unregister the CSINode events once all the volume plugins have completed their CSI migration.
{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSINodeChange},
// When CSIStorageCapacity is enabled, pods may become schedulable
// on CSI driver & storage capacity changes.
{Event: framework.ClusterEvent{Resource: framework.CSIDriver, ActionType: framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIDriverChange},
{Event: framework.ClusterEvent{Resource: framework.CSIStorageCapacity, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterCSIStorageCapacityChange},
}
return events, nil
}
func (pl *VolumeBinding) isSchedulableAfterCSINodeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
if oldObj == nil {
logger.V(5).Info("CSINode creation could make the pod schedulable")
return framework.Queue, nil
}
oldCSINode, modifiedCSINode, err := util.As[*storagev1.CSINode](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSINode", klog.KObj(modifiedCSINode),
)
if oldCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] != modifiedCSINode.ObjectMeta.Annotations[v1.MigratedPluginsAnnotationKey] {
logger.V(5).Info("CSINode's migrated plugins annotation is updated and that may make the pod schedulable")
return framework.Queue, nil
}
logger.V(5).Info("CISNode was created or updated but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
func (pl *VolumeBinding) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, newPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"PersistentVolumeClaim", klog.KObj(newPVC),
)
if pod.Namespace != newPVC.Namespace {
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable because the PVC belongs to a different namespace")
return framework.QueueSkip, nil
}
for _, vol := range pod.Spec.Volumes {
var pvcName string
switch {
case vol.PersistentVolumeClaim != nil:
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &vol)
default:
continue
}
if pvcName == newPVC.Name {
// Return Queue because, in this case,
// all PVC creations and almost all PVC updates could make the Pod schedulable.
logger.V(5).Info("PersistentVolumeClaim the pod requires was created or updated, potentially making the target Pod schedulable")
return framework.Queue, nil
}
}
logger.V(5).Info("PersistentVolumeClaim was created or updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// isSchedulableAfterStorageClassChange checks whether a StorageClass event might make a Pod schedulable or not.
// Any StorageClass addition and a StorageClass update to allowedTopologies
// might make a Pod schedulable.
// Note that an update to volume binding mode is not allowed, so we don't have to consider it while examining the update event.
func (pl *VolumeBinding) isSchedulableAfterStorageClassChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
oldSC, newSC, err := util.As[*storagev1.StorageClass](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"StorageClass", klog.KObj(newSC),
)
if oldSC == nil {
// No further filtering can be made for a creation event,
// and we just always return Queue.
logger.V(5).Info("A new StorageClass was created, which could make a Pod schedulable")
return framework.Queue, nil
}
if !apiequality.Semantic.DeepEqual(newSC.AllowedTopologies, oldSC.AllowedTopologies) {
logger.V(5).Info("StorageClass got an update in AllowedTopologies", "AllowedTopologies", newSC.AllowedTopologies)
return framework.Queue, nil
}
logger.V(5).Info("StorageClass was updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// isSchedulableAfterCSIStorageCapacityChange checks whether a CSIStorageCapacity event
// might make a Pod schedulable or not.
// Any CSIStorageCapacity addition and a CSIStorageCapacity update to volume limit
// (calculated based on capacity and maximumVolumeSize) might make a Pod schedulable.
// Note that an update to nodeTopology and storageClassName is not allowed and
// we don't have to consider them while examining the update event.
func (pl *VolumeBinding) isSchedulableAfterCSIStorageCapacityChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
oldCap, newCap, err := util.As[*storagev1.CSIStorageCapacity](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
if oldCap == nil {
logger.V(5).Info(
"A new CSIStorageCapacity was created, which could make a Pod schedulable",
"Pod", klog.KObj(pod),
"CSIStorageCapacity", klog.KObj(newCap),
)
return framework.Queue, nil
}
oldLimit := volumeLimit(oldCap)
newLimit := volumeLimit(newCap)
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSIStorageCapacity", klog.KObj(newCap),
"volumeLimit(new)", newLimit,
"volumeLimit(old)", oldLimit,
)
if newLimit != nil && (oldLimit == nil || newLimit.Value() > oldLimit.Value()) {
logger.V(5).Info("VolumeLimit was increased, which could make a Pod schedulable")
return framework.Queue, nil
}
logger.V(5).Info("CSIStorageCapacity was updated, but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
func (pl *VolumeBinding) isSchedulableAfterCSIDriverChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalCSIDriver, modifiedCSIDriver, err := util.As[*storagev1.CSIDriver](oldObj, newObj)
if err != nil {
return framework.Queue, err
}
logger = klog.LoggerWithValues(
logger,
"Pod", klog.KObj(pod),
"CSIDriver", klog.KObj(modifiedCSIDriver),
)
for _, vol := range pod.Spec.Volumes {
if vol.CSI == nil || vol.CSI.Driver != modifiedCSIDriver.Name {
continue
}
if (originalCSIDriver.Spec.StorageCapacity != nil && *originalCSIDriver.Spec.StorageCapacity) &&
(modifiedCSIDriver.Spec.StorageCapacity == nil || !*modifiedCSIDriver.Spec.StorageCapacity) {
logger.V(5).Info("CSIDriver was updated and storage capacity got disabled, which may make the pod schedulable")
return framework.Queue, nil
}
}
logger.V(5).Info("CSIDriver was created or updated but it doesn't make this pod schedulable")
return framework.QueueSkip, nil
}
// podHasPVCs returns 2 values:
// - the first one denotes whether the given "pod" has any PVC defined.
// - the second one returns an error if any requested PVC is illegal.
func (pl *VolumeBinding) podHasPVCs(pod *v1.Pod) (bool, error) {
hasPVC := false
for _, vol := range pod.Spec.Volumes {
var pvcName string
isEphemeral := false
switch {
case vol.PersistentVolumeClaim != nil:
pvcName = vol.PersistentVolumeClaim.ClaimName
case vol.Ephemeral != nil:
pvcName = ephemeral.VolumeClaimName(pod, &vol)
isEphemeral = true
default:
// Volume is not using a PVC, ignore
continue
}
hasPVC = true
pvc, err := pl.PVCLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if err != nil {
// The error usually already has enough context ("persistentvolumeclaim "myclaim" not found"),
// but we can do better for generic ephemeral inline volumes where that situation
// is normal directly after creating a pod.
if isEphemeral && apierrors.IsNotFound(err) {
err = fmt.Errorf("waiting for ephemeral volume controller to create the persistentvolumeclaim %q", pvcName)
}
return hasPVC, err
}
if pvc.Status.Phase == v1.ClaimLost {
return hasPVC, fmt.Errorf("persistentvolumeclaim %q bound to non-existent persistentvolume %q", pvc.Name, pvc.Spec.VolumeName)
}
if pvc.DeletionTimestamp != nil {
return hasPVC, fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name)
}
if isEphemeral {
if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
return hasPVC, err
}
}
}
return hasPVC, nil
}
// PreFilter invoked at the prefilter extension point to check if the pod has all
// immediate PVCs bound. If not all immediate PVCs are bound, an
// UnschedulableAndUnresolvable status is returned.
func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
// If pod does not reference any PVC, we don't need to do anything.
if hasPVC, err := pl.podHasPVCs(pod); err != nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
} else if !hasPVC {
state.Write(stateKey, &stateData{})
return nil, framework.NewStatus(framework.Skip)
}
podVolumeClaims, err := pl.Binder.GetPodVolumeClaims(logger, pod)
if err != nil {
return nil, framework.AsStatus(err)
}
if len(podVolumeClaims.unboundClaimsImmediate) > 0 {
// Return UnschedulableAndUnresolvable error if immediate claims are
// not bound. Pod will be moved to active/backoff queues once these
// claims are bound by PV controller.
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
status.AppendReason("pod has unbound immediate PersistentVolumeClaims")
return nil, status
}
state.Write(stateKey, &stateData{
podVolumesByNode: make(map[string]*PodVolumes),
podVolumeClaims: &PodVolumeClaims{
boundClaims: podVolumeClaims.boundClaims,
unboundClaimsDelayBinding: podVolumeClaims.unboundClaimsDelayBinding,
unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding,
},
})
return nil, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *VolumeBinding) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(stateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the volumes it requests,
// for both bound and unbound PVCs.
//
// For PVCs that are bound, it checks that the corresponding PV's node affinity is
// satisfied by the given node.
//
// For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements
// and that the PV node affinity is satisfied by the given node.
//
// If storage capacity tracking is enabled, then enough space has to be available
// for the node and volumes that still need to be created.
//
// The predicate returns true if all bound PVCs have compatible PVs with the node, and if all unbound
// PVCs can be matched with an available and node-compatible PV.
func (pl *VolumeBinding) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
logger := klog.FromContext(ctx)
node := nodeInfo.Node()
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
podVolumes, reasons, err := pl.Binder.FindPodVolumes(logger, pod, state.podVolumeClaims, node)
if err != nil {
return framework.AsStatus(err)
}
if len(reasons) > 0 {
status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
for _, reason := range reasons {
status.AppendReason(string(reason))
}
return status
}
// Multiple goroutines call `Filter` on different nodes simultaneously, and the `CycleState` may be duplicated, so we must use a local lock here.
state.Lock()
state.podVolumesByNode[node.Name] = podVolumes
state.hasStaticBindings = state.hasStaticBindings || (podVolumes != nil && len(podVolumes.StaticBindings) > 0)
state.Unlock()
return nil
}
// PreScore invoked at the preScore extension point. It checks whether volumeBinding can skip Score
func (pl *VolumeBinding) PreScore(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
if pl.scorer == nil {
return framework.NewStatus(framework.Skip)
}
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
if state.hasStaticBindings {
return nil
}
return framework.NewStatus(framework.Skip)
}
// Score invoked at the score extension point.
func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
if pl.scorer == nil {
return 0, nil
}
state, err := getStateData(cs)
if err != nil {
return 0, framework.AsStatus(err)
}
podVolumes, ok := state.podVolumesByNode[nodeName]
if !ok {
return 0, nil
}
// group by storage class
classResources := make(classResourceMap)
for _, staticBinding := range podVolumes.StaticBindings {
class := staticBinding.StorageClassName()
storageResource := staticBinding.StorageResource()
if _, ok := classResources[class]; !ok {
classResources[class] = &StorageResource{
Requested: 0,
Capacity: 0,
}
}
classResources[class].Requested += storageResource.Requested
classResources[class].Capacity += storageResource.Capacity
}
return pl.scorer(classResources), nil
}
// ScoreExtensions of the Score plugin.
func (pl *VolumeBinding) ScoreExtensions() framework.ScoreExtensions {
return nil
}
// Reserve reserves volumes of pod and saves binding status in cycle state.
func (pl *VolumeBinding) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
state, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
// we don't need to hold the lock as only one node will be reserved for the given pod
podVolumes, ok := state.podVolumesByNode[nodeName]
if ok {
allBound, err := pl.Binder.AssumePodVolumes(klog.FromContext(ctx), pod, nodeName, podVolumes)
if err != nil {
return framework.AsStatus(err)
}
state.allBound = allBound
} else {
// may not exist if the pod does not reference any PVC
state.allBound = true
}
return nil
}
// PreBind will make the API update with the assumed bindings and wait until
// the PV controller has completely finished the binding operation.
//
// If binding errors, times out or gets undone, then an error will be returned to
// retry scheduling.
func (pl *VolumeBinding) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
s, err := getStateData(cs)
if err != nil {
return framework.AsStatus(err)
}
if s.allBound {
// no need to bind volumes
return nil
}
// we don't need to hold the lock as only one node will be pre-bound for the given pod
podVolumes, ok := s.podVolumesByNode[nodeName]
if !ok {
return framework.AsStatus(fmt.Errorf("no pod volumes found for node %q", nodeName))
}
logger := klog.FromContext(ctx)
logger.V(5).Info("Trying to bind volumes for pod", "pod", klog.KObj(pod))
err = pl.Binder.BindPodVolumes(ctx, pod, podVolumes)
if err != nil {
logger.V(5).Info("Failed to bind volumes for pod", "pod", klog.KObj(pod), "err", err)
return framework.AsStatus(err)
}
logger.V(5).Info("Success binding volumes for pod", "pod", klog.KObj(pod))
return nil
}
// Unreserve clears assumed PV and PVC cache.
// It's idempotent, and does nothing if no cache found for the given pod.
func (pl *VolumeBinding) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
s, err := getStateData(cs)
if err != nil {
return
}
// we don't need to hold the lock as only one node may be unreserved
podVolumes, ok := s.podVolumesByNode[nodeName]
if !ok {
return
}
pl.Binder.RevertAssumedPodVolumes(podVolumes)
}
// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
args, ok := plArgs.(*config.VolumeBindingArgs)
if !ok {
return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
}
if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
}); err != nil {
return nil, err
}
podInformer := fh.SharedInformerFactory().Core().V1().Pods()
nodeInformer := fh.SharedInformerFactory().Core().V1().Nodes()
pvcInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumeClaims()
pvInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumes()
storageClassInformer := fh.SharedInformerFactory().Storage().V1().StorageClasses()
csiNodeInformer := fh.SharedInformerFactory().Storage().V1().CSINodes()
capacityCheck := CapacityCheck{
CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1().CSIStorageCapacities(),
}
binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
// build score function
var scorer volumeCapacityScorer
if fts.EnableVolumeCapacityPriority {
shape := make(helper.FunctionShape, 0, len(args.Shape))
for _, point := range args.Shape {
shape = append(shape, helper.FunctionShapePoint{
Utilization: int64(point.Utilization),
Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
})
}
scorer = buildScorerFunction(shape)
}
return &VolumeBinding{
Binder: binder,
PVCLister: pvcInformer.Lister(),
scorer: scorer,
fts: fts,
}, nil
}
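// Illustrative sketch (not part of the upstream plugin): it shows how the shape
// points from VolumeBindingArgs.Shape, after the scaling done above from the
// 0..MaxCustomPriorityScore range to the 0..MaxNodeScore range, could be
// evaluated at a node's requested/capacity utilization. The linear
// interpolation and the two example points are assumptions for illustration;
// the real curve is produced by buildScorerFunction.
func exampleShapeScore(utilizationPercent int64) int64 {
	// Example points after scaling: 0% utilization -> score 0,
	// 100% utilization -> score 100 (prefer nodes whose provisioned
	// capacity would be used as fully as possible).
	points := []struct{ utilization, score int64 }{
		{0, 0},
		{100, 100},
	}
	if utilizationPercent <= points[0].utilization {
		return points[0].score
	}
	for i := 1; i < len(points); i++ {
		if utilizationPercent <= points[i].utilization {
			span := points[i].utilization - points[i-1].utilization
			rise := points[i].score - points[i-1].score
			// Linear interpolation between the two surrounding points.
			return points[i-1].score + rise*(utilizationPercent-points[i-1].utilization)/span
		}
	}
	return points[len(points)-1].score
}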

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,426 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumerestrictions
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// VolumeRestrictions is a plugin that checks volume restrictions.
type VolumeRestrictions struct {
pvcLister corelisters.PersistentVolumeClaimLister
sharedLister framework.SharedLister
enableSchedulingQueueHint bool
}
var _ framework.PreFilterPlugin = &VolumeRestrictions{}
var _ framework.FilterPlugin = &VolumeRestrictions{}
var _ framework.EnqueueExtensions = &VolumeRestrictions{}
var _ framework.StateData = &preFilterState{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.VolumeRestrictions
// preFilterStateKey is the key in CycleState to VolumeRestrictions pre-computed data for Filtering.
// Using the name of the plugin will likely help us avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + Name
// ErrReasonDiskConflict is used for NoDiskConflict predicate error.
ErrReasonDiskConflict = "node(s) had no available disk"
// ErrReasonReadWriteOncePodConflict is used when a pod is found using the same PVC with the ReadWriteOncePod access mode.
ErrReasonReadWriteOncePodConflict = "node has pod using PersistentVolumeClaim with the same name and ReadWriteOncePod access mode"
)
// preFilterState computed at PreFilter and used at Filter.
type preFilterState struct {
// Names of the pod's volumes using the ReadWriteOncePod access mode.
readWriteOncePodPVCs sets.Set[string]
// The number of references to these ReadWriteOncePod volumes by scheduled pods.
conflictingPVCRefCount int
}
func (s *preFilterState) updateWithPod(podInfo *framework.PodInfo, multiplier int) {
s.conflictingPVCRefCount += multiplier * s.conflictingPVCRefCountForPod(podInfo)
}
func (s *preFilterState) conflictingPVCRefCountForPod(podInfo *framework.PodInfo) int {
conflicts := 0
for _, volume := range podInfo.Pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
if s.readWriteOncePodPVCs.Has(volume.PersistentVolumeClaim.ClaimName) {
conflicts += 1
}
}
return conflicts
}
// Clone the prefilter state.
func (s *preFilterState) Clone() framework.StateData {
if s == nil {
return nil
}
return &preFilterState{
readWriteOncePodPVCs: s.readWriteOncePodPVCs,
conflictingPVCRefCount: s.conflictingPVCRefCount,
}
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeRestrictions) Name() string {
return Name
}
func isVolumeConflict(volume *v1.Volume, pod *v1.Pod) bool {
for _, existingVolume := range pod.Spec.Volumes {
// Same GCE disk mounted by multiple pods conflicts unless all pods mount it read-only.
if volume.GCEPersistentDisk != nil && existingVolume.GCEPersistentDisk != nil {
disk, existingDisk := volume.GCEPersistentDisk, existingVolume.GCEPersistentDisk
if disk.PDName == existingDisk.PDName && !(disk.ReadOnly && existingDisk.ReadOnly) {
return true
}
}
if volume.AWSElasticBlockStore != nil && existingVolume.AWSElasticBlockStore != nil {
if volume.AWSElasticBlockStore.VolumeID == existingVolume.AWSElasticBlockStore.VolumeID {
return true
}
}
if volume.ISCSI != nil && existingVolume.ISCSI != nil {
iqn := volume.ISCSI.IQN
eiqn := existingVolume.ISCSI.IQN
// two iSCSI volumes are the same if they share the same IQN. As iSCSI volumes are of type
// RWO or ROX, we can permit only one read-write mount. The same iSCSI volume mounted by multiple Pods
// conflicts unless all pods mount it read-only.
if iqn == eiqn && !(volume.ISCSI.ReadOnly && existingVolume.ISCSI.ReadOnly) {
return true
}
}
if volume.RBD != nil && existingVolume.RBD != nil {
mon, pool, image := volume.RBD.CephMonitors, volume.RBD.RBDPool, volume.RBD.RBDImage
emon, epool, eimage := existingVolume.RBD.CephMonitors, existingVolume.RBD.RBDPool, existingVolume.RBD.RBDImage
// two RBD images are the same if they share the same Ceph monitor, are in the same RADOS pool, and have the same image name.
// Only one read-write mount is permitted for the same RBD image.
// The same RBD image mounted by multiple Pods conflicts unless all Pods mount the image read-only.
if haveOverlap(mon, emon) && pool == epool && image == eimage && !(volume.RBD.ReadOnly && existingVolume.RBD.ReadOnly) {
return true
}
}
}
return false
}
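// exampleGCEPersistentDiskConflict is an illustrative sketch (not part of the
// upstream plugin) of how isVolumeConflict behaves: two pods referencing the
// same GCE PD conflict unless both mounts are read-only. The "pd-1" disk name
// is a hypothetical value used only for the example.
func exampleGCEPersistentDiskConflict() bool {
	readWrite := v1.Volume{
		VolumeSource: v1.VolumeSource{
			GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{PDName: "pd-1", ReadOnly: false},
		},
	}
	existingPod := &v1.Pod{
		Spec: v1.PodSpec{
			Volumes: []v1.Volume{{
				VolumeSource: v1.VolumeSource{
					GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{PDName: "pd-1", ReadOnly: true},
				},
			}},
		},
	}
	// true: the new mount is read-write, so the two pods cannot share the disk.
	return isVolumeConflict(&readWrite, existingPod)
}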
// haveOverlap searches two arrays and returns true if they have at least one common element; returns false otherwise.
func haveOverlap(a1, a2 []string) bool {
if len(a1) > len(a2) {
a1, a2 = a2, a1
}
m := sets.New(a1...)
for _, val := range a2 {
if _, ok := m[val]; ok {
return true
}
}
return false
}
// needsRestrictionsCheck returns true if the volume is of a type that requires a conflict check.
func needsRestrictionsCheck(v v1.Volume) bool {
return v.GCEPersistentDisk != nil || v.AWSElasticBlockStore != nil || v.RBD != nil || v.ISCSI != nil
}
// PreFilter computes and stores cycleState containing details for enforcing ReadWriteOncePod.
func (pl *VolumeRestrictions) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
needsCheck := false
for i := range pod.Spec.Volumes {
if needsRestrictionsCheck(pod.Spec.Volumes[i]) {
needsCheck = true
break
}
}
pvcs, err := pl.readWriteOncePodPVCsForPod(ctx, pod)
if err != nil {
if apierrors.IsNotFound(err) {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return nil, framework.AsStatus(err)
}
s, err := pl.calPreFilterState(ctx, pod, pvcs)
if err != nil {
return nil, framework.AsStatus(err)
}
if !needsCheck && s.conflictingPVCRefCount == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cycleState.Write(preFilterStateKey, s)
return nil, nil
}
// AddPod from pre-computed data in cycleState.
func (pl *VolumeRestrictions) AddPod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToAdd *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToAdd, 1)
return nil
}
// RemovePod from pre-computed data in cycleState.
func (pl *VolumeRestrictions) RemovePod(ctx context.Context, cycleState *framework.CycleState, podToSchedule *v1.Pod, podInfoToRemove *framework.PodInfo, nodeInfo *framework.NodeInfo) *framework.Status {
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
state.updateWithPod(podInfoToRemove, -1)
return nil
}
func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
// preFilterState doesn't exist, likely PreFilter wasn't invoked.
return nil, fmt.Errorf("cannot read %q from cycleState", preFilterStateKey)
}
s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v convert to volumerestrictions.state error", c)
}
return s, nil
}
// calPreFilterState computes preFilterState describing which PVCs use ReadWriteOncePod
// and which pods in the cluster are in conflict.
func (pl *VolumeRestrictions) calPreFilterState(ctx context.Context, pod *v1.Pod, pvcs sets.Set[string]) (*preFilterState, error) {
conflictingPVCRefCount := 0
for pvc := range pvcs {
key := framework.GetNamespacedName(pod.Namespace, pvc)
if pl.sharedLister.StorageInfos().IsPVCUsedByPods(key) {
// There can only be at most one pod using the ReadWriteOncePod PVC.
conflictingPVCRefCount += 1
}
}
return &preFilterState{
readWriteOncePodPVCs: pvcs,
conflictingPVCRefCount: conflictingPVCRefCount,
}, nil
}
func (pl *VolumeRestrictions) readWriteOncePodPVCsForPod(ctx context.Context, pod *v1.Pod) (sets.Set[string], error) {
pvcs := sets.New[string]()
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(volume.PersistentVolumeClaim.ClaimName)
if err != nil {
return nil, err
}
if !v1helper.ContainsAccessMode(pvc.Spec.AccessModes, v1.ReadWriteOncePod) {
continue
}
pvcs.Insert(pvc.Name)
}
return pvcs, nil
}
// satisfyVolumeConflicts checks whether scheduling the pod onto this node would
// cause any conflicts with the volumes of existing pods.
func satisfyVolumeConflicts(pod *v1.Pod, nodeInfo *framework.NodeInfo) bool {
for i := range pod.Spec.Volumes {
v := pod.Spec.Volumes[i]
if !needsRestrictionsCheck(v) {
continue
}
for _, ev := range nodeInfo.Pods {
if isVolumeConflict(&v, ev.Pod) {
return false
}
}
}
return true
}
// satisfyReadWriteOncePod checks whether scheduling the pod would cause any ReadWriteOncePod PVC access mode conflicts.
func satisfyReadWriteOncePod(ctx context.Context, state *preFilterState) *framework.Status {
if state == nil {
return nil
}
if state.conflictingPVCRefCount > 0 {
return framework.NewStatus(framework.Unschedulable, ErrReasonReadWriteOncePodConflict)
}
return nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *VolumeRestrictions) PreFilterExtensions() framework.PreFilterExtensions {
return pl
}
// Filter invoked at the filter extension point.
// It evaluates if a pod can fit due to the volumes it requests, and those that
// are already mounted. If there is already a volume mounted on that node, another pod that uses the same volume
// can't be scheduled there.
// This is GCE, Amazon EBS, ISCSI and Ceph RBD specific for now:
// - GCE PD allows multiple mounts as long as they're all read-only
// - AWS EBS forbids any two pods mounting the same volume ID
// - Ceph RBD forbids two pods from mounting the same image (same monitors, pool and image) unless both mount it read-only
// - ISCSI forbids two pods from mounting a volume with the same IQN unless both mount it read-only
// If the pod uses PVCs with the ReadWriteOncePod access mode, it evaluates if
// these PVCs are already in-use and if preemption will help.
func (pl *VolumeRestrictions) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
if !satisfyVolumeConflicts(pod, nodeInfo) {
return framework.NewStatus(framework.Unschedulable, ErrReasonDiskConflict)
}
state, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}
return satisfyReadWriteOncePod(ctx, state)
}
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
func (pl *VolumeRestrictions) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A note about UpdateNodeTaint/UpdateNodeLabel event:
// Ideally, it's supposed to register only Add because any Node update event will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeTaint | framework.UpdateNodeLabel
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled, and hence Update event isn't necessary.
nodeActionType = framework.Add
}
return []framework.ClusterEventWithHint{
// Pods may fail to schedule because their volumes conflict with volumes of other pods on the same node.
// Once those running pods are deleted and their volumes have been released, the unschedulable pod becomes schedulable.
// Due to the immutable field `spec.volumes`, pod update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
// A new Node may make a pod schedulable.
// We intentionally don't set QueueingHint since all Node/Add events could make Pods schedulable.
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// Pods may fail to schedule because a PVC they use has not yet been created.
// The PVC must exist so that its access modes can be checked.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add},
QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimAdded},
}, nil
}
// isSchedulableAfterPersistentVolumeClaimAdded is invoked whenever a PersistentVolumeClaim is added or changed. It checks whether
// that change made a previously unschedulable pod schedulable.
func (pl *VolumeRestrictions) isSchedulableAfterPersistentVolumeClaimAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, newPersistentVolumeClaim, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
}
if newPersistentVolumeClaim.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim == nil {
continue
}
if volume.PersistentVolumeClaim.ClaimName == newPersistentVolumeClaim.Name {
logger.V(5).Info("PVC that is referred from the pod was created, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
return framework.Queue, nil
}
}
logger.V(5).Info("PVC irrelevant to the Pod was created, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(newPersistentVolumeClaim))
return framework.QueueSkip, nil
}
// isSchedulableAfterPodDeleted is invoked whenever a pod is deleted.
// It checks whether the deleted pod's volumes were conflicting with the volumes of the target pod on the same node.
func (pl *VolumeRestrictions) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
}
if deletedPod.Namespace != pod.Namespace {
return framework.QueueSkip, nil
}
nodeInfo := framework.NewNodeInfo(deletedPod)
if !satisfyVolumeConflicts(pod, nodeInfo) {
logger.V(5).Info("Pod with the volume that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
// Return Queue if a deleted pod uses the same PVC since the pod may be unschedulable due to the ReadWriteOncePod access mode of the PVC.
//
// For now, we don't actually fetch PVC and check the access mode because that operation could be expensive.
// Once the observability around QHint is established,
// we may want to do that depending on how much the operation would impact the QHint latency negatively.
// https://github.com/kubernetes/kubernetes/issues/124566
claims := sets.New[string]()
for _, volume := range pod.Spec.Volumes {
if volume.PersistentVolumeClaim != nil {
claims.Insert(volume.PersistentVolumeClaim.ClaimName)
}
}
for _, volume := range deletedPod.Spec.Volumes {
if volume.PersistentVolumeClaim != nil && claims.Has(volume.PersistentVolumeClaim.ClaimName) {
logger.V(5).Info("Pod with the same PVC that the target pod requires was deleted, which might make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.Queue, nil
}
}
logger.V(5).Info("An irrelevant Pod was deleted, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "deletedPod", klog.KObj(deletedPod))
return framework.QueueSkip, nil
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
sharedLister := handle.SnapshotSharedLister()
return &VolumeRestrictions{
pvcLister: pvcLister,
sharedLister: sharedLister,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
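// exampleReadWriteOncePodBookkeeping is an illustrative sketch (not part of the
// upstream plugin) of the AddPod/RemovePod accounting above: during preemption
// simulation the framework adds and removes scheduled pods with a +1/-1
// multiplier, and Filter rejects the node only while the conflict count stays
// positive. The "data-claim" name is a hypothetical ReadWriteOncePod claim.
func exampleReadWriteOncePodBookkeeping(scheduledPod *framework.PodInfo) bool {
	s := &preFilterState{
		readWriteOncePodPVCs:   sets.New("data-claim"),
		conflictingPVCRefCount: 0,
	}
	// The scheduled pod currently occupies the claim ...
	s.updateWithPod(scheduledPod, 1)
	// ... and is then considered for removal (preemption).
	s.updateWithPod(scheduledPod, -1)
	// Only a positive count makes satisfyReadWriteOncePod return Unschedulable.
	return s.conflictingPVCRefCount > 0
}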

View File

@ -0,0 +1,10 @@
# See the OWNERS docs at https://go.k8s.io/owners
approvers:
- sig-storage-approvers
- cofyc
reviewers:
- sig-storage-reviewers
- cofyc
labels:
- sig/storage

View File

@ -0,0 +1,410 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package volumezone
import (
"context"
"errors"
"fmt"
"reflect"
v1 "k8s.io/api/core/v1"
storage "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
storagelisters "k8s.io/client-go/listers/storage/v1"
volumehelpers "k8s.io/cloud-provider/volume/helpers"
storagehelpers "k8s.io/component-helpers/storage/volume"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// VolumeZone is a plugin that checks volume zone.
type VolumeZone struct {
pvLister corelisters.PersistentVolumeLister
pvcLister corelisters.PersistentVolumeClaimLister
scLister storagelisters.StorageClassLister
enableSchedulingQueueHint bool
}
var _ framework.FilterPlugin = &VolumeZone{}
var _ framework.PreFilterPlugin = &VolumeZone{}
var _ framework.EnqueueExtensions = &VolumeZone{}
const (
// Name is the name of the plugin used in the plugin registry and configurations.
Name = names.VolumeZone
preFilterStateKey framework.StateKey = "PreFilter" + Name
// ErrReasonConflict is used for NoVolumeZoneConflict predicate error.
ErrReasonConflict = "node(s) had no available volume zone"
)
// pvTopology holds the value of a pv's topologyLabel
type pvTopology struct {
pvName string
key string
values sets.Set[string]
}
// The state is initialized in the PreFilter phase. Because we save the pointer in
// framework.CycleState, we don't need to call the Write method again in later phases
// to update the value.
type stateData struct {
// podPVTopologies holds the pv information we need
// it's initialized in the PreFilter phase
podPVTopologies []pvTopology
}
func (d *stateData) Clone() framework.StateData {
return d
}
var topologyLabels = []string{
v1.LabelFailureDomainBetaZone,
v1.LabelFailureDomainBetaRegion,
v1.LabelTopologyZone,
v1.LabelTopologyRegion,
}
func translateToGALabel(label string) string {
if label == v1.LabelFailureDomainBetaRegion {
return v1.LabelTopologyRegion
}
if label == v1.LabelFailureDomainBetaZone {
return v1.LabelTopologyZone
}
return label
}
// Name returns name of the plugin. It is used in logs, etc.
func (pl *VolumeZone) Name() string {
return Name
}
// PreFilter invoked at the prefilter extension point
//
// It finds the topology of the PersistentVolumes corresponding to the volumes a pod requests.
//
// Currently, this is only supported with PersistentVolumeClaims,
// and only looks for the bound PersistentVolume.
func (pl *VolumeZone) PreFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
podPVTopologies, status := pl.getPVbyPod(logger, pod)
if !status.IsSuccess() {
return nil, status
}
if len(podPVTopologies) == 0 {
return nil, framework.NewStatus(framework.Skip)
}
cs.Write(preFilterStateKey, &stateData{podPVTopologies: podPVTopologies})
return nil, nil
}
// getPVbyPod gets the PV topologies for the PVCs referenced by the pod.
func (pl *VolumeZone) getPVbyPod(logger klog.Logger, pod *v1.Pod) ([]pvTopology, *framework.Status) {
podPVTopologies := make([]pvTopology, 0)
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
for _, pvcName := range pvcNames {
if pvcName == "" {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no name")
}
pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
pvName := pvc.Spec.VolumeName
if pvName == "" {
scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)
if len(scName) == 0 {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolumeClaim had no pv name and storageClass name")
}
class, err := pl.scLister.Get(scName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
if class.VolumeBindingMode == nil {
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, fmt.Sprintf("VolumeBindingMode not set for StorageClass %q", scName))
}
if *class.VolumeBindingMode == storage.VolumeBindingWaitForFirstConsumer {
// Skip unbound volumes
continue
}
return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, "PersistentVolume had no name")
}
pv, err := pl.pvLister.Get(pvName)
if s := getErrorAsStatus(err); !s.IsSuccess() {
return nil, s
}
podPVTopologies = append(podPVTopologies, pl.getPVTopologies(logger, pv)...)
}
return podPVTopologies, nil
}
// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *VolumeZone) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}
// Filter invoked at the filter extension point.
//
// It evaluates if a pod can fit due to the volumes it requests, given
// that some volumes may have zone scheduling constraints. The requirement is that any
// volume zone-labels must match the equivalent zone-labels on the node. It is OK for
// the node to have more zone-label constraints (for example, a hypothetical replicated
// volume might allow region-wide access)
//
// Currently this is only supported with PersistentVolumeClaims, and looks to the labels
// only on the bound PersistentVolume.
//
// Working with volumes declared inline in the pod specification (i.e. not
// using a PersistentVolume) is likely to be harder, as it would require
// determining the zone of a volume during scheduling, and that is likely to
// require calling out to the cloud provider. It seems that we are moving away
// from inline volume declarations anyway.
func (pl *VolumeZone) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
logger := klog.FromContext(ctx)
// If a pod doesn't have any volume attached to it, the predicate will always be true.
// Thus we make a fast path for it, to avoid unnecessary computations in this case.
if len(pod.Spec.Volumes) == 0 {
return nil
}
var podPVTopologies []pvTopology
state, err := getStateData(cs)
if err != nil {
// Fallback to calculate pv list here
var status *framework.Status
podPVTopologies, status = pl.getPVbyPod(logger, pod)
if !status.IsSuccess() {
return status
}
} else {
podPVTopologies = state.podPVTopologies
}
node := nodeInfo.Node()
hasAnyNodeConstraint := false
for _, topologyLabel := range topologyLabels {
if _, ok := node.Labels[topologyLabel]; ok {
hasAnyNodeConstraint = true
break
}
}
if !hasAnyNodeConstraint {
// The node has no zone constraints, so we're OK to schedule.
// This is to handle a single-zone cluster scenario where the node may not have any topology labels.
return nil
}
for _, pvTopology := range podPVTopologies {
v, ok := node.Labels[pvTopology.key]
if !ok {
// if we can't match the beta label, try to match pv's beta label with node's ga label
v, ok = node.Labels[translateToGALabel(pvTopology.key)]
}
if !ok || !pvTopology.values.Has(v) {
logger.V(10).Info("Won't schedule pod onto node due to volume (mismatch on label key)", "pod", klog.KObj(pod), "node", klog.KObj(node), "PV", klog.KRef("", pvTopology.pvName), "PVLabelKey", pvTopology.key)
return framework.NewStatus(framework.UnschedulableAndUnresolvable, ErrReasonConflict)
}
}
return nil
}
func getStateData(cs *framework.CycleState) (*stateData, error) {
state, err := cs.Read(preFilterStateKey)
if err != nil {
return nil, err
}
s, ok := state.(*stateData)
if !ok {
return nil, errors.New("unable to convert state into stateData")
}
return s, nil
}
func getErrorAsStatus(err error) *framework.Status {
if err != nil {
if apierrors.IsNotFound(err) {
return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
}
return framework.AsStatus(err)
}
return nil
}
// EventsToRegister returns the possible events that may make a Pod
// rejected by this plugin schedulable.
func (pl *VolumeZone) EventsToRegister(_ context.Context) ([]framework.ClusterEventWithHint, error) {
// A new node or updating a node's volume zone labels may make a pod schedulable.
// A note about UpdateNodeTaint event:
// Ideally, it's supposed to register only Add | UpdateNodeLabel because UpdateNodeTaint will never change the result from this plugin.
// But, we may miss Node/Add event due to preCheck, and we decided to register UpdateNodeTaint | UpdateNodeLabel for all plugins registering Node/Add.
// See: https://github.com/kubernetes/kubernetes/issues/109437
nodeActionType := framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint
if pl.enableSchedulingQueueHint {
// preCheck is not used when QHint is enabled.
nodeActionType = framework.Add | framework.UpdateNodeLabel
}
return []framework.ClusterEventWithHint{
// A new StorageClass with bind mode `VolumeBindingWaitForFirstConsumer` may make a pod schedulable.
// Due to the immutable field `storageClass.volumeBindingMode`, StorageClass update events are ignored.
{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add}, QueueingHintFn: pl.isSchedulableAfterStorageClassAdded},
{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: nodeActionType}},
// A new PVC may make a pod schedulable.
// An update that fills in the PVC's VolumeName could also make a pod schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeClaimChange},
// A new pv or updating a pv's volume zone labels may make a pod schedulable.
{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPersistentVolumeChange},
}, nil
}
// getPersistentVolumeClaimNameFromPod gets the names of the PVCs referenced by a pod.
func (pl *VolumeZone) getPersistentVolumeClaimNameFromPod(pod *v1.Pod) []string {
var pvcNames []string
for i := range pod.Spec.Volumes {
volume := pod.Spec.Volumes[i]
if volume.PersistentVolumeClaim == nil {
continue
}
pvcName := volume.PersistentVolumeClaim.ClaimName
pvcNames = append(pvcNames, pvcName)
}
return pvcNames
}
// isSchedulableAfterPersistentVolumeClaimChange is invoked whenever a PersistentVolumeClaim is added or updated.
// It checks whether the change of PVC has made a previously unschedulable pod schedulable.
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, modifiedPVC, err := util.As[*v1.PersistentVolumeClaim](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeClaimChange: %w", err)
}
if pl.isPVCRequestedFromPod(logger, modifiedPVC, pod) {
logger.V(5).Info("PVC that is referred from the pod was created or updated, which might make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
return framework.Queue, nil
}
logger.V(5).Info("PVC irrelevant to the Pod was created or updated, which doesn't make this pod schedulable", "pod", klog.KObj(pod), "PVC", klog.KObj(modifiedPVC))
return framework.QueueSkip, nil
}
// isPVCRequestedFromPod verifies if the PVC is requested from a given Pod.
func (pl *VolumeZone) isPVCRequestedFromPod(logger klog.Logger, pvc *v1.PersistentVolumeClaim, pod *v1.Pod) bool {
if (pvc == nil) || (pod.Namespace != pvc.Namespace) {
return false
}
pvcNames := pl.getPersistentVolumeClaimNameFromPod(pod)
for _, pvcName := range pvcNames {
if pvc.Name == pvcName {
logger.V(5).Info("PVC is referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
return true
}
}
logger.V(5).Info("PVC is not referred from the pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc))
return false
}
// isSchedulableAfterStorageClassAdded is invoked whenever a StorageClass is added.
// It checks whether the addition of StorageClass has made a previously unschedulable pod schedulable.
// Only a new StorageClass with WaitForFirstConsumer will cause a pod to become schedulable.
func (pl *VolumeZone) isSchedulableAfterStorageClassAdded(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
_, addedStorageClass, err := util.As[*storage.StorageClass](nil, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterStorageClassAdded: %w", err)
}
if (addedStorageClass.VolumeBindingMode == nil) || (*addedStorageClass.VolumeBindingMode != storage.VolumeBindingWaitForFirstConsumer) {
logger.V(5).Info("StorageClass is created, but its VolumeBindingMode is not waitForFirstConsumer, which doesn't make the pod schedulable", "storageClass", klog.KObj(addedStorageClass), "pod", klog.KObj(pod))
return framework.QueueSkip, nil
}
logger.V(5).Info("StorageClass with waitForFirstConsumer mode was created and it might make this pod schedulable", "pod", klog.KObj(pod), "StorageClass", klog.KObj(addedStorageClass))
return framework.Queue, nil
}
// isSchedulableAfterPersistentVolumeChange is invoked whenever a PersistentVolume is added or updated.
// It checks whether the change of PV has made a previously unschedulable pod schedulable.
// Changing the PV topology labels could cause the pod to become schedulable.
func (pl *VolumeZone) isSchedulableAfterPersistentVolumeChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
originalPV, modifiedPV, err := util.As[*v1.PersistentVolume](oldObj, newObj)
if err != nil {
return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPersistentVolumeChange: %w", err)
}
if originalPV == nil {
logger.V(5).Info("PV is newly created, which might make the pod schedulable")
return framework.Queue, nil
}
originalPVTopologies := pl.getPVTopologies(logger, originalPV)
modifiedPVTopologies := pl.getPVTopologies(logger, modifiedPV)
if !reflect.DeepEqual(originalPVTopologies, modifiedPVTopologies) {
logger.V(5).Info("PV's topology was updated, which might make the pod schedulable.", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
return framework.Queue, nil
}
logger.V(5).Info("PV was updated, but the topology is unchanged, which it doesn't make the pod schedulable", "pod", klog.KObj(pod), "PV", klog.KObj(modifiedPV))
return framework.QueueSkip, nil
}
// getPVTopologies retrieves the pvTopology entries from a given PV and returns them as an array.
// This function doesn't check spec.nodeAffinity
// because it's read-only after creation and thus cannot be updated,
// and nodeAffinity is handled by the node affinity plugin.
func (pl *VolumeZone) getPVTopologies(logger klog.Logger, pv *v1.PersistentVolume) []pvTopology {
podPVTopologies := make([]pvTopology, 0)
for _, key := range topologyLabels {
if value, ok := pv.ObjectMeta.Labels[key]; ok {
labelZonesSet, err := volumehelpers.LabelZonesToSet(value)
if err != nil {
logger.V(5).Info("failed to parse PV's topology label, ignoring the label", "label", fmt.Sprintf("%s:%s", key, value), "err", err)
continue
}
podPVTopologies = append(podPVTopologies, pvTopology{
pvName: pv.Name,
key: key,
values: sets.Set[string](labelZonesSet),
})
}
}
return podPVTopologies
}
// New initializes a new plugin and returns it.
func New(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
informerFactory := handle.SharedInformerFactory()
pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
scLister := informerFactory.Storage().V1().StorageClasses().Lister()
return &VolumeZone{
pvLister: pvLister,
pvcLister: pvcLister,
scLister: scLister,
enableSchedulingQueueHint: fts.EnableSchedulingQueueHint,
}, nil
}
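// exampleZoneMatch is an illustrative sketch (not part of the upstream plugin)
// of the comparison done in Filter: a PV that carries only the legacy beta zone
// label still matches a node that exposes only the GA topology label, because
// the lookup falls back to translateToGALabel. The "us-east-1a" zone is a
// hypothetical value used only for the example.
func exampleZoneMatch() bool {
	pvTopo := pvTopology{
		pvName: "example-pv",
		key:    v1.LabelFailureDomainBetaZone,
		values: sets.New("us-east-1a"),
	}
	nodeLabels := map[string]string{
		v1.LabelTopologyZone: "us-east-1a", // the node only has the GA label
	}
	value, ok := nodeLabels[pvTopo.key]
	if !ok {
		// Fall back to the GA equivalent of the PV's beta label.
		value, ok = nodeLabels[translateToGALabel(pvTopo.key)]
	}
	return ok && pvTopo.values.Has(value) // true: the node satisfies the PV's zone
}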

View File

@ -0,0 +1,738 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package preemption
import (
"context"
"errors"
"fmt"
"math"
"sync"
"sync/atomic"
"time"
v1 "k8s.io/api/core/v1"
policy "k8s.io/api/policy/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/sets"
corelisters "k8s.io/client-go/listers/core/v1"
policylisters "k8s.io/client-go/listers/policy/v1"
corev1helpers "k8s.io/component-helpers/scheduling/corev1"
"k8s.io/klog/v2"
extenderv1 "k8s.io/kube-scheduler/extender/v1"
apipod "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/kubernetes/pkg/scheduler/framework"
"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
"k8s.io/kubernetes/pkg/scheduler/metrics"
"k8s.io/kubernetes/pkg/scheduler/util"
)
// Candidate represents a nominated node on which the preemptor can be scheduled,
// along with the list of victims that should be evicted for the preemptor to fit the node.
type Candidate interface {
// Victims wraps a list of to-be-preempted Pods and the number of PDB violation.
Victims() *extenderv1.Victims
// Name returns the target node name where the preemptor gets nominated to run.
Name() string
}
type candidate struct {
victims *extenderv1.Victims
name string
}
// Victims returns s.victims.
func (s *candidate) Victims() *extenderv1.Victims {
return s.victims
}
// Name returns s.name.
func (s *candidate) Name() string {
return s.name
}
type candidateList struct {
idx int32
items []Candidate
}
func newCandidateList(size int32) *candidateList {
return &candidateList{idx: -1, items: make([]Candidate, size)}
}
// add adds a new candidate to the internal array atomically.
func (cl *candidateList) add(c *candidate) {
if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) {
cl.items[idx] = c
}
}
// size returns the number of candidates stored. Note that some add() operations
// might still be executing when this is called, so care must be taken to
// ensure that all add() operations complete before accessing the elements of
// the list.
func (cl *candidateList) size() int32 {
n := atomic.LoadInt32(&cl.idx) + 1
if n >= int32(len(cl.items)) {
n = int32(len(cl.items))
}
return n
}
// get returns the internal candidate array. This function is NOT atomic and
// assumes that all add() operations have been completed.
func (cl *candidateList) get() []Candidate {
return cl.items[:cl.size()]
}
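// exampleConcurrentCandidateAdds is an illustrative sketch (not part of the
// upstream plugin) of the intended use of candidateList: add() may be called
// from many goroutines, while size() and get() are only meaningful once every
// add() has returned. The node names passed in are hypothetical.
func exampleConcurrentCandidateAdds(nodeNames []string) []Candidate {
	cl := newCandidateList(int32(len(nodeNames)))
	var wg sync.WaitGroup
	for _, name := range nodeNames {
		wg.Add(1)
		go func(n string) {
			defer wg.Done()
			// Atomic index reservation; writes past the fixed capacity are dropped.
			cl.add(&candidate{name: n, victims: &extenderv1.Victims{}})
		}(name)
	}
	// All add() calls have completed, so reading the list is now safe.
	wg.Wait()
	return cl.get()
}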
// Interface is expected to be implemented by different preemption plugins as all those member
// methods might have different behavior compared with the default preemption.
type Interface interface {
// GetOffsetAndNumCandidates chooses a random offset and calculates the number of candidates that should be
// shortlisted for dry running preemption.
GetOffsetAndNumCandidates(nodes int32) (int32, int32)
// CandidatesToVictimsMap builds a map from the target node to a list of to-be-preempted Pods and the number of PDB violation.
CandidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims
// PodEligibleToPreemptOthers returns one bool and one string. The bool indicates whether this pod should be considered for
// preempting other pods or not. The string includes the reason if this pod isn't eligible.
PodEligibleToPreemptOthers(ctx context.Context, pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string)
// SelectVictimsOnNode finds minimum set of pods on the given node that should be preempted in order to make enough room
// for "pod" to be scheduled.
// Note that both `state` and `nodeInfo` are deep copied.
SelectVictimsOnNode(ctx context.Context, state *framework.CycleState,
pod *v1.Pod, nodeInfo *framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status)
// OrderedScoreFuncs returns a list of ordered score functions to select preferable node where victims will be preempted.
// The ordered score functions will be processed one by one iff we find more than one node with the highest score.
// Default score functions will be processed if nil returned here for backwards-compatibility.
OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64
}
type Evaluator struct {
PluginName string
Handler framework.Handle
PodLister corelisters.PodLister
PdbLister policylisters.PodDisruptionBudgetLister
enableAsyncPreemption bool
mu sync.RWMutex
// preempting is a set that records the pods that are currently triggering preemption asynchronously,
// which is used to prevent those pods from entering the scheduling cycle in the meantime.
preempting sets.Set[types.UID]
// PreemptPod is a function that actually makes API calls to preempt a specific Pod.
// This is exposed to be replaced during tests.
PreemptPod func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error
Interface
}
func NewEvaluator(pluginName string, fh framework.Handle, i Interface, enableAsyncPreemption bool) *Evaluator {
podLister := fh.SharedInformerFactory().Core().V1().Pods().Lister()
pdbLister := fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister()
ev := &Evaluator{
PluginName: names.DefaultPreemption,
Handler: fh,
PodLister: podLister,
PdbLister: pdbLister,
Interface: i,
enableAsyncPreemption: enableAsyncPreemption,
preempting: sets.New[types.UID](),
}
// PreemptPod actually makes API calls to preempt a specific Pod.
//
// We implement it here directly, rather than creating a separate method like ev.preemptPod(...)
// to prevent the misuse of the PreemptPod function.
ev.PreemptPod = func(ctx context.Context, c Candidate, preemptor, victim *v1.Pod, pluginName string) error {
logger := klog.FromContext(ctx)
// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
// Otherwise we should delete the victim.
if waitingPod := ev.Handler.GetWaitingPod(victim.UID); waitingPod != nil {
waitingPod.Reject(pluginName, "preempted")
logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(preemptor), "waitingPod", klog.KObj(victim), "node", c.Name())
} else {
condition := &v1.PodCondition{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
Reason: v1.PodReasonPreemptionByScheduler,
Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", preemptor.Spec.SchedulerName),
}
newStatus := victim.Status.DeepCopy()
updated := apipod.UpdatePodCondition(newStatus, condition)
if updated {
if err := util.PatchPodStatus(ctx, ev.Handler.ClientSet(), victim, newStatus); err != nil {
logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
return err
}
}
if err := util.DeletePod(ctx, ev.Handler.ClientSet(), victim); err != nil {
if !apierrors.IsNotFound(err) {
logger.Error(err, "Tried to preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(preemptor))
return err
}
logger.V(2).Info("Victim Pod is already deleted", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
return nil
}
logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(preemptor), "victim", klog.KObj(victim), "node", c.Name())
}
ev.Handler.EventRecorder().Eventf(victim, preemptor, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", preemptor.UID, c.Name())
return nil
}
return ev
}
// IsPodRunningPreemption returns true if the pod is currently triggering preemption asynchronously.
func (ev *Evaluator) IsPodRunningPreemption(podUID types.UID) bool {
ev.mu.RLock()
defer ev.mu.RUnlock()
return ev.preempting.Has(podUID)
}
// Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
// The semantics of returned <PostFilterResult, Status> varies on different scenarios:
//
// - <nil, Error>. This denotes it's a transient/rare error that may be self-healed in future cycles.
//
// - <nil, Unschedulable>. This status is mostly as expected like the preemptor is waiting for the
// victims to be fully terminated.
//
// - In both cases above, a nil PostFilterResult is returned to keep the pod's nominatedNodeName unchanged.
//
// - <non-nil PostFilterResult, Unschedulable>. It indicates the pod cannot be scheduled even with preemption.
// In this case, a non-nil PostFilterResult is returned and result.NominatingMode instructs how to deal with
// the nominatedNodeName.
//
// - <non-nil PostFilterResult, Success>. It's the regular happy path
// and the non-empty nominatedNodeName will be applied to the preemptor pod.
func (ev *Evaluator) Preempt(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusReader) (*framework.PostFilterResult, *framework.Status) {
logger := klog.FromContext(ctx)
// 0) Fetch the latest version of <pod>.
// It's safe to directly fetch the pod here because the informer cache has already been
// initialized when creating the Scheduler obj.
// However, tests may need to manually initialize the shared pod informer.
podNamespace, podName := pod.Namespace, pod.Name
pod, err := ev.PodLister.Pods(pod.Namespace).Get(pod.Name)
if err != nil {
logger.Error(err, "Could not get the updated preemptor pod object", "pod", klog.KRef(podNamespace, podName))
return nil, framework.AsStatus(err)
}
// 1) Ensure the preemptor is eligible to preempt other pods.
nominatedNodeStatus := m.Get(pod.Status.NominatedNodeName)
if ok, msg := ev.PodEligibleToPreemptOthers(ctx, pod, nominatedNodeStatus); !ok {
logger.V(5).Info("Pod is not eligible for preemption", "pod", klog.KObj(pod), "reason", msg)
return nil, framework.NewStatus(framework.Unschedulable, msg)
}
// 2) Find all preemption candidates.
allNodes, err := ev.Handler.SnapshotSharedLister().NodeInfos().List()
if err != nil {
return nil, framework.AsStatus(err)
}
candidates, nodeToStatusMap, err := ev.findCandidates(ctx, state, allNodes, pod, m)
if err != nil && len(candidates) == 0 {
return nil, framework.AsStatus(err)
}
// Return a FitError only when there are no candidates that fit the pod.
if len(candidates) == 0 {
fitError := &framework.FitError{
Pod: pod,
NumAllNodes: len(allNodes),
Diagnosis: framework.Diagnosis{
NodeToStatus: nodeToStatusMap,
// Leave UnschedulablePlugins or PendingPlugins as nil as it won't be used on moving Pods.
},
}
fitError.Diagnosis.NodeToStatus.SetAbsentNodesStatus(framework.NewStatus(framework.UnschedulableAndUnresolvable, "Preemption is not helpful for scheduling"))
// Specify nominatedNodeName to clear the pod's nominatedNodeName status, if applicable.
return framework.NewPostFilterResultWithNominatedNode(""), framework.NewStatus(framework.Unschedulable, fitError.Error())
}
// 3) Interact with registered Extenders to filter out some candidates if needed.
candidates, status := ev.callExtenders(logger, pod, candidates)
if !status.IsSuccess() {
return nil, status
}
// 4) Find the best candidate.
bestCandidate := ev.SelectCandidate(ctx, candidates)
if bestCandidate == nil || len(bestCandidate.Name()) == 0 {
return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
}
logger.V(2).Info("the target node for the preemption is determined", "node", bestCandidate.Name(), "pod", klog.KObj(pod))
// 5) Perform preparation work before nominating the selected candidate.
if ev.enableAsyncPreemption {
ev.prepareCandidateAsync(bestCandidate, pod, ev.PluginName)
} else {
if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
return nil, status
}
}
return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
}
// findCandidates calculates a slice of preemption candidates.
// Each candidate, if applied, would make the given <pod> schedulable.
func (ev *Evaluator) findCandidates(ctx context.Context, state *framework.CycleState, allNodes []*framework.NodeInfo, pod *v1.Pod, m framework.NodeToStatusReader) ([]Candidate, *framework.NodeToStatus, error) {
if len(allNodes) == 0 {
return nil, nil, errors.New("no nodes available")
}
logger := klog.FromContext(ctx)
// Get a list of nodes with failed predicates (Unschedulable) whose failures may be resolved by removing pods from the node.
potentialNodes, err := m.NodesForStatusCode(ev.Handler.SnapshotSharedLister().NodeInfos(), framework.Unschedulable)
if err != nil {
return nil, nil, err
}
if len(potentialNodes) == 0 {
logger.V(3).Info("Preemption will not help schedule pod on any node", "pod", klog.KObj(pod))
// In this case, we should clean up any existing nominated node name of the pod.
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), pod); err != nil {
logger.Error(err, "Could not clear the nominatedNodeName field of pod", "pod", klog.KObj(pod))
// We do not return as this error is not critical.
}
return nil, framework.NewDefaultNodeToStatus(), nil
}
pdbs, err := getPodDisruptionBudgets(ev.PdbLister)
if err != nil {
return nil, nil, err
}
offset, candidatesNum := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
return ev.DryRunPreemption(ctx, state, pod, potentialNodes, pdbs, offset, candidatesNum)
}
// callExtenders calls given <extenders> to select the list of feasible candidates.
// We will only check <candidates> with extenders that support preemption.
// Extenders which do not support preemption may later prevent the preemptor from being scheduled on the nominated
// node. In that case, the scheduler will find a different host for the preemptor in subsequent scheduling cycles.
func (ev *Evaluator) callExtenders(logger klog.Logger, pod *v1.Pod, candidates []Candidate) ([]Candidate, *framework.Status) {
extenders := ev.Handler.Extenders()
nodeLister := ev.Handler.SnapshotSharedLister().NodeInfos()
if len(extenders) == 0 {
return candidates, nil
}
// Migrate the candidate slice to victimsMap to adapt to the Extender interface.
// It's only applicable for candidate slices that have unique nominated node names.
victimsMap := ev.CandidatesToVictimsMap(candidates)
if len(victimsMap) == 0 {
return candidates, nil
}
for _, extender := range extenders {
if !extender.SupportsPreemption() || !extender.IsInterested(pod) {
continue
}
nodeNameToVictims, err := extender.ProcessPreemption(pod, victimsMap, nodeLister)
if err != nil {
if extender.IsIgnorable() {
logger.Info("Skipped extender as it returned error and has ignorable flag set",
"extender", extender.Name(), "err", err)
continue
}
return nil, framework.AsStatus(err)
}
// Check if the returned victims are valid.
for nodeName, victims := range nodeNameToVictims {
if victims == nil || len(victims.Pods) == 0 {
if extender.IsIgnorable() {
delete(nodeNameToVictims, nodeName)
logger.Info("Ignored node for which the extender didn't report victims", "node", klog.KRef("", nodeName), "extender", extender.Name())
continue
}
return nil, framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeName))
}
}
// Replace victimsMap with the new result after preemption so that the
// rest of the extenders can continue to use it as a parameter.
victimsMap = nodeNameToVictims
// If node list becomes empty, no preemption can happen regardless of other extenders.
if len(victimsMap) == 0 {
break
}
}
var newCandidates []Candidate
for nodeName := range victimsMap {
newCandidates = append(newCandidates, &candidate{
victims: victimsMap[nodeName],
name: nodeName,
})
}
return newCandidates, nil
}
// SelectCandidate chooses the best-fit candidate from given <candidates> and returns it.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate) Candidate {
logger := klog.FromContext(ctx)
if len(candidates) == 0 {
return nil
}
if len(candidates) == 1 {
return candidates[0]
}
victimsMap := ev.CandidatesToVictimsMap(candidates)
scoreFuncs := ev.OrderedScoreFuncs(ctx, victimsMap)
candidateNode := pickOneNodeForPreemption(logger, victimsMap, scoreFuncs)
// Same as candidatesToVictimsMap, this logic is not applicable for out-of-tree
// preemption plugins that exercise different candidates on the same nominated node.
if victims := victimsMap[candidateNode]; victims != nil {
return &candidate{
victims: victims,
name: candidateNode,
}
}
// We shouldn't reach here.
logger.Error(errors.New("no candidate selected"), "Should not reach here", "candidates", candidates)
// To not break the whole flow, return the first candidate.
return candidates[0]
}
// prepareCandidate does some preparation work before nominating the selected candidate:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
fh := ev.Handler
cs := ev.Handler.ClientSet()
ctx, cancel := context.WithCancel(ctx)
defer cancel()
logger := klog.FromContext(ctx)
errCh := parallelize.NewErrorChannel()
fh.Parallelizer().Until(ctx, len(c.Victims().Pods), func(index int) {
if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[index], pluginName); err != nil {
errCh.SendErrorWithCancel(err, cancel)
}
}, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
return framework.AsStatus(err)
}
metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
// Lower priority pods nominated to run on this node may no longer fit on
// this node. So, we should remove their nomination. Removing their
// nomination updates these pods and moves them to the active queue. It
// lets the scheduler find another place for them.
nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field")
// We do not return as this error is not critical.
}
return nil
}
// prepareCandidateAsync triggers a goroutine for some preparation work:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
// The Pod won't be retried until the goroutine triggered here completes.
//
// See http://kep.k8s.io/4832 for how the async preemption works.
func (ev *Evaluator) prepareCandidateAsync(c Candidate, pod *v1.Pod, pluginName string) {
metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
// Intentionally create a new context rather than reusing the ctx from the scheduling cycle,
// because this process could continue even after this scheduling cycle finishes.
ctx, cancel := context.WithCancel(context.Background())
errCh := parallelize.NewErrorChannel()
preemptPod := func(index int) {
victim := c.Victims().Pods[index]
if err := ev.PreemptPod(ctx, c, pod, victim, pluginName); err != nil {
errCh.SendErrorWithCancel(err, cancel)
}
}
ev.mu.Lock()
ev.preempting.Insert(pod.UID)
ev.mu.Unlock()
logger := klog.FromContext(ctx)
go func() {
startTime := time.Now()
result := metrics.GoroutineResultSuccess
defer metrics.PreemptionGoroutinesDuration.WithLabelValues(result).Observe(metrics.SinceInSeconds(startTime))
defer metrics.PreemptionGoroutinesExecutionTotal.WithLabelValues(result).Inc()
defer func() {
if result == metrics.GoroutineResultError {
// When the API call isn't successful, the Pod may get stuck in the unschedulable pod pool in the worst case.
// So, we should move the Pod to the activeQ.
ev.Handler.Activate(logger, map[string]*v1.Pod{pod.Name: pod})
}
}()
defer cancel()
logger.V(2).Info("Start the preemption asynchronously", "preemptor", klog.KObj(pod), "node", c.Name(), "numVictims", len(c.Victims().Pods))
// Lower priority pods nominated to run on this node may no longer fit on
// this node. So, we should remove their nomination. Removing their
// nomination updates these pods and moves them to the active queue. It
// lets the scheduler find another place for them.
nominatedPods := getLowerPriorityNominatedPods(logger, ev.Handler, pod, c.Name())
if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), nominatedPods...); err != nil {
logger.Error(err, "Cannot clear 'NominatedNodeName' field from lower priority pods on the same target node", "node", c.Name())
result = metrics.GoroutineResultError
// We do not return as this error is not critical.
}
if len(c.Victims().Pods) == 0 {
ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()
return
}
// We can evict all victims in parallel, except the last one.
// We have to remove the pod from the preempting map before the last victim is evicted
// because, otherwise, the victim's removal might be notified to the scheduling queue before
// we remove this pod from the preempting map,
// and the pod could end up stuck in the unschedulable pod pool
// because all the pod removal events would be ignored.
ev.Handler.Parallelizer().Until(ctx, len(c.Victims().Pods)-1, preemptPod, ev.PluginName)
if err := errCh.ReceiveError(); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}
ev.mu.Lock()
delete(ev.preempting, pod.UID)
ev.mu.Unlock()
if err := ev.PreemptPod(ctx, c, pod, c.Victims().Pods[len(c.Victims().Pods)-1], pluginName); err != nil {
logger.Error(err, "Error occurred during async preemption")
result = metrics.GoroutineResultError
}
logger.V(2).Info("Async Preemption finished completely", "preemptor", klog.KObj(pod), "node", c.Name(), "result", result)
}()
}
func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
if pdbLister != nil {
return pdbLister.List(labels.Everything())
}
return nil, nil
}
// pickOneNodeForPreemption chooses one node among the given nodes.
// It assumes pods in each map entry are ordered by decreasing priority.
// If scoreFuncs is not empty, it picks a node based on the scores scoreFuncs returns.
// If scoreFuncs is empty, it picks a node based on the following criteria:
// 1. A node with the minimum number of PDB violations.
// 2. A node whose highest-priority victim has the minimum priority.
// 3. Ties are broken by the sum of priorities of all victims.
// 4. If there are still ties, the node with the minimum number of victims is picked.
// 5. If there are still ties, the node with the latest start time of all highest-priority victims is picked.
// 6. If there are still ties, the first such node is picked (sort of randomly).
func pickOneNodeForPreemption(logger klog.Logger, nodesToVictims map[string]*extenderv1.Victims, scoreFuncs []func(node string) int64) string {
if len(nodesToVictims) == 0 {
return ""
}
allCandidates := make([]string, 0, len(nodesToVictims))
for node := range nodesToVictims {
allCandidates = append(allCandidates, node)
}
if len(scoreFuncs) == 0 {
minNumPDBViolatingScoreFunc := func(node string) int64 {
// The smaller the NumPDBViolations, the higher the score.
return -nodesToVictims[node].NumPDBViolations
}
minHighestPriorityScoreFunc := func(node string) int64 {
// highestPodPriority is the highest priority among the victims on this node.
highestPodPriority := corev1helpers.PodPriority(nodesToVictims[node].Pods[0])
// The smaller the highestPodPriority, the higher the score.
return -int64(highestPodPriority)
}
minSumPrioritiesScoreFunc := func(node string) int64 {
var sumPriorities int64
for _, pod := range nodesToVictims[node].Pods {
// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
// needed so that a node with a few pods with negative priority is not
// picked over a node with a smaller number of pods with the same negative
// priority (and similar scenarios).
sumPriorities += int64(corev1helpers.PodPriority(pod)) + int64(math.MaxInt32+1)
}
// The smaller the sumPriorities, the higher the score.
return -sumPriorities
}
minNumPodsScoreFunc := func(node string) int64 {
// The smaller the length of pods, the higher the score.
return -int64(len(nodesToVictims[node].Pods))
}
latestStartTimeScoreFunc := func(node string) int64 {
// Get the earliest start time of all pods on the current node.
earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node])
if earliestStartTimeOnNode == nil {
logger.Error(errors.New("earliestStartTime is nil for node"), "Should not reach here", "node", node)
return int64(math.MinInt64)
}
// The bigger the earliestStartTimeOnNode, the higher the score.
return earliestStartTimeOnNode.UnixNano()
}
// Each scoreFunc scores the nodes according to specific rules and keeps the name of the node
// with the highest score. If and only if the scoreFunc has more than one node with the highest
// score, we will execute the next scoreFunc in order of precedence.
scoreFuncs = []func(string) int64{
// A node with a minimum number of PDB violations is preferable.
minNumPDBViolatingScoreFunc,
// A node with a minimum highest priority victim is preferable.
minHighestPriorityScoreFunc,
// A node with the smallest sum of priorities is preferable.
minSumPrioritiesScoreFunc,
// A node with the minimum number of pods is preferable.
minNumPodsScoreFunc,
// A node with the latest start time of all highest priority victims is preferable.
latestStartTimeScoreFunc,
// If there are still ties, then the first Node in the list is selected.
}
}
for _, f := range scoreFuncs {
selectedNodes := []string{}
maxScore := int64(math.MinInt64)
for _, node := range allCandidates {
score := f(node)
if score > maxScore {
maxScore = score
selectedNodes = []string{}
}
if score == maxScore {
selectedNodes = append(selectedNodes, node)
}
}
if len(selectedNodes) == 1 {
return selectedNodes[0]
}
allCandidates = selectedNodes
}
return allCandidates[0]
}
// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
// priority of the given "pod" and are nominated to run on the given node.
// Note: We could possibly check if the nominated lower priority pods still fit
// and return those that no longer fit, but that would require lots of
// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
// worth the complexity, especially because we generally expect to have a very
// small number of nominated pods per node.
func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
podInfos := pn.NominatedPodsForNode(nodeName)
if len(podInfos) == 0 {
return nil
}
var lowerPriorityPods []*v1.Pod
podPriority := corev1helpers.PodPriority(pod)
for _, pi := range podInfos {
if corev1helpers.PodPriority(pi.Pod) < podPriority {
lowerPriorityPods = append(lowerPriorityPods, pi.Pod)
}
}
return lowerPriorityPods
}
// DryRunPreemption simulates preemption logic on <potentialNodes> in parallel,
// and returns preemption candidates and a map indicating the statuses of filtered nodes.
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) DryRunPreemption(ctx context.Context, state *framework.CycleState, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
pdbs []*policy.PodDisruptionBudget, offset int32, candidatesNum int32) ([]Candidate, *framework.NodeToStatus, error) {
fh := ev.Handler
nonViolatingCandidates := newCandidateList(candidatesNum)
violatingCandidates := newCandidateList(candidatesNum)
ctx, cancel := context.WithCancel(ctx)
defer cancel()
nodeStatuses := framework.NewDefaultNodeToStatus()
logger := klog.FromContext(ctx)
logger.V(5).Info("Dry run the preemption", "potentialNodesNumber", len(potentialNodes), "pdbsNumber", len(pdbs), "offset", offset, "candidatesNumber", candidatesNum)
var statusesLock sync.Mutex
var errs []error
checkNode := func(i int) {
nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
logger.V(5).Info("Check the potential node for preemption", "node", nodeInfoCopy.Node().Name)
stateCopy := state.Clone()
pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
if status.IsSuccess() && len(pods) != 0 {
victims := extenderv1.Victims{
Pods: pods,
NumPDBViolations: int64(numPDBViolations),
}
c := &candidate{
victims: &victims,
name: nodeInfoCopy.Node().Name,
}
if numPDBViolations == 0 {
nonViolatingCandidates.add(c)
} else {
violatingCandidates.add(c)
}
nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size()
if nvcSize > 0 && nvcSize+vcSize >= candidatesNum {
cancel()
}
return
}
if status.IsSuccess() && len(pods) == 0 {
status = framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeInfoCopy.Node().Name))
}
statusesLock.Lock()
if status.Code() == framework.Error {
errs = append(errs, status.AsError())
}
nodeStatuses.Set(nodeInfoCopy.Node().Name, status)
statusesLock.Unlock()
}
fh.Parallelizer().Until(ctx, len(potentialNodes), checkNode, ev.PluginName)
return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses, utilerrors.NewAggregate(errs)
}
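
The tie-breaking in pickOneNodeForPreemption above is purely lexicographic: each score function narrows the candidate set to the top scorers, and the next function only runs while a tie remains. Below is a minimal, self-contained sketch of that cascade, using hypothetical node names and scores rather than the scheduler's types.

// Illustrative sketch (not part of the vendored file): lexicographic
// tie-breaking over candidates, as done by pickOneNodeForPreemption.
package main

import (
	"fmt"
	"math"
)

// pickOne applies each score function in order, keeping only the candidates
// with the highest score, until a single candidate remains.
func pickOne(candidates []string, scoreFuncs []func(string) int64) string {
	if len(candidates) == 0 {
		return ""
	}
	for _, f := range scoreFuncs {
		var selected []string
		maxScore := int64(math.MinInt64)
		for _, c := range candidates {
			score := f(c)
			if score > maxScore {
				maxScore = score
				selected = nil
			}
			if score == maxScore {
				selected = append(selected, c)
			}
		}
		if len(selected) == 1 {
			return selected[0]
		}
		candidates = selected
	}
	return candidates[0]
}

func main() {
	// Hypothetical per-node statistics.
	pdbViolations := map[string]int64{"node-a": 0, "node-b": 0, "node-c": 1}
	victimCount := map[string]int64{"node-a": 2, "node-b": 1, "node-c": 1}

	winner := pickOne([]string{"node-a", "node-b", "node-c"}, []func(string) int64{
		func(n string) int64 { return -pdbViolations[n] }, // fewer PDB violations first
		func(n string) int64 { return -victimCount[n] },   // then fewer victims
	})
	fmt.Println(winner) // node-b
}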

File diff suppressed because it is too large

View File

@ -0,0 +1,83 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"context"
v1 "k8s.io/api/core/v1"
compbasemetrics "k8s.io/component-base/metrics"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
type instrumentedFilterPlugin struct {
framework.FilterPlugin
metric compbasemetrics.CounterMetric
}
var _ framework.FilterPlugin = &instrumentedFilterPlugin{}
func (p *instrumentedFilterPlugin) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
p.metric.Inc()
return p.FilterPlugin.Filter(ctx, state, pod, nodeInfo)
}
type instrumentedPreFilterPlugin struct {
framework.PreFilterPlugin
metric compbasemetrics.CounterMetric
}
var _ framework.PreFilterPlugin = &instrumentedPreFilterPlugin{}
func (p *instrumentedPreFilterPlugin) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
result, status := p.PreFilterPlugin.PreFilter(ctx, state, pod)
if !status.IsSkip() {
p.metric.Inc()
}
return result, status
}
type instrumentedPreScorePlugin struct {
framework.PreScorePlugin
metric compbasemetrics.CounterMetric
}
var _ framework.PreScorePlugin = &instrumentedPreScorePlugin{}
func (p *instrumentedPreScorePlugin) PreScore(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
status := p.PreScorePlugin.PreScore(ctx, state, pod, nodes)
if !status.IsSkip() {
p.metric.Inc()
}
return status
}
type instrumentedScorePlugin struct {
framework.ScorePlugin
metric compbasemetrics.CounterMetric
}
var _ framework.ScorePlugin = &instrumentedScorePlugin{}
func (p *instrumentedScorePlugin) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
p.metric.Inc()
return p.ScorePlugin.Score(ctx, state, pod, nodeName)
}
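
The instrumented*Plugin wrappers above lean on Go struct embedding: only the instrumented method is overridden, while every other method of the wrapped plugin is promoted unchanged. Here is a toy, self-contained sketch of that decorator pattern; the Greeter interface and all names are made up for illustration.

// Illustrative sketch (not part of the vendored file): counting calls to one
// method of a wrapped interface value via struct embedding.
package main

import "fmt"

// Greeter is a toy stand-in for a framework plugin interface.
type Greeter interface {
	Name() string
	Greet() string
}

type hello struct{}

func (hello) Name() string  { return "hello" }
func (hello) Greet() string { return "hi there" }

// countingGreeter embeds a Greeter and overrides only Greet, so Name is
// promoted from the wrapped value automatically.
type countingGreeter struct {
	Greeter
	calls int
}

func (c *countingGreeter) Greet() string {
	c.calls++ // the instrumentation hook, analogous to metric.Inc()
	return c.Greeter.Greet()
}

func main() {
	g := &countingGreeter{Greeter: hello{}}
	fmt.Println(g.Greet(), g.Name(), g.calls) // hi there hello 1
}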

View File

@ -0,0 +1,101 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"context"
"fmt"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/json"
"k8s.io/kubernetes/pkg/scheduler/framework"
plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"sigs.k8s.io/yaml"
)
// PluginFactory is a function that builds a plugin.
type PluginFactory = func(ctx context.Context, configuration runtime.Object, f framework.Handle) (framework.Plugin, error)
// PluginFactoryWithFts is a function that builds a plugin with certain feature gates.
type PluginFactoryWithFts func(context.Context, runtime.Object, framework.Handle, plfeature.Features) (framework.Plugin, error)
// FactoryAdapter can be used to inject feature gates for a plugin that needs
// them when the caller expects the older PluginFactory method.
func FactoryAdapter(fts plfeature.Features, withFts PluginFactoryWithFts) PluginFactory {
return func(ctx context.Context, plArgs runtime.Object, fh framework.Handle) (framework.Plugin, error) {
return withFts(ctx, plArgs, fh, fts)
}
}
// DecodeInto decodes a configuration whose type is *runtime.Unknown into the given interface.
func DecodeInto(obj runtime.Object, into interface{}) error {
if obj == nil {
return nil
}
configuration, ok := obj.(*runtime.Unknown)
if !ok {
return fmt.Errorf("want args of type runtime.Unknown, got %T", obj)
}
if configuration.Raw == nil {
return nil
}
switch configuration.ContentType {
// If ContentType is empty, it means ContentTypeJSON by default.
case runtime.ContentTypeJSON, "":
return json.Unmarshal(configuration.Raw, into)
case runtime.ContentTypeYAML:
return yaml.Unmarshal(configuration.Raw, into)
default:
return fmt.Errorf("not supported content type %s", configuration.ContentType)
}
}
// Registry is a collection of all available plugins. The framework uses a
// registry to enable and initialize configured plugins.
// All plugins must be in the registry before initializing the framework.
type Registry map[string]PluginFactory
// Register adds a new plugin to the registry. If a plugin with the same name
// exists, it returns an error.
func (r Registry) Register(name string, factory PluginFactory) error {
if _, ok := r[name]; ok {
return fmt.Errorf("a plugin named %v already exists", name)
}
r[name] = factory
return nil
}
// Unregister removes an existing plugin from the registry. If no plugin with
// the provided name exists, it returns an error.
func (r Registry) Unregister(name string) error {
if _, ok := r[name]; !ok {
return fmt.Errorf("no plugin named %v exists", name)
}
delete(r, name)
return nil
}
// Merge merges the provided registry into the current one.
func (r Registry) Merge(in Registry) error {
for name, factory := range in {
if err := r.Register(name, factory); err != nil {
return err
}
}
return nil
}
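
A sketch of how an out-of-tree plugin might use Registry and DecodeInto from the package above, assuming the framework/runtime package is importable as k8s.io/kubernetes/pkg/scheduler/framework/runtime; the plugin, its args struct, and the "Example" name are hypothetical.

// Illustrative sketch (not part of the vendored file).
package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)

// exampleArgs is a hypothetical args struct for the plugin.
type exampleArgs struct {
	Threshold int `json:"threshold"`
}

// examplePlugin is a hypothetical plugin; framework.Plugin only requires Name().
type examplePlugin struct {
	args exampleArgs
}

func (p *examplePlugin) Name() string { return "Example" }

// newExamplePlugin is a PluginFactory that decodes the raw args with DecodeInto.
func newExamplePlugin(_ context.Context, obj runtime.Object, _ framework.Handle) (framework.Plugin, error) {
	args := exampleArgs{}
	if err := frameworkruntime.DecodeInto(obj, &args); err != nil {
		return nil, err
	}
	return &examplePlugin{args: args}, nil
}

func main() {
	registry := frameworkruntime.Registry{}
	if err := registry.Register("Example", newExamplePlugin); err != nil {
		fmt.Println("register failed:", err)
	}
	// Registering the same name twice returns an error.
	fmt.Println(registry.Register("Example", newExamplePlugin))
}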

View File

@ -0,0 +1,165 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package runtime
import (
"fmt"
"sync"
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/scheduler/framework"
)
// waitingPodsMap is a thread-safe map used to maintain pods waiting in the permit phase.
type waitingPodsMap struct {
pods map[types.UID]*waitingPod
mu sync.RWMutex
}
// NewWaitingPodsMap returns a new waitingPodsMap.
func NewWaitingPodsMap() *waitingPodsMap {
return &waitingPodsMap{
pods: make(map[types.UID]*waitingPod),
}
}
// add a new WaitingPod to the map.
func (m *waitingPodsMap) add(wp *waitingPod) {
m.mu.Lock()
defer m.mu.Unlock()
m.pods[wp.GetPod().UID] = wp
}
// remove a WaitingPod from the map.
func (m *waitingPodsMap) remove(uid types.UID) {
m.mu.Lock()
defer m.mu.Unlock()
delete(m.pods, uid)
}
// get a WaitingPod from the map.
func (m *waitingPodsMap) get(uid types.UID) *waitingPod {
m.mu.RLock()
defer m.mu.RUnlock()
return m.pods[uid]
}
// iterate acquires a read lock and iterates over the WaitingPods map.
func (m *waitingPodsMap) iterate(callback func(framework.WaitingPod)) {
m.mu.RLock()
defer m.mu.RUnlock()
for _, v := range m.pods {
callback(v)
}
}
// waitingPod represents a pod waiting in the permit phase.
type waitingPod struct {
pod *v1.Pod
pendingPlugins map[string]*time.Timer
s chan *framework.Status
mu sync.RWMutex
}
var _ framework.WaitingPod = &waitingPod{}
// newWaitingPod returns a new waitingPod instance.
func newWaitingPod(pod *v1.Pod, pluginsMaxWaitTime map[string]time.Duration) *waitingPod {
wp := &waitingPod{
pod: pod,
// Allow() and Reject() calls are non-blocking. This property is guaranteed
// by using a non-blocking send to this channel. The channel has a buffer of size 1
// to ensure that a non-blocking send will not be ignored - a possible situation when
// the receive from this channel happens after the non-blocking send.
s: make(chan *framework.Status, 1),
}
wp.pendingPlugins = make(map[string]*time.Timer, len(pluginsMaxWaitTime))
// The time.AfterFunc calls wp.Reject which iterates through pendingPlugins map. Acquire the
// lock here so that time.AfterFunc can only execute after newWaitingPod finishes.
wp.mu.Lock()
defer wp.mu.Unlock()
for k, v := range pluginsMaxWaitTime {
plugin, waitTime := k, v
wp.pendingPlugins[plugin] = time.AfterFunc(waitTime, func() {
msg := fmt.Sprintf("rejected due to timeout after waiting %v at plugin %v",
waitTime, plugin)
wp.Reject(plugin, msg)
})
}
return wp
}
// GetPod returns a reference to the waiting pod.
func (w *waitingPod) GetPod() *v1.Pod {
return w.pod
}
// GetPendingPlugins returns a list of pending permit plugins' names.
func (w *waitingPod) GetPendingPlugins() []string {
w.mu.RLock()
defer w.mu.RUnlock()
plugins := make([]string, 0, len(w.pendingPlugins))
for p := range w.pendingPlugins {
plugins = append(plugins, p)
}
return plugins
}
// Allow declares the waiting pod is allowed to be scheduled by plugin pluginName.
// If this is the last remaining plugin to allow, then a success signal is delivered
// to unblock the pod.
func (w *waitingPod) Allow(pluginName string) {
w.mu.Lock()
defer w.mu.Unlock()
if timer, exist := w.pendingPlugins[pluginName]; exist {
timer.Stop()
delete(w.pendingPlugins, pluginName)
}
// Only signal success status after all plugins have allowed
if len(w.pendingPlugins) != 0 {
return
}
// The select clause works as a non-blocking send.
// If there is no receiver, it's a no-op (default case).
select {
case w.s <- framework.NewStatus(framework.Success, ""):
default:
}
}
// Reject declares the waiting pod unschedulable.
func (w *waitingPod) Reject(pluginName, msg string) {
w.mu.RLock()
defer w.mu.RUnlock()
for _, timer := range w.pendingPlugins {
timer.Stop()
}
// The select clause works as a non-blocking send.
// If there is no receiver, it's a no-op (default case).
select {
case w.s <- framework.NewStatus(framework.Unschedulable, msg).WithPlugin(pluginName):
default:
}
}
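
The waitingPod signalling above rests on two details: the status channel has a buffer of size 1, and every send goes through select/default so it never blocks; at most one signal is retained and later signals are dropped. A standalone sketch of that pattern with toy values:

// Illustrative sketch (not part of the vendored file).
package main

import "fmt"

func main() {
	// A buffer of size 1 lets the sender deliver a signal even if the
	// receiver has not started waiting yet; select/default makes every
	// send non-blocking, so later signals are simply dropped.
	signal := make(chan string, 1)

	send := func(msg string) {
		select {
		case signal <- msg:
		default: // a signal is already pending; drop this one
		}
	}

	send("allowed")  // buffered, even though nobody is receiving yet
	send("rejected") // dropped: the buffer is already full

	fmt.Println(<-signal) // allowed
}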

File diff suppressed because it is too large

View File

@ -0,0 +1,224 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"time"
"k8s.io/component-base/metrics"
)
// MetricRecorder represents a metric recorder which takes action when
// Inc(), Dec() or Clear() is called.
type MetricRecorder interface {
Inc()
Dec()
Clear()
}
var _ MetricRecorder = &PendingPodsRecorder{}
// PendingPodsRecorder is an implementation of MetricRecorder
type PendingPodsRecorder struct {
recorder metrics.GaugeMetric
}
// NewActivePodsRecorder returns ActivePods in a Prometheus metric fashion
func NewActivePodsRecorder() *PendingPodsRecorder {
return &PendingPodsRecorder{
recorder: ActivePods(),
}
}
// NewUnschedulablePodsRecorder returns UnschedulablePods in a Prometheus metric fashion
func NewUnschedulablePodsRecorder() *PendingPodsRecorder {
return &PendingPodsRecorder{
recorder: UnschedulablePods(),
}
}
// NewBackoffPodsRecorder returns BackoffPods in a Prometheus metric fashion
func NewBackoffPodsRecorder() *PendingPodsRecorder {
return &PendingPodsRecorder{
recorder: BackoffPods(),
}
}
// NewGatedPodsRecorder returns GatedPods in a Prometheus metric fashion
func NewGatedPodsRecorder() *PendingPodsRecorder {
return &PendingPodsRecorder{
recorder: GatedPods(),
}
}
// Inc increases a metric counter by 1, in an atomic way
func (r *PendingPodsRecorder) Inc() {
r.recorder.Inc()
}
// Dec decreases a metric counter by 1, in an atomic way
func (r *PendingPodsRecorder) Dec() {
r.recorder.Dec()
}
// Clear sets a metric counter to 0, in an atomic way
func (r *PendingPodsRecorder) Clear() {
r.recorder.Set(float64(0))
}
// histogramVecMetric is the data structure passed in the buffer channel between the main framework thread
// and the metricsRecorder goroutine.
type histogramVecMetric struct {
metric *metrics.HistogramVec
labelValues []string
value float64
}
type gaugeVecMetric struct {
metric *metrics.GaugeVec
labelValues []string
valueToAdd float64
}
type gaugeVecMetricKey struct {
metricName string
labelValue string
}
// MetricAsyncRecorder records metrics in a separate goroutine to avoid overhead in the critical path.
type MetricAsyncRecorder struct {
// bufferCh is a channel that serves as a metrics buffer before the metricsRecorder goroutine reports it.
bufferCh chan *histogramVecMetric
// if bufferSize is reached, incoming metrics will be discarded.
bufferSize int
// how often the recorder runs to flush the metrics.
interval time.Duration
// aggregatedInflightEventMetric is only to record InFlightEvents metric asynchronously.
// It's a map from gaugeVecMetricKey to the aggregated value
// and the aggregated value is flushed to Prometheus every time the interval is reached.
// Note that we deliberately don't lock the map because we assume the queue takes the lock before updating the in-flight events.
aggregatedInflightEventMetric map[gaugeVecMetricKey]int
aggregatedInflightEventMetricLastFlushTime time.Time
aggregatedInflightEventMetricBufferCh chan *gaugeVecMetric
// stopCh is used to stop the goroutine which periodically flushes metrics.
stopCh <-chan struct{}
// IsStoppedCh indicates whether the goroutine is stopped. It's used in tests only to make sure
// the metric flushing goroutine is stopped so that tests can collect metrics for verification.
IsStoppedCh chan struct{}
}
func NewMetricsAsyncRecorder(bufferSize int, interval time.Duration, stopCh <-chan struct{}) *MetricAsyncRecorder {
recorder := &MetricAsyncRecorder{
bufferCh: make(chan *histogramVecMetric, bufferSize),
bufferSize: bufferSize,
interval: interval,
stopCh: stopCh,
aggregatedInflightEventMetric: make(map[gaugeVecMetricKey]int),
aggregatedInflightEventMetricLastFlushTime: time.Now(),
aggregatedInflightEventMetricBufferCh: make(chan *gaugeVecMetric, bufferSize),
IsStoppedCh: make(chan struct{}),
}
go recorder.run()
return recorder
}
// ObservePluginDurationAsync observes the plugin_execution_duration_seconds metric.
// The metric will be flushed to Prometheus asynchronously.
func (r *MetricAsyncRecorder) ObservePluginDurationAsync(extensionPoint, pluginName, status string, value float64) {
r.observeMetricAsync(PluginExecutionDuration, value, pluginName, extensionPoint, status)
}
// ObserveQueueingHintDurationAsync observes the queueing_hint_execution_duration_seconds metric.
// The metric will be flushed to Prometheus asynchronously.
func (r *MetricAsyncRecorder) ObserveQueueingHintDurationAsync(pluginName, event, hint string, value float64) {
r.observeMetricAsync(queueingHintExecutionDuration, value, pluginName, event, hint)
}
// ObserveInFlightEventsAsync observes the in_flight_events metric.
//
// Note that this function is not goroutine-safe;
// we deliberately don't lock the map for performance reasons and we assume the queue (i.e., the caller) takes the lock before updating the in-flight events.
func (r *MetricAsyncRecorder) ObserveInFlightEventsAsync(eventLabel string, valueToAdd float64, forceFlush bool) {
r.aggregatedInflightEventMetric[gaugeVecMetricKey{metricName: InFlightEvents.Name, labelValue: eventLabel}] += int(valueToAdd)
// Only flush the metric to the channel if the interval is reached.
// The values are flushed to Prometheus in the run() function, which runs once per interval.
// Note: we implement this flushing here, not in FlushMetrics, because, if we did so, we would need to implement a lock for the map, which we want to avoid.
if forceFlush || time.Since(r.aggregatedInflightEventMetricLastFlushTime) > r.interval {
for key, value := range r.aggregatedInflightEventMetric {
newMetric := &gaugeVecMetric{
metric: InFlightEvents,
labelValues: []string{key.labelValue},
valueToAdd: float64(value),
}
select {
case r.aggregatedInflightEventMetricBufferCh <- newMetric:
default:
}
}
r.aggregatedInflightEventMetricLastFlushTime = time.Now()
// reset
r.aggregatedInflightEventMetric = make(map[gaugeVecMetricKey]int)
}
}
func (r *MetricAsyncRecorder) observeMetricAsync(m *metrics.HistogramVec, value float64, labelsValues ...string) {
newMetric := &histogramVecMetric{
metric: m,
labelValues: labelsValues,
value: value,
}
select {
case r.bufferCh <- newMetric:
default:
}
}
// run flushes buffered metrics into Prometheus every second.
func (r *MetricAsyncRecorder) run() {
for {
select {
case <-r.stopCh:
close(r.IsStoppedCh)
return
default:
}
r.FlushMetrics()
time.Sleep(r.interval)
}
}
// FlushMetrics tries to clean up the bufferCh by reading at most bufferSize metrics.
func (r *MetricAsyncRecorder) FlushMetrics() {
for i := 0; i < r.bufferSize; i++ {
select {
case m := <-r.bufferCh:
m.metric.WithLabelValues(m.labelValues...).Observe(m.value)
default:
// no more value
}
select {
case m := <-r.aggregatedInflightEventMetricBufferCh:
m.metric.WithLabelValues(m.labelValues...).Add(m.valueToAdd)
default:
// no more value
}
}
}
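
A usage sketch for MetricAsyncRecorder, assuming the package is importable as k8s.io/kubernetes/pkg/scheduler/metrics and that metrics.Register is the right call to make the flushed histograms exist; the plugin, extension point, and status label values are illustrative only.

// Illustrative sketch (not part of the vendored file).
package main

import (
	"time"

	"k8s.io/kubernetes/pkg/scheduler/metrics"
)

func main() {
	// Register the scheduler metrics once so the histograms flushed by the
	// recorder are present in the legacy registry.
	metrics.Register()

	stopCh := make(chan struct{})
	defer close(stopCh)

	// Buffer up to 128 observations and flush them every second.
	recorder := metrics.NewMetricsAsyncRecorder(128, time.Second, stopCh)

	// Record a plugin execution duration; this only queues the sample and
	// never blocks the caller, even if the buffer is full (the sample is dropped).
	recorder.ObservePluginDurationAsync("Filter", "NodeResourcesFit", "Success", 0.00042)

	// Force the buffered samples into Prometheus instead of waiting for the interval.
	recorder.FlushMetrics()
}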

View File

@ -0,0 +1,416 @@
/*
Copyright 2015 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package metrics
import (
"sync"
"time"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/component-base/metrics"
"k8s.io/component-base/metrics/legacyregistry"
"k8s.io/kubernetes/pkg/features"
volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
)
const (
// SchedulerSubsystem - subsystem name used by scheduler.
SchedulerSubsystem = "scheduler"
)
// Below are possible values for the work and operation label.
const (
// PrioritizingExtender - prioritizing extender work/operation label value.
PrioritizingExtender = "prioritizing_extender"
// Binding - binding work/operation label value.
Binding = "binding"
)
const (
GoroutineResultSuccess = "success"
GoroutineResultError = "error"
)
// ExtentionPoints is a list of possible values for the extension_point label.
var ExtentionPoints = []string{
PreFilter,
Filter,
PreFilterExtensionAddPod,
PreFilterExtensionRemovePod,
PostFilter,
PreScore,
Score,
ScoreExtensionNormalize,
PreBind,
Bind,
PostBind,
Reserve,
Unreserve,
Permit,
}
const (
PreFilter = "PreFilter"
Filter = "Filter"
PreFilterExtensionAddPod = "PreFilterExtensionAddPod"
PreFilterExtensionRemovePod = "PreFilterExtensionRemovePod"
PostFilter = "PostFilter"
PreScore = "PreScore"
Score = "Score"
ScoreExtensionNormalize = "ScoreExtensionNormalize"
PreBind = "PreBind"
Bind = "Bind"
PostBind = "PostBind"
Reserve = "Reserve"
Unreserve = "Unreserve"
Permit = "Permit"
)
const (
QueueingHintResultQueue = "Queue"
QueueingHintResultQueueSkip = "QueueSkip"
QueueingHintResultError = "Error"
)
const (
PodPoppedInFlightEvent = "PodPopped"
)
// All the histogram based metrics have 1ms as size for the smallest bucket.
var (
scheduleAttempts *metrics.CounterVec
EventHandlingLatency *metrics.HistogramVec
schedulingLatency *metrics.HistogramVec
SchedulingAlgorithmLatency *metrics.Histogram
PreemptionVictims *metrics.Histogram
PreemptionAttempts *metrics.Counter
pendingPods *metrics.GaugeVec
InFlightEvents *metrics.GaugeVec
Goroutines *metrics.GaugeVec
// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration *metrics.HistogramVec
PodSchedulingSLIDuration *metrics.HistogramVec
PodSchedulingAttempts *metrics.Histogram
FrameworkExtensionPointDuration *metrics.HistogramVec
PluginExecutionDuration *metrics.HistogramVec
PermitWaitDuration *metrics.HistogramVec
CacheSize *metrics.GaugeVec
unschedulableReasons *metrics.GaugeVec
PluginEvaluationTotal *metrics.CounterVec
// The below two are only available when the QHint feature gate is enabled.
queueingHintExecutionDuration *metrics.HistogramVec
SchedulerQueueIncomingPods *metrics.CounterVec
// The below two are only available when the async-preemption feature gate is enabled.
PreemptionGoroutinesDuration *metrics.HistogramVec
PreemptionGoroutinesExecutionTotal *metrics.CounterVec
// metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value.
metricsList []metrics.Registerable
)
var registerMetrics sync.Once
// Register all metrics.
func Register() {
// Register the metrics.
registerMetrics.Do(func() {
InitMetrics()
RegisterMetrics(metricsList...)
volumebindingmetrics.RegisterVolumeSchedulingMetrics()
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) {
RegisterMetrics(queueingHintExecutionDuration, InFlightEvents)
}
if utilfeature.DefaultFeatureGate.Enabled(features.SchedulerAsyncPreemption) {
RegisterMetrics(PreemptionGoroutinesDuration, PreemptionGoroutinesExecutionTotal)
}
})
}
func InitMetrics() {
scheduleAttempts = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "schedule_attempts_total",
Help: "Number of attempts to schedule pods, by the result. 'unschedulable' means a pod could not be scheduled, while 'error' means an internal scheduler problem.",
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})
EventHandlingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "event_handling_duration_seconds",
Help: "Event handling latency in seconds.",
// Start with 0.1ms with the last bucket being [~200ms, Inf)
Buckets: metrics.ExponentialBuckets(0.0001, 2, 12),
StabilityLevel: metrics.ALPHA,
}, []string{"event"})
schedulingLatency = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduling_attempt_duration_seconds",
Help: "Scheduling attempt latency in seconds (scheduling algorithm + binding)",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.STABLE,
}, []string{"result", "profile"})
SchedulingAlgorithmLatency = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduling_algorithm_duration_seconds",
Help: "Scheduling algorithm latency in seconds",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
},
)
PreemptionVictims = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_victims",
Help: "Number of selected preemption victims",
// we think #victims>64 is pretty rare, therefore [64, +Inf) is considered a single bucket.
Buckets: metrics.ExponentialBuckets(1, 2, 7),
StabilityLevel: metrics.STABLE,
})
PreemptionAttempts = metrics.NewCounter(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_attempts_total",
Help: "Total preemption attempts in the cluster till now",
StabilityLevel: metrics.STABLE,
})
pendingPods = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "pending_pods",
Help: "Number of pending pods, by the queue type. 'active' means number of pods in activeQ; 'backoff' means number of pods in backoffQ; 'unschedulable' means number of pods in unschedulablePods that the scheduler attempted to schedule and failed; 'gated' is the number of unschedulable pods that the scheduler never attempted to schedule because they are gated.",
StabilityLevel: metrics.STABLE,
}, []string{"queue"})
InFlightEvents = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "inflight_events",
Help: "Number of events currently tracked in the scheduling queue.",
StabilityLevel: metrics.ALPHA,
}, []string{"event"})
Goroutines = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "goroutines",
Help: "Number of running goroutines split by the work they do such as binding.",
StabilityLevel: metrics.ALPHA,
}, []string{"operation"})
// PodSchedulingDuration is deprecated as of Kubernetes v1.28, and will be removed
// in v1.31. Please use PodSchedulingSLIDuration instead.
PodSchedulingDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_duration_seconds",
Help: "E2e latency for a pod being scheduled which may include multiple scheduling attempts.",
// Start with 10ms with the last bucket being [~88m, Inf).
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.STABLE,
DeprecatedVersion: "1.29.0",
},
[]string{"attempts"})
PodSchedulingSLIDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_sli_duration_seconds",
Help: "E2e latency for a pod being scheduled, from the time the pod enters the scheduling queue and might involve multiple scheduling attempts.",
// Start with 10ms with the last bucket being [~88m, Inf).
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.BETA,
},
[]string{"attempts"})
PodSchedulingAttempts = metrics.NewHistogram(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "pod_scheduling_attempts",
Help: "Number of attempts to successfully schedule a pod.",
Buckets: metrics.ExponentialBuckets(1, 2, 5),
StabilityLevel: metrics.STABLE,
})
FrameworkExtensionPointDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "framework_extension_point_duration_seconds",
Help: "Latency for running all plugins of a specific extension point.",
// Start with 0.1ms with the last bucket being [~200ms, Inf)
Buckets: metrics.ExponentialBuckets(0.0001, 2, 12),
StabilityLevel: metrics.STABLE,
},
[]string{"extension_point", "status", "profile"})
PluginExecutionDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "plugin_execution_duration_seconds",
Help: "Duration for running a plugin at a specific extension point.",
// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
// so that we have better granularity since plugin latency is very sensitive.
Buckets: metrics.ExponentialBuckets(0.00001, 1.5, 20),
StabilityLevel: metrics.ALPHA,
},
[]string{"plugin", "extension_point", "status"})
// This is only available when the QHint feature gate is enabled.
queueingHintExecutionDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "queueing_hint_execution_duration_seconds",
Help: "Duration for running a queueing hint function of a plugin.",
// Start with 0.01ms with the last bucket being [~22ms, Inf). We use a small factor (1.5)
// so that we have better granularity since plugin latency is very sensitive.
Buckets: metrics.ExponentialBuckets(0.00001, 1.5, 20),
StabilityLevel: metrics.ALPHA,
},
[]string{"plugin", "event", "hint"})
SchedulerQueueIncomingPods = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "queue_incoming_pods_total",
Help: "Number of pods added to scheduling queues by event and queue type.",
StabilityLevel: metrics.STABLE,
}, []string{"queue", "event"})
PermitWaitDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "permit_wait_duration_seconds",
Help: "Duration of waiting on permit.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})
CacheSize = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "scheduler_cache_size",
Help: "Number of nodes, pods, and assumed (bound) pods in the scheduler cache.",
StabilityLevel: metrics.ALPHA,
}, []string{"type"})
unschedulableReasons = metrics.NewGaugeVec(
&metrics.GaugeOpts{
Subsystem: SchedulerSubsystem,
Name: "unschedulable_pods",
Help: "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.",
StabilityLevel: metrics.ALPHA,
}, []string{"plugin", "profile"})
PluginEvaluationTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "plugin_evaluation_total",
Help: "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).",
StabilityLevel: metrics.ALPHA,
}, []string{"plugin", "extension_point", "profile"})
PreemptionGoroutinesDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_duration_seconds",
Help: "Duration in seconds for running goroutines for the preemption.",
Buckets: metrics.ExponentialBuckets(0.01, 2, 20),
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})
PreemptionGoroutinesExecutionTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "preemption_goroutines_execution_total",
Help: "Number of preemption goroutines executed.",
StabilityLevel: metrics.ALPHA,
},
[]string{"result"})
metricsList = []metrics.Registerable{
scheduleAttempts,
schedulingLatency,
SchedulingAlgorithmLatency,
EventHandlingLatency,
PreemptionVictims,
PreemptionAttempts,
pendingPods,
PodSchedulingDuration,
PodSchedulingSLIDuration,
PodSchedulingAttempts,
FrameworkExtensionPointDuration,
PluginExecutionDuration,
SchedulerQueueIncomingPods,
Goroutines,
PermitWaitDuration,
CacheSize,
unschedulableReasons,
PluginEvaluationTotal,
}
}
// RegisterMetrics registers a list of metrics.
// This function is exported because it is intended to be used by out-of-tree plugins to register their custom metrics.
func RegisterMetrics(extraMetrics ...metrics.Registerable) {
for _, metric := range extraMetrics {
legacyregistry.MustRegister(metric)
}
}
// GetGather returns the gatherer. It is used by test cases outside the current package.
func GetGather() metrics.Gatherer {
return legacyregistry.DefaultGatherer
}
// ActivePods returns the pending pods metrics with the label active
func ActivePods() metrics.GaugeMetric {
return pendingPods.With(metrics.Labels{"queue": "active"})
}
// BackoffPods returns the pending pods metrics with the label backoff
func BackoffPods() metrics.GaugeMetric {
return pendingPods.With(metrics.Labels{"queue": "backoff"})
}
// UnschedulablePods returns the pending pods metrics with the label unschedulable
func UnschedulablePods() metrics.GaugeMetric {
return pendingPods.With(metrics.Labels{"queue": "unschedulable"})
}
// GatedPods returns the pending pods metrics with the label gated
func GatedPods() metrics.GaugeMetric {
return pendingPods.With(metrics.Labels{"queue": "gated"})
}
// SinceInSeconds gets the time since the specified start in seconds.
func SinceInSeconds(start time.Time) float64 {
return time.Since(start).Seconds()
}
func UnschedulableReason(plugin string, profile string) metrics.GaugeMetric {
return unschedulableReasons.With(metrics.Labels{"plugin": plugin, "profile": profile})
}
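
Since RegisterMetrics is exported for out-of-tree plugins, a custom metric can be registered next to the built-in ones. Below is a hedged sketch with a hypothetical counter; the metric name, label, and import aliases are made up.

// Illustrative sketch (not part of the vendored file).
package main

import (
	compbasemetrics "k8s.io/component-base/metrics"
	schedmetrics "k8s.io/kubernetes/pkg/scheduler/metrics"
)

// evictionsTotal is a hypothetical custom metric an out-of-tree plugin might expose.
var evictionsTotal = compbasemetrics.NewCounterVec(
	&compbasemetrics.CounterOpts{
		Subsystem:      schedmetrics.SchedulerSubsystem,
		Name:           "example_plugin_evictions_total",
		Help:           "Number of evictions triggered by the hypothetical example plugin.",
		StabilityLevel: compbasemetrics.ALPHA,
	}, []string{"result"})

func main() {
	// RegisterMetrics accepts any Registerable, so custom vectors can live in
	// the same legacy registry as the built-in scheduler metrics.
	schedmetrics.RegisterMetrics(evictionsTotal)

	evictionsTotal.WithLabelValues("success").Inc()
}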

Some files were not shown because too many files have changed in this diff